# DataFrames IV: Working with Text Data

In [1]:
import pandas as pd

## This Module's Dataset

- **Dataset Name**: `chicago.csv`
- **Description**: A collection of public sector employees in the city of Chicago.
- **Columns**: Each row includes:
  - Employee's name
  - Position
  - Department
  - Salary


In [2]:
# Read the raw file, discard the pay-structure columns this module does not use,
# and give the two remaining awkward labels friendlier names
unused_columns = ["Full or Part-Time", "Salary or Hourly", "Typical Hours", "Hourly Rate"]
friendly_labels = {"Job Titles": "Position Title", "Annual Salary": "Employee Annual Salary"}
chicago = pd.read_csv("chicago.csv").drop(columns=unused_columns).rename(columns=friendly_labels)

# Turn the numeric salary into dollar-formatted text (e.g. "$101,442.00");
# missing salaries are passed through untouched
chicago["Employee Annual Salary"] = chicago["Employee Annual Salary"].apply(
    lambda amount: amount if pd.isnull(amount) else f"${amount:,.2f}"
)

# Show the frame (pandas abbreviates it to the first and last rows)
chicago

# What we observe:
# - all values are stored as strings
# - the text columns are written entirely in UPPERCASE
# - the salary column has a dollar sign in front of the value
# - the name column puts the last name before the first name


Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, JEFFERY M",SERGEANT,POLICE,"$101,442.00"
1,"AARON, KARINA",POLICE OFFICER (ASSIGNED AS DETECTIVE),POLICE,"$94,122.00"
2,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,"$101,592.00"
3,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,"$110,064.00"
4,"ABASCAL, REECE E",TRAFFIC CONTROL AIDE-HOURLY,OEMC,
...,...,...,...,...
33178,"ZYLINSKA, KATARZYNA",POLICE OFFICER,POLICE,"$72,510.00"
33179,"ZYMANTAS, LAURA C",POLICE OFFICER,POLICE,"$48,078.00"
33180,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,"$90,024.00"
33181,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,"$93,354.00"


In [3]:
# Start by removing any row in which every column is missing (how="all");
# rows that are only partially empty are kept.

chicago = chicago.dropna(how = "all")

In [4]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33183 entries, 0 to 33182
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    33183 non-null  object
 1   Position Title          33183 non-null  object
 2   Department              33183 non-null  object
 3   Employee Annual Salary  25161 non-null  object
dtypes: object(4)
memory usage: 1.0+ MB


In [5]:
# Count the number of distinct values in each column (missing values are not counted by default)

chicago.nunique()

Name                      32880
Position Title             1111
Department                   35
Employee Annual Salary     1000
dtype: int64

In [6]:
# "Department" has only 35 distinct values in a dataset of roughly 33,000 rows,
# so storing it as a categorical saves memory
department_as_category = chicago["Department"].astype("category")
chicago["Department"] = department_as_category
chicago.info()
# the reported memory usage is noticeably lower than in the previous info() call

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33183 entries, 0 to 33182
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    33183 non-null  object  
 1   Position Title          33183 non-null  object  
 2   Department              33183 non-null  category
 3   Employee Annual Salary  25161 non-null  object  
dtypes: category(1), object(3)
memory usage: 811.6+ KB


## 1. Common String Methods

- A Series has a special `str` attribute that exposes an object with string methods.
- Access the `str` attribute, then invoke the string method on the nested object.
- Most method names will match their Python method equivalents (`upper`, `lower`, `title`, etc).



In [9]:
# Rebuild the working frame from the CSV so this section starts from a clean slate
dropped_columns = ["Full or Part-Time", "Salary or Hourly", "Typical Hours", "Hourly Rate"]
new_labels = {"Job Titles": "Position Title", "Annual Salary": "Employee Annual Salary"}
chicago = (
    pd.read_csv("chicago.csv")
    .drop(columns=dropped_columns)
    .rename(columns=new_labels)
    .dropna(how="all")
)

# Salaries become "$"-prefixed text; missing salaries stay missing
chicago["Employee Annual Salary"] = chicago["Employee Annual Salary"].apply(
    lambda amount: amount if pd.isnull(amount) else f"${amount:,.2f}"
)

# Few distinct departments, so keep them as a categorical
chicago["Department"] = chicago["Department"].astype("category")

chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, JEFFERY M",SERGEANT,POLICE,"$101,442.00"
1,"AARON, KARINA",POLICE OFFICER (ASSIGNED AS DETECTIVE),POLICE,"$94,122.00"
2,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,"$101,592.00"
3,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,"$110,064.00"
4,"ABASCAL, REECE E",TRAFFIC CONTROL AIDE-HOURLY,OEMC,


In [10]:
chicago["Position Title"].str # step 1: access the .str attribute of the Series; it evaluates to a 'StringMethods' object

<pandas.core.strings.accessor.StringMethods at 0x1305e9b50>

In [16]:
# Step 2: through the .str attribute, call a regular string method on every value of the Series.
# Each line builds a new Series; the cell only displays the result of the last line.
titles = chicago["Position Title"]
titles.str.lower()                       # all lowercase
titles.str.title()                       # Title Case
titles.str.strip()                       # remove whitespace on both sides
titles.str.lstrip()                      # remove leading whitespace only
titles.str.rstrip()                      # remove trailing whitespace only
titles.str.replace("OFFICER", "AGENT")   # substitute one substring for another

0                                    SERGEANT
1        POLICE AGENT (ASSIGNED AS DETECTIVE)
2                    CHIEF CONTRACT EXPEDITER
3                           CIVIL ENGINEER IV
4                 TRAFFIC CONTROL AIDE-HOURLY
                         ...                 
33178                            POLICE AGENT
33179                            POLICE AGENT
33180                            POLICE AGENT
33181                            POLICE AGENT
33182                 CHIEF DATA BASE ANALYST
Name: Position Title, Length: 33183, dtype: object

In [13]:
# Most .str methods carry the same name as their Python counterpart; the difference shown
# here is that the built-in len() function becomes the .len() method on the .str attribute
position_titles = chicago["Position Title"]
position_titles.str.len()

0         8
1        38
2        24
3        17
4        27
         ..
33178    14
33179    14
33180    14
33181    14
33182    23
Name: Position Title, Length: 33183, dtype: int64

In [14]:
# Check what kind of object a .str method hands back
title_lengths = chicago["Position Title"].str.len()
type(title_lengths)

pandas.core.series.Series

In [15]:
# A .str method returns a new Series, not a StringMethods object,
# so every further string method in a chain needs its own .str access
lowercase_titles = chicago["Position Title"].str.lower()
lowercase_titles.str.len()

0         8
1        38
2        24
3        17
4        27
         ..
33178    14
33179    14
33180    14
33181    14
33182    23
Name: Position Title, Length: 33183, dtype: int64

## 2. Filtering with String Methods

- The `str.contains` method checks whether a substring exists anywhere in the string.
- The `str.startswith` method checks whether a substring exists at the start of the string.
- The `str.endswith` method checks whether a substring exists at the end of the string.


In [17]:
# Reload the dataset for the filtering examples
columns_to_drop = ["Full or Part-Time", "Salary or Hourly", "Typical Hours", "Hourly Rate"]
column_renames = {"Job Titles": "Position Title", "Annual Salary": "Employee Annual Salary"}
chicago = (
    pd.read_csv("chicago.csv")
    .drop(columns=columns_to_drop)
    .rename(columns=column_renames)
    .dropna(how="all")
)

# Format salaries as dollar strings, leaving missing values as they are
chicago["Employee Annual Salary"] = chicago["Employee Annual Salary"].apply(
    lambda amount: amount if pd.isnull(amount) else f"${amount:,.2f}"
)

# Department is low-cardinality, so store it as a categorical
chicago["Department"] = chicago["Department"].astype("category")

chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, JEFFERY M",SERGEANT,POLICE,"$101,442.00"
1,"AARON, KARINA",POLICE OFFICER (ASSIGNED AS DETECTIVE),POLICE,"$94,122.00"
2,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,"$101,592.00"
3,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,"$110,064.00"
4,"ABASCAL, REECE E",TRAFFIC CONTROL AIDE-HOURLY,OEMC,


In [18]:
# Example 1: find every Position Title that has "water" in it

# a) start by normalizing the data, so that all strings are written uniformly
chicago["Position Title"].str.lower()

# b) filtering needs a Boolean Series: chain .str.contains() onto the normalized text.
#    str.contains() interprets its pattern as a regular expression by default; regex=False
#    makes it a plain substring test, which is what Python's "in" operator does:
#    a match anywhere in the string.
chicago["Position Title"].str.lower().str.contains("water", regex=False)

0        False
1        False
2        False
3        False
4        False
         ...  
33178    False
33179    False
33180    False
33181    False
33182    False
Name: Position Title, Length: 33183, dtype: bool

In [19]:
# c) use the Boolean Series to keep only the matching rows of the DataFrame
#    (regex=False: "water" is matched as literal text rather than as a regular expression,
#    which is the default interpretation of the pattern)
water_workers = chicago["Position Title"].str.lower().str.contains("water", regex=False)
chicago[water_workers]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
298,"AKINDE, SARAH",WATER CHEMIST II,WATER MGMNT,"$54,768.00"
579,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,
708,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,"$84,516.00"
724,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,"$111,456.00"
742,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,
...,...,...,...,...
31321,"WASHINGTON, JOSEPH",WATER CHEMIST IV,WATER MGMNT,"$100,776.00"
31771,"WHITLOCK, JAMAAR",WATER METER MACHINIST,WATER MGMNT,
32089,"WILLIAMS, MATTHEW",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,
32509,"WOODRIDGE, ROBERT L",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,


In [20]:
# Example 2: keep the positions whose title begins with the word "civil"
lowercase_titles = chicago["Position Title"].str.lower()
starts_with_civil = lowercase_titles.str.startswith("civil")
chicago[starts_with_civil]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
3,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,"$110,064.00"
20,"ABDULSATTAR, MUDHAR",CIVIL ENGINEER II,WATER MGMNT,"$65,448.00"
30,"ABRAHAM, GIRLEY T",CIVIL ENGINEER IV,WATER MGMNT,"$110,064.00"
54,"ABUTALEB, AHMAD H",CIVIL ENGINEER II,WATER MGMNT,"$92,388.00"
162,"ADAMS, TANERA C",CIVIL ENGINEER IV,TRANSPORTN,"$110,064.00"
...,...,...,...,...
32705,"WYTANIEC, JEFFREY",CIVIL ENGINEER IV,AVIATION,"$83,688.00"
32735,"YANG, LUYANG",CIVIL ENGINEER V,TRANSPORTN,"$120,312.00"
32772,"YESUFU, STEPHANIE A",CIVIL ENGINEER III,TRANSPORTN,"$100,776.00"
32912,"ZAKE, JOSHUA S",CIVIL ENGINEER IV,TRANSPORTN,"$110,064.00"


In [22]:
# Example 3: keep the positions whose title ends with "IV"
# (the titles are lowercased first, so the suffix to look for is "iv")
lowercase_titles = chicago["Position Title"].str.lower()
ends_with_iv = lowercase_titles.str.endswith("iv")
chicago[ends_with_iv]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
3,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,"$110,064.00"
30,"ABRAHAM, GIRLEY T",CIVIL ENGINEER IV,WATER MGMNT,"$110,064.00"
160,"ADAMS, SHERYLL A",LIBRARIAN IV,PUBLIC LIBRARY,"$100,776.00"
162,"ADAMS, TANERA C",CIVIL ENGINEER IV,TRANSPORTN,"$110,064.00"
180,"ADENI, MOHAMED K",ACCOUNTANT IV,FINANCE,"$100,776.00"
...,...,...,...,...
32912,"ZAKE, JOSHUA S",CIVIL ENGINEER IV,TRANSPORTN,"$110,064.00"
32985,"ZAVALA, FERNANDO",ACCOUNTANT IV,FINANCE,"$100,776.00"
32998,"ZAWADZKI, SHELLEY M",LIBRARIAN IV,PUBLIC LIBRARY,"$95,580.00"
33053,"ZHANG, ANNE",CIVIL ENGINEER IV,TRANSPORTN,"$91,464.00"


## 3. String Methods on Index and Columns

- Use the `index` and `columns` attributes to access the DataFrame index/column labels.
- These objects support string methods via their own `str` attribute.


In [24]:
# Reload the dataset, this time using "Name" as the (sorted) index
columns_to_drop = ["Full or Part-Time", "Salary or Hourly", "Typical Hours", "Hourly Rate"]
column_renames = {"Job Titles": "Position Title", "Annual Salary": "Employee Annual Salary"}
chicago = (
    pd.read_csv("chicago.csv")
    .drop(columns=columns_to_drop)
    .rename(columns=column_renames)
    .dropna(how="all")
    .set_index("Name")
    .sort_index()
)

# Format salaries as dollar strings, leaving missing values as they are
chicago["Employee Annual Salary"] = chicago["Employee Annual Salary"].apply(
    lambda amount: amount if pd.isnull(amount) else f"${amount:,.2f}"
)

# Department is low-cardinality, so store it as a categorical
chicago["Department"] = chicago["Department"].astype("category")

chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, JEFFERY M",SERGEANT,POLICE,"$101,442.00"
"AARON, KARINA",POLICE OFFICER (ASSIGNED AS DETECTIVE),POLICE,"$94,122.00"
"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,"$101,592.00"
"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,"$110,064.00"
"ABASCAL, REECE E",TRAFFIC CONTROL AIDE-HOURLY,OEMC,


In [27]:
# a) Row labels

# The .str attribute and its string methods work on index labels just as they do on cell values;
# the row labels themselves are reached through the .index attribute
row_labels = chicago.index
row_labels.str.strip().str.title()

Index(['Aaron,  Jeffery M', 'Aaron,  Karina', 'Aaron,  Kimberlei R',
       'Abad Jr,  Vicente M', 'Abascal,  Reece E', 'Abbasi,  Christopher',
       'Abbatacola,  Robert J', 'Abbate,  Joseph L', 'Abbate,  Terry M',
       'Abbatemarco,  James J',
       ...
       'Zydek,  Bryan', 'Zygadlo,  John P', 'Zygadlo,  Michael J',
       'Zygmunt,  Artur', 'Zygmunt,  Dawid', 'Zylinska,  Katarzyna',
       'Zymantas,  Laura C', 'Zymantas,  Mark E', 'Zyrkowski,  Carlo E',
       'Zyskowski,  Dariusz'],
      dtype='object', name='Name', length=33183)

In [29]:
# Formatted index labels are saved the same way as cell values: by assigning them back
formatted_labels = chicago.index.str.strip().str.title()
chicago.index = formatted_labels
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Jeffery M",SERGEANT,POLICE,"$101,442.00"
"Aaron, Karina",POLICE OFFICER (ASSIGNED AS DETECTIVE),POLICE,"$94,122.00"
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,"$101,592.00"
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,"$110,064.00"
"Abascal, Reece E",TRAFFIC CONTROL AIDE-HOURLY,OEMC,


In [30]:
# b) Column labels

# Exactly the same idea, now going through the .columns attribute to reach the column labels
column_labels = chicago.columns
column_labels.str.upper()

Index(['POSITION TITLE', 'DEPARTMENT', 'EMPLOYEE ANNUAL SALARY'], dtype='object')

In [31]:
# As with the index, the old column labels are replaced by assigning the new ones back

uppercase_labels = chicago.columns.str.upper()
chicago.columns = uppercase_labels
chicago.head()

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Jeffery M",SERGEANT,POLICE,"$101,442.00"
"Aaron, Karina",POLICE OFFICER (ASSIGNED AS DETECTIVE),POLICE,"$94,122.00"
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,"$101,592.00"
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,"$110,064.00"
"Abascal, Reece E",TRAFFIC CONTROL AIDE-HOURLY,OEMC,


## 4. The split Method

- The `str.split` method splits a string by the occurrence of a delimiter. Pandas returns a Series of lists.
- Use the `str.get` method to access a nested list element by its index position.


In [32]:
# Reload the dataset for the split examples
columns_to_drop = ["Full or Part-Time", "Salary or Hourly", "Typical Hours", "Hourly Rate"]
column_renames = {"Job Titles": "Position Title", "Annual Salary": "Employee Annual Salary"}
chicago = (
    pd.read_csv("chicago.csv")
    .drop(columns=columns_to_drop)
    .rename(columns=column_renames)
    .dropna(how="all")
)

# Format salaries as dollar strings, leaving missing values as they are
chicago["Employee Annual Salary"] = chicago["Employee Annual Salary"].apply(
    lambda amount: amount if pd.isnull(amount) else f"${amount:,.2f}"
)

# Department is low-cardinality, so store it as a categorical
chicago["Department"] = chicago["Department"].astype("category")

chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, JEFFERY M",SERGEANT,POLICE,"$101,442.00"
1,"AARON, KARINA",POLICE OFFICER (ASSIGNED AS DETECTIVE),POLICE,"$94,122.00"
2,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,"$101,592.00"
3,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,"$110,064.00"
4,"ABASCAL, REECE E",TRAFFIC CONTROL AIDE-HOURLY,OEMC,


In [40]:
# Example 1: find the most common first word in the job position titles

# split each position title on whitespace, which yields one list of words per row
title_words = chicago["Position Title"].str.split()
# within each list pick the first word; str.get() works on lists as well as on strings
first_words = title_words.str.get(0)
# finally count how often each distinct first word occurs
first_words.value_counts()

Position Title
POLICE             11612
FIREFIGHTER-EMT     1724
SERGEANT            1203
POOL                1055
CROSSING             811
                   ...  
CURATOR                1
COMMANDING             1
ASBESTOS               1
PURCHASING             1
DECK                   1
Name: count, Length: 327, dtype: int64

In [51]:
# Example 2: find the most common first name among employees

# normalize the name values first
names = chicago["Name"].str.title()
# last and first names are separated by ", ", so splitting there gives one list per row
name_parts = names.str.split(", ")
# the first name sits at the second position of each list, so pick it with str.get()
after_comma = name_parts.str.get(1)
# looking at the value of the first row reveals a leading white space
after_comma.iloc[0]
# str.strip() removes leading as well as any trailing white space
trimmed = after_comma.str.strip()
# some rows still carry a middle initial after the first name: split once more on whitespace
# (a name without a middle initial simply stays a one-element list) and keep the first word
first_names = trimmed.str.split().str.get(0)
# finally, count the occurrences of each distinct first name
first_names.value_counts()

# REMARK: there are other ways to do this with less code, for example with a regular expression

Name
Michael    1151
John        856
James       645
Robert      587
Joseph      540
           ... 
Ernika        1
Renard        1
Sharlyn       1
Siedah        1
Dawid         1
Name: count, Length: 5497, dtype: int64

## 5. The `expand` and `n` Parameters of the split Method

- Setting `expand=True` makes `str.split` return a DataFrame (one column per piece) instead of a Series of lists.
- The `n` parameter limits the number of splits performed, so each string yields at most `n + 1` pieces.


In [52]:
# Reload the dataset for the expand / n examples
columns_to_drop = ["Full or Part-Time", "Salary or Hourly", "Typical Hours", "Hourly Rate"]
column_renames = {"Job Titles": "Position Title", "Annual Salary": "Employee Annual Salary"}
chicago = (
    pd.read_csv("chicago.csv")
    .drop(columns=columns_to_drop)
    .rename(columns=column_renames)
    .dropna(how="all")
)

# Format salaries as dollar strings, leaving missing values as they are
chicago["Employee Annual Salary"] = chicago["Employee Annual Salary"].apply(
    lambda amount: amount if pd.isnull(amount) else f"${amount:,.2f}"
)

# Department is low-cardinality, so store it as a categorical
chicago["Department"] = chicago["Department"].astype("category")

chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, JEFFERY M",SERGEANT,POLICE,"$101,442.00"
1,"AARON, KARINA",POLICE OFFICER (ASSIGNED AS DETECTIVE),POLICE,"$94,122.00"
2,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,"$101,592.00"
3,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,"$110,064.00"
4,"ABASCAL, REECE E",TRAFFIC CONTROL AIDE-HOURLY,OEMC,


In [54]:
# Example 1: split the Name column into a last name and a first name

# expand=True returns the pieces as the columns of a DataFrame instead of as one list per row
full_names = chicago["Name"]
full_names.str.split(",", expand=True)

Unnamed: 0,0,1
0,AARON,JEFFERY M
1,AARON,KARINA
2,AARON,KIMBERLEI R
3,ABAD JR,VICENTE M
4,ABASCAL,REECE E
...,...,...
33178,ZYLINSKA,KATARZYNA
33179,ZYMANTAS,LAURA C
33180,ZYMANTAS,MARK E
33181,ZYRKOWSKI,CARLO E


In [57]:
# Add the two name pieces to the original DataFrame as new columns.
# - n=1 splits only at the first comma, so the result always has exactly two pieces per row,
#   even if a name were to contain a further comma.
# - str.strip() removes the white space that follows the comma in the source data; without it
#   every value in "First Name" would start with blanks.

name_parts = chicago["Name"].str.split(",", n=1, expand=True)
chicago["Last Name"] = name_parts[0].str.strip()
chicago["First Name"] = name_parts[1].str.strip()
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,Last Name,First Name
0,"AARON, JEFFERY M",SERGEANT,POLICE,"$101,442.00",AARON,JEFFERY M
1,"AARON, KARINA",POLICE OFFICER (ASSIGNED AS DETECTIVE),POLICE,"$94,122.00",AARON,KARINA
2,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,"$101,592.00",AARON,KIMBERLEI R
3,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,"$110,064.00",ABAD JR,VICENTE M
4,"ABASCAL, REECE E",TRAFFIC CONTROL AIDE-HOURLY,OEMC,,ABASCAL,REECE E


In [59]:
# Example 2: split the Position Title on spaces, one column per word

position_titles = chicago["Position Title"]
position_titles.str.split(" ", expand=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,SERGEANT,,,,,,,,
1,POLICE,OFFICER,(ASSIGNED,AS,DETECTIVE),,,,
2,CHIEF,CONTRACT,EXPEDITER,,,,,,
3,CIVIL,ENGINEER,IV,,,,,,
4,TRAFFIC,CONTROL,AIDE-HOURLY,,,,,,
...,...,...,...,...,...,...,...,...,...
33178,POLICE,OFFICER,,,,,,,
33179,POLICE,OFFICER,,,,,,,
33180,POLICE,OFFICER,,,,,,,
33181,POLICE,OFFICER,,,,,,,


In [60]:
# That result is not pretty: a couple of titles contain 9 words, so every shorter title
# is padded with missing values.
# The remedy is to tell pandas how many splits to perform.

position_titles = chicago["Position Title"]
position_titles.str.split(" ", n=1, expand=True)  # n=1: split once, at the first space only

Unnamed: 0,0,1
0,SERGEANT,
1,POLICE,OFFICER (ASSIGNED AS DETECTIVE)
2,CHIEF,CONTRACT EXPEDITER
3,CIVIL,ENGINEER IV
4,TRAFFIC,CONTROL AIDE-HOURLY
...,...,...
33178,POLICE,OFFICER
33179,POLICE,OFFICER
33180,POLICE,OFFICER
33181,POLICE,OFFICER


In [63]:
# As before, the new columns are added by naming them in a list on the left-hand side of the assignment

title_pieces = chicago["Position Title"].str.split(" ", n=1, expand=True)
chicago[["Primary Title", "Secondary Title"]] = title_pieces
chicago

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,Last Name,First Name,Primary Title,Secondary Title
0,"AARON, JEFFERY M",SERGEANT,POLICE,"$101,442.00",AARON,JEFFERY M,SERGEANT,
1,"AARON, KARINA",POLICE OFFICER (ASSIGNED AS DETECTIVE),POLICE,"$94,122.00",AARON,KARINA,POLICE,OFFICER (ASSIGNED AS DETECTIVE)
2,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,"$101,592.00",AARON,KIMBERLEI R,CHIEF,CONTRACT EXPEDITER
3,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,"$110,064.00",ABAD JR,VICENTE M,CIVIL,ENGINEER IV
4,"ABASCAL, REECE E",TRAFFIC CONTROL AIDE-HOURLY,OEMC,,ABASCAL,REECE E,TRAFFIC,CONTROL AIDE-HOURLY
...,...,...,...,...,...,...,...,...
33178,"ZYLINSKA, KATARZYNA",POLICE OFFICER,POLICE,"$72,510.00",ZYLINSKA,KATARZYNA,POLICE,OFFICER
33179,"ZYMANTAS, LAURA C",POLICE OFFICER,POLICE,"$48,078.00",ZYMANTAS,LAURA C,POLICE,OFFICER
33180,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,"$90,024.00",ZYMANTAS,MARK E,POLICE,OFFICER
33181,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,"$93,354.00",ZYRKOWSKI,CARLO E,POLICE,OFFICER
