In [1]:
import pandas as pd

Pandas includes many string methods to clean data for easier processing

In [3]:
# Publicly available information of employees of the city of Chicago
# Problems:
#   Title and Department are all caps
#   First/Last names are stored in single cell
#   Last row is all NaN
#   Salary is a string, not numeric, because of the dollar sign
chicago = pd.read_csv('datasets/chicago.csv')
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [4]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null object
Employee Annual Salary    32062 non-null object
dtypes: object(4)
memory usage: 1002.0+ KB


In [6]:
# Optimize memory usage a bit to start with
chicago['Department'].nunique()

35

In [9]:
# Convert to Category
chicago['Department'] = chicago['Department'].astype('category')
chicago['Position Title'] = chicago['Position Title'].astype('category')

In [10]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null category
Department                32062 non-null category
Employee Annual Salary    32062 non-null object
dtypes: category(2), object(2)
memory usage: 603.8+ KB


# Section 6; Part 86
Common string methods - `lower()`, `upper()`, `title()`, `len()`

 - `lower` - Converts all characters in string to lower case
 - `upper` - Converts all characters in string to upper case
 - `title` - Capitilizes first letter of each word
 - `len` - Returns the length of a string
 
When calling these on a pandas object, need to prepend `.str.` to call

Example:

    chicago['Name'].str.lower()

In [11]:
chicago = pd.read_csv('datasets/chicago.csv')
chicago['Department'] = chicago['Department'].astype('category')

In [14]:
# Clean names by only capitilizing first letter of each part of name and title
chicago['Name'] = chicago['Name'].str.title()
chicago['Position Title'] = chicago['Position Title'].str.title()
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,WATER MGMNT,$90744.00
1,"Aaron, Jeffery M",Police Officer,POLICE,$84450.00
2,"Aaron, Karina",Police Officer,POLICE,$84450.00
3,"Aaron, Kimberlei R",Chief Contract Expediter,GENERAL SERVICES,$89880.00
4,"Abad Jr, Vicente M",Civil Engineer Iv,WATER MGMNT,$106836.00


# Section 6; Part 87
The `.str.replace()` method

In [22]:
chicago = pd.read_csv('datasets/chicago.csv').dropna(how = 'all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [18]:
chicago['Department'].head(5)

0         WATER MGMNT
1              POLICE
2              POLICE
3    GENERAL SERVICES
4         WATER MGMNT
Name: Department, dtype: category
Categories (35, object): [ADMIN HEARNG, ANIMAL CONTRL, AVIATION, BOARD OF ELECTION, ..., STREETS & SAN, TRANSPORTN, TREASURER, WATER MGMNT]

In [23]:
# Replace "MGMNT" with "Management"
chicago['Department'] = chicago['Department'].str.replace("MGMNT", "MANAGEMENT")
chicago['Department'].head(5)

0    WATER MANAGEMENT
1              POLICE
2              POLICE
3    GENERAL SERVICES
4    WATER MANAGEMENT
Name: Department, dtype: object

In [26]:
# Convert annual salary to float
chicago['Employee Annual Salary'] = chicago['Employee Annual Salary'].str.replace("$", "").astype('float')

In [27]:
chicago['Employee Annual Salary'].sum()

2571506375.3600698

In [28]:
chicago['Employee Annual Salary'].mean()

80204.178633899

In [29]:
chicago['Employee Annual Salary'].nlargest(10)

8184     300000.0
7954     216210.0
25532    202728.0
8924     197736.0
8042     197724.0
19208    195000.0
3706     187680.0
18556    187680.0
29466    187680.0
13754    185364.0
Name: Employee Annual Salary, dtype: float64

# Section 6; Part 88
Filtering with string methods

In [30]:
chicago = pd.read_csv('datasets/chicago.csv').dropna(how = 'all')
chicago['Department'] = chicago['Department'].astype('category')

In [32]:
# Extract any rows where position title contains "Water"
# Standardize the strings for later searching (WATER != water != Water)
pos_mask = chicago['Position Title'].str.lower().str.contains("water")
chicago[pos_mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
1285,"BAJIC, JOHN A",WATER METER MACHINIST,WATER MGMNT,$82576.00
2400,"BOLTON, BRIAN E",WATER RATE TAKER,WATER MGMNT,$78948.00


In [33]:
# Extract rows where position title start with "Water"
pos_mask = chicago['Position Title'].str.lower().str.startswith("water")
chicago[pos_mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
1285,"BAJIC, JOHN A",WATER METER MACHINIST,WATER MGMNT,$82576.00
2400,"BOLTON, BRIAN E",WATER RATE TAKER,WATER MGMNT,$78948.00
2586,"BOYCE, ADNER L",WATER CHEMIST II,WATER MGMNT,$82044.00
2745,"BRANDYS, DANIEL",WATER CHEMIST II,WATER MGMNT,$53172.00
3143,"BROWN, SHARON L",WATER RATE TAKER,WATER MGMNT,$82728.00


In [34]:
# Extract rows where position title ends with "ist"
pos_mask = chicago['Position Title'].str.lower().str.endswith("ist")
chicago[pos_mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00
472,"ALLEN, ROBERT",MACHINIST,WATER MGMNT,$94328.00
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$91476.00
1022,"ARTEAGA, PAUL",MACHINIST,TRANSPORTN,$94328.00
1163,"AYALA JR, JUAN",FIELD SANITATION SPECIALIST,STREETS & SAN,$78948.00
1285,"BAJIC, JOHN A",WATER METER MACHINIST,WATER MGMNT,$82576.00
1558,"BARRETT, BARBARA J",TECHNICAL TRAINING SPECIALIST,POLICE,$94200.00
1869,"BELTRAN, MAURICIO",PROCUREMENT SPECIALIST,PROCUREMENT,$79596.00


# Section 6; Part 89
More string methods - `strip`, `lstrip`, `rstrip`

 - `lstrip` - Removes whitespace from beginning of string
 - `rstrip` - Removes whitespace from end of string
 - `strip` - Removes whitespace from both beginning and ending of string

Remove white space surrounding strings

In [35]:
chicago = pd.read_csv('datasets/chicago.csv').dropna(how = 'all')
chicago['Department'] = chicago['Department'].astype('category')

In [38]:
chicago['Name'] = chicago['Name'].str.strip()
chicago['Position Title'] = chicago['Position Title'].str.strip()
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


# Section 6; Part 90
String methods on Index and Column labels

In [40]:
chicago = pd.read_csv('datasets/chicago.csv', index_col="Name").dropna(how = 'all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [42]:
# List of all of our index labels - in this case, names
chicago.index

Index([u'AARON,  ELVIA J', u'AARON,  JEFFERY M', u'AARON,  KARINA',
       u'AARON,  KIMBERLEI R', u'ABAD JR,  VICENTE M', u'ABARCA,  ANABEL',
       u'ABARCA,  EMMANUEL', u'ABASCAL,  REECE E', u'ABBASI,  CHRISTOPHER',
       u'ABBATACOLA,  ROBERT J',
       ...
       u'ZWIT,  JEFFREY J', u'ZWOLFER,  MATTHEW W', u'ZYCH,  MATEUSZ',
       u'ZYDEK,  BRYAN', u'ZYGADLO,  JOHN P', u'ZYGADLO,  MICHAEL J',
       u'ZYGOWICZ,  PETER J', u'ZYMANTAS,  MARK E', u'ZYRKOWSKI,  CARLO E',
       u'ZYSKOWSKI,  DARIUSZ'],
      dtype='object', name=u'Name', length=32062)

In [45]:
# String methods can operate like previous lesson on .index
chicago.index = chicago.index.str.strip().str.title()
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [46]:
# Same process works with .columns
chicago.columns = chicago.columns.str.upper()
chicago.head()

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


# Section 6; Part 91
Split strings by characters with `.str.split()`

In [53]:
chicago = pd.read_csv('datasets/chicago.csv').dropna(how = 'all')
chicago['Department'] = chicago['Department'].astype('category')

In [55]:
# Find most common last name
#   .get(0) - Take first item in a list, which is [LASTNAME, FIRSTNAME]
chicago['Name'].str.split(",").str.get(0).str.title().value_counts()

Williams          293
Johnson           244
Smith             241
Brown             185
Jones             183
Rodriguez         171
Jackson           136
Garcia            130
Davis             127
Hernandez         110
Martinez          108
Lopez             106
Gonzalez          104
Perez             100
Wilson             94
Rivera             90
Thomas             89
Anderson           82
Torres             81
Murphy             80
Robinson           79
Moore              78
Harris             76
Sanchez            76
Miller             75
Lewis              74
Taylor             73
Martin             72
White              66
Clark              66
                 ... 
Baylian             1
Jaglinski           1
Duenas              1
Lowinger            1
Singto              1
Gilleran            1
Chao Jr             1
Gniady              1
Warren Stanley      1
Dickhut             1
Brouder             1
Hozzian             1
Mccalpin            1
Decorrevont         1
Colvin    

In [58]:
# Find most common first word in position titles
chicago['Position Title'].str.split(" ").str.get(0).value_counts()

POLICE                   10856
FIREFIGHTER-EMT           1509
SERGEANT                  1186
POOL                       918
FIREFIGHTER                810
CROSSING                   775
MOTOR                      721
SANITATION                 715
PARAMEDIC                  641
ASST                       606
FIRE                       512
TRAFFIC                    512
SENIOR                     470
CONSTRUCTION               452
LIEUTENANT-EMT             394
ADMINISTRATIVE             375
LIBRARY                    365
LIBRARIAN                  335
LIEUTENANT                 332
OPERATING                  324
ELECTRICAL                 313
AVIATION                   309
FIREFIGHTER/PARAMEDIC      259
GENERAL                    257
STAFF                      250
CLERK                      242
FOREMAN                    237
HOISTING                   214
DEPUTY                     213
MACHINIST                  210
                         ...  
INSPECTOR                    1
VOLUNTEE

# Section 6; Part 92
More practice with splits

In [59]:
chicago = pd.read_csv('datasets/chicago.csv').dropna(how = 'all')
chicago['Department'] = chicago['Department'].astype('category')

In [61]:
# Get most common first name
#   Difficulties: After spliting on comma, some names have a middle initial others do not

# Get "first name" position with .get(1)
names = chicago['Name'].str.split(",").str.get(1)
names.head()

0          ELVIA J
1        JEFFERY M
2           KARINA
3      KIMBERLEI R
4        VICENTE M
Name: Name, dtype: object

In [62]:
# Can't just .split on this because some do not have a middle initial and we have a space before first names
names.str.split(" ")

0            [, , ELVIA, J]
1          [, , JEFFERY, M]
2              [, , KARINA]
3        [, , KIMBERLEI, R]
4          [, , VICENTE, M]
5              [, , ANABEL]
6            [, , EMMANUEL]
7            [, , REECE, E]
8         [, , CHRISTOPHER]
9           [, , ROBERT, J]
10           [, , JAMES, J]
11           [, , TERRY, M]
12           [, , BETTY, L]
13          [, , LYNISE, M]
14         [, , WILLIAM, J]
15               [, , ZAID]
16          [, , ABDALMAHD]
17            [, , AREF, R]
18               [, , AZIZ]
19                [, , ALI]
20        [, , MUHAMMAD, A]
21          [, , DANIEL, N]
22              [, , KEVIN]
23         [, , LAKENYA, N]
24          [, , RASHAD, J]
25             [, , MUDHAR]
26              [, , TAHIR]
27          [, , ABUUBAIDA]
28           [, , JASON, V]
29            [, , EARL, S]
                ...        
32032           [, , JAMES]
32033         [, , JUAN, M]
32034           [, , OSCAR]
32035          [, , RONALD]
32036          [, , 

In [65]:
# Now, if a middle initial exists, it will be in the second element. First name will always be in .get(0)
names.str.strip().str.split(" ").str.get(0).value_counts()

MICHAEL        1153
JOHN            899
JAMES           676
ROBERT          622
JOSEPH          537
DAVID           506
THOMAS          490
DANIEL          472
WILLIAM         397
ANTHONY         385
KEVIN           331
BRIAN           320
RICHARD         314
MARK            310
PATRICK         300
MATTHEW         247
TIMOTHY         243
JOSE            224
STEVEN          220
CHRISTOPHER     217
PAUL            205
EDWARD          205
KENNETH         201
ERIC            170
CHARLES         163
JEFFREY         152
GREGORY         150
GEORGE          149
RONALD          147
MARY            147
               ... 
EDIS              1
EZEKIEL           1
CAREY             1
CORDY             1
GODFREY           1
MEGHANN           1
TERONDA           1
MARVALYNN         1
DAPHINE           1
GERVAISE          1
TAMMRA            1
LUCIAN            1
REENA             1
COMONIECK         1
SELLES            1
DARWYN            1
ROBINETTE         1
SHELISA           1
SHIMIKA           1


# Section 6; Part 93
The expand and `n` Parameters of the `.str.split()` method

In [66]:
chicago = pd.read_csv('datasets/chicago.csv').dropna(how = 'all')
chicago['Department'] = chicago['Department'].astype('category')

In [69]:
# expand - Default = False, If set to True returns a dataframe
chicago['Name'].str.split(",", expand=True)

Unnamed: 0,0,1
0,AARON,ELVIA J
1,AARON,JEFFERY M
2,AARON,KARINA
3,AARON,KIMBERLEI R
4,ABAD JR,VICENTE M
5,ABARCA,ANABEL
6,ABARCA,EMMANUEL
7,ABASCAL,REECE E
8,ABBASI,CHRISTOPHER
9,ABBATACOLA,ROBERT J


In [70]:
# Assign columns back to dataframe
chicago[['First Name', 'Last Name']] = chicago['Name'].str.split(",", expand=True)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M


In [72]:
chicago['Position Title'].str.split(" ", expand=True)
# Get a "weird" dataframe because there are a different number of spaces in other position titles
# Rows with shorter titles have 'None' assigned to other positions

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,WATER,RATE,TAKER,,,,,,
1,POLICE,OFFICER,,,,,,,
2,POLICE,OFFICER,,,,,,,
3,CHIEF,CONTRACT,EXPEDITER,,,,,,
4,CIVIL,ENGINEER,IV,,,,,,
5,ASST,TO,THE,ALDERMAN,,,,,
6,GENERAL,LABORER,-,DSS,,,,,
7,TRAFFIC,CONTROL,AIDE-HOURLY,,,,,,
8,STAFF,ASST,TO,THE,ALDERMAN,,,,
9,ELECTRICAL,MECHANIC,,,,,,,


In [76]:
# Limit split to only two operations (First word, Everything else)
chicago[['First title word', 'Remaining words']] = chicago['Position Title'].str.split(" ", expand=True, n=1)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name,First title word,Remaining words
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE,OFFICER
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R,CHIEF,CONTRACT EXPEDITER
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M,CIVIL,ENGINEER IV
