In [1]:
import pandas as pd

In [3]:
chicago = pd.read_csv("chicago.csv")
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [4]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null object
Employee Annual Salary    32062 non-null object
dtypes: object(4)
memory usage: 1002.0+ KB


In [5]:
# Employee Annual Salary is not being read as a number because of the "$"; pandas has no concept of currency
# We must get rid of the "$" before converting to floats
# First we will optimize the data

chicago["Department"].nunique()

35

In [6]:
chicago["Department"].count()

32062

In [7]:
# Only 35 distinct departments across 32,062 values, so the column is a
# good candidate for the memory-efficient categorical dtype
chicago["Department"] = pd.Categorical(chicago["Department"])

In [8]:
chicago.info()
# Big reduction in memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null category
Employee Annual Salary    32062 non-null object
dtypes: category(1), object(3)
memory usage: 784.4+ KB


# Common String Methods - .lower(), .upper(), .title() and .len()



In [9]:
# Fresh copy of the data for this section, with Department stored as a category
chicago = pd.read_csv("chicago.csv").astype({"Department": "category"})
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [11]:
# Converts all characters in String
"Hello World".lower()
"hello world".upper()

'HELLO WORLD'

In [12]:
# Capitalizes every word
"hello world".title()

'Hello World'

In [13]:
#Spaces count
len("Hello World")

11

In [16]:
# must prefix with .str for pandas to work
chicago["Name"].str.lower()
# Can chain methods
chicago["Name"].str.lower().str.upper()

0                 AARON,  ELVIA J
1               AARON,  JEFFERY M
2                  AARON,  KARINA
3             AARON,  KIMBERLEI R
4             ABAD JR,  VICENTE M
5                 ABARCA,  ANABEL
6               ABARCA,  EMMANUEL
7               ABASCAL,  REECE E
8            ABBASI,  CHRISTOPHER
9           ABBATACOLA,  ROBERT J
10          ABBATEMARCO,  JAMES J
11               ABBATE,  TERRY M
12               ABBOTT,  BETTY L
13              ABBOTT,  LYNISE M
14         ABBRUZZESE,  WILLIAM J
15                ABDALLAH,  ZAID
16          ABDELHADI,  ABDALMAHD
17            ABDELLATIF,  AREF R
18             ABDELMAJEID,  AZIZ
19            ABDOLLAHZADEH,  ALI
20       ABDUL-KARIM,  MUHAMMAD A
21            ABDULLAH,  DANIEL N
22               ABDULLAH,  KEVIN
23           ABDULLAH,  LAKENYA N
24            ABDULLAH,  RASHAD J
25           ABDULSATTAR,  MUDHAR
26           ABDUL-SHAKUR,  TAHIR
27         ABDULWAHAB,  ABUUBAIDA
28              ABEJERO,  JASON V
29        ABER

In [18]:
chicago["Name"].str.title()
chicago["Position Title"].str.title()

0                                     Water Rate Taker
1                                       Police Officer
2                                       Police Officer
3                             Chief Contract Expediter
4                                    Civil Engineer Iv
5                                 Asst To The Alderman
6                                General Laborer - Dss
7                          Traffic Control Aide-Hourly
8                           Staff Asst To The Alderman
9                                  Electrical Mechanic
10                                   Fire Engineer-Emt
11                                      Police Officer
12                                  Foster Grandparent
13                                           Clerk Iii
14                              Investigator - Ipra Ii
15                                      Police Officer
16                                      Police Officer
17       Firefighter (Per Arbitrators Award)-Paramedic
18        

In [19]:
#Overwrite series
chicago["Position Title"] = chicago["Position Title"].str.title()

In [20]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",Police Officer,POLICE,$84450.00
2,"AARON, KARINA",Police Officer,POLICE,$84450.00


In [21]:
# Python's built-in len() counts the number of values in the Series (missing values included)
len(chicago["Department"])

32063

In [22]:
# pandas' .str.len() counts the characters in each value and returns the counts under the original index (missing values stay NaN)
chicago["Department"].str.len()

0        11.0
1         6.0
2         6.0
3        16.0
4        11.0
5        12.0
6        13.0
7         4.0
8        12.0
9         8.0
10        4.0
11        6.0
12       16.0
13        6.0
14        4.0
15        6.0
16        6.0
17        4.0
18        6.0
19        4.0
20       11.0
21        4.0
22        4.0
23        4.0
24       16.0
25       11.0
26       13.0
27       16.0
28        6.0
29        4.0
         ... 
32033     6.0
32034     6.0
32035     6.0
32036    13.0
32037     4.0
32038    11.0
32039     4.0
32040     6.0
32041     4.0
32042    16.0
32043    13.0
32044     6.0
32045     4.0
32046     7.0
32047     6.0
32048     3.0
32049     4.0
32050    11.0
32051     8.0
32052     6.0
32053     4.0
32054     6.0
32055     6.0
32056    16.0
32057    16.0
32058     6.0
32059     6.0
32060     6.0
32061     4.0
32062     NaN
Name: Department, Length: 32063, dtype: float64

# The .str.replace() Method

In [25]:
# Reload the raw file, discard rows that are entirely empty,
# and store Department as a category
chicago = (
    pd.read_csv("chicago.csv")
    .dropna(how="all")
    .astype({"Department": "category"})
)
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [23]:
"Hello World".replace("l", "!")

'He!!o Wor!d'

In [28]:
chicago["Department"].head(1)

0    WATER MGMNT
Name: Department, dtype: category
Categories (35, object): [ADMIN HEARNG, ANIMAL CONTRL, AVIATION, BOARD OF ELECTION, ..., STREETS & SAN, TRANSPORTN, TREASURER, WATER MGMNT]

In [29]:
# .str.replace() returns a new Series; on its own this line changes nothing in the frame
chicago["Department"].str.replace("MGMNT", "MANAGEMENT")
# Assign the result back to make the replacement stick.
# The .str accessor hands back plain strings even for a categorical column,
# so re-cast to category to keep the memory savings set up when the data was loaded.
chicago["Department"] = (
    chicago["Department"]
    .str.replace("MGMNT", "MANAGEMENT")
    .astype("category")
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,$106836.00


In [31]:
# Convert the salary strings to floats.
# "$" is also the regex end-of-string anchor, so pass regex=False to force a
# literal match instead of relying on how a given pandas version treats
# one-character patterns.
chicago["Employee Annual Salary"] = (
    chicago["Employee Annual Salary"]
    .str.replace("$", "", regex=False)
    .astype(float)
)

In [32]:
chicago["Employee Annual Salary"].sum()

2571506375.36

In [33]:
chicago["Employee Annual Salary"].mean()

80204.178633899

In [34]:
chicago["Employee Annual Salary"].std()

25098.329867510587

In [35]:
chicago["Employee Annual Salary"].nlargest(5)

8184     300000.0
7954     216210.0
25532    202728.0
8924     197736.0
8042     197724.0
Name: Employee Annual Salary, dtype: float64

# Filtering with String Methods

In [36]:
# Reload the raw file, discard rows that are entirely empty,
# and store Department as a category
chicago = (
    pd.read_csv("chicago.csv")
    .dropna(how="all")
    .astype({"Department": "category"})
)
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [40]:
# Searches are case sensitive, so normalize the text (here: lower-case it) before matching.
# .str.contains() returns a boolean Series that can be used directly as a row filter.
# regex=False treats "water" as literal text; na=False turns a missing title into
# False so the mask holds only booleans and is always safe to index with.
mask = (
    chicago["Position Title"]
    .str.lower()
    .str.contains("water", regex=False, na=False)
)
chicago[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
1285,"BAJIC, JOHN A",WATER METER MACHINIST,WATER MGMNT,$82576.00
2400,"BOLTON, BRIAN E",WATER RATE TAKER,WATER MGMNT,$78948.00


In [42]:
# .str.contains() matches the substring anywhere in the title
# (e.g. "FOREMAN OF WATER PIPE CONSTRUCTION"); .str.startswith() keeps only
# the titles that begin with it. na=False turns a missing title into False.
chicago[chicago["Position Title"].str.lower().str.startswith("water", na=False)]


Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
1285,"BAJIC, JOHN A",WATER METER MACHINIST,WATER MGMNT,$82576.00
2400,"BOLTON, BRIAN E",WATER RATE TAKER,WATER MGMNT,$78948.00
2586,"BOYCE, ADNER L",WATER CHEMIST II,WATER MGMNT,$82044.00
2745,"BRANDYS, DANIEL",WATER CHEMIST II,WATER MGMNT,$53172.00
3143,"BROWN, SHARON L",WATER RATE TAKER,WATER MGMNT,$82728.00


In [47]:
# Keep the titles that end in "ist"; na=False turns a missing title into False
# so the mask holds only booleans
mask = chicago["Position Title"].str.lower().str.endswith("ist", na=False)
chicago[mask].head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00


# More string methods - .strip(), .lstrip() and .rstrip()

In [48]:
# Reload the raw file, discard rows that are entirely empty,
# and store Department as a category
chicago = (
    pd.read_csv("chicago.csv")
    .dropna(how="all")
    .astype({"Department": "category"})
)
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [51]:
#Remove whitespace from the left
"            Hello World  ".lstrip()

'Hello World  '

In [52]:
#Remove whitespace from the right
"            Hello World  ".rstrip()

'            Hello World'

In [53]:
#Remove from both sides
"            Hello World  ".strip()

'Hello World'

In [60]:
# .str.rstrip() returns a new Series with trailing whitespace removed; the column itself is not modified
chicago["Name"].str.rstrip()
# Calls can be chained by going through the .str accessor again
chicago["Name"].str.rstrip().str.lstrip()
# Assign the result back to the column to make the change permanent
chicago["Name"] = chicago["Name"].str.rstrip().str.lstrip()

In [62]:
chicago["Position Title"] = chicago["Position Title"].str.strip()

# String Methods on Index and Columns

In [78]:
# Reload with the Name column as the index, discard rows that are entirely
# empty, and store Department as a category
chicago = (
    pd.read_csv("chicago.csv", index_col="Name")
    .dropna(how="all")
    .astype({"Department": "category"})
)
chicago.tail(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [79]:
chicago.index = chicago.index.str.strip().str.title()

In [80]:
chicago.tail(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Zymantas, Mark E",POLICE OFFICER,POLICE,$84450.00
"Zyrkowski, Carlo E",POLICE OFFICER,POLICE,$87384.00
"Zyskowski, Dariusz",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [81]:
chicago.columns = chicago.columns.str.upper()

In [82]:
chicago.tail(3)

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Zymantas, Mark E",POLICE OFFICER,POLICE,$84450.00
"Zyrkowski, Carlo E",POLICE OFFICER,POLICE,$87384.00
"Zyskowski, Dariusz",CHIEF DATA BASE ANALYST,DoIT,$113664.00


# Split Strings by Characters with .str.split() Method

In [84]:
# Reload the raw file, discard rows that are entirely empty,
# and store Department as a category
chicago = (
    pd.read_csv("chicago.csv")
    .dropna(how="all")
    .astype({"Department": "category"})
)
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [83]:
# Splits the string at every space and returns the pieces as a list
"My name is Patrick".split(" ")

['My', 'name', 'is', 'Patrick']

In [90]:
# Splitting on the shared "," separator turns every name into a list of pieces
chicago["Name"].str.split(",")
# .str.get(0) pulls the first element out of each list (the last name);
# title-case it and tally how often each one occurs
(
    chicago["Name"]
    .str.split(",")
    .str.get(0)
    .str.title()
    .value_counts()
)

Williams        293
Johnson         244
Smith           241
Brown           185
Jones           183
Rodriguez       171
Jackson         136
Garcia          130
Davis           127
Hernandez       110
Martinez        108
Lopez           106
Gonzalez        104
Perez           100
Wilson           94
Rivera           90
Thomas           89
Anderson         82
Torres           81
Murphy           80
Robinson         79
Moore            78
Harris           76
Sanchez          76
Miller           75
Lewis            74
Taylor           73
Martin           72
Clark            66
White            66
               ... 
Muzquiz           1
Mishler           1
Zebrauskas        1
Hill- Butts       1
Carriere Iii      1
Kwan              1
Grassmuck         1
Ye                1
Schubert          1
Cutrone           1
Raya Oehman       1
Ocarroll          1
Marinopoulos      1
Pachnik           1
Prskalo           1
Pressel           1
Plotke Jr         1
Whisler           1
Presti            1


In [92]:
chicago["Position Title"].str.split(" ").head(8)

0               [WATER, RATE, TAKER]
1                  [POLICE, OFFICER]
2                  [POLICE, OFFICER]
3       [CHIEF, CONTRACT, EXPEDITER]
4              [CIVIL, ENGINEER, IV]
5          [ASST, TO, THE, ALDERMAN]
6         [GENERAL, LABORER, -, DSS]
7    [TRAFFIC, CONTROL, AIDE-HOURLY]
Name: Position Title, dtype: object

In [95]:
#Most common Position title that begins with a particular word
chicago["Position Title"].str.split(" ").str.get(0).value_counts().head(5)

POLICE             10856
FIREFIGHTER-EMT     1509
SERGEANT            1186
POOL                 918
FIREFIGHTER          810
Name: Position Title, dtype: int64

# More Practice with Splits

In [96]:
# Reload the raw file, discard rows that are entirely empty,
# and store Department as a category
chicago = (
    pd.read_csv("chicago.csv")
    .dropna(how="all")
    .astype({"Department": "category"})
)
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [97]:
chicago["Name"].str.split(",").str.get(0).value_counts().head(3)

WILLIAMS    293
JOHNSON     244
SMITH       241
Name: Name, dtype: int64

In [98]:
# Get most common first name, change index position to 1
chicago["Name"].str.split(",").str.get(1).value_counts().head(3)

  MICHAEL J    270
  MICHAEL      165
  MICHAEL A    158
Name: Name, dtype: int64

In [101]:
# Compare the most popular first names with the middle initial removed.
# Take the text after the comma and trim its leading whitespace so the
# first token is the first name rather than an empty string
s = chicago["Name"].str.split(",").str.get(1).str.lstrip()
# The first space-separated token is the first name; any middle initial is dropped
s.str.split(" ").str.get(0).value_counts().head(3)

MICHAEL    1153
JOHN        899
JAMES       676
Name: Name, dtype: int64

In [103]:
# The same pipeline written as a single chained expression
(
    chicago["Name"]
    .str.split(",").str.get(1)
    .str.lstrip()
    .str.split(" ").str.get(0)
    .value_counts()
    .head(3)
)


MICHAEL    1153
JOHN        899
JAMES       676
Name: Name, dtype: int64

# The expand and n Parameters of the str.split() Method

In [105]:
# Reload the raw file, discard rows that are entirely empty,
# and store Department as a category
chicago = (
    pd.read_csv("chicago.csv")
    .dropna(how="all")
    .astype({"Department": "category"})
)
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [110]:
# expand=True returns a DataFrame (one column per piece) rather than a Series of lists.
# n=1 splits only at the first comma, so the result always has exactly two
# columns to match the two column names on the left, even if a name happens
# to contain more than one comma.
chicago[["Last Name", "First Name"]] = chicago["Name"].str.split(",", n=1, expand=True)

In [111]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,ELVIA J,AARON
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,JEFFERY M,AARON
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,KARINA,AARON


In [114]:
chicago["Position Title"].str.split(" ", expand = True).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,WATER,RATE,TAKER,,,,,,
1,POLICE,OFFICER,,,,,,,
2,POLICE,OFFICER,,,,,,,
3,CHIEF,CONTRACT,EXPEDITER,,,,,,
4,CIVIL,ENGINEER,IV,,,,,,
5,ASST,TO,THE,ALDERMAN,,,,,
6,GENERAL,LABORER,-,DSS,,,,,
7,TRAFFIC,CONTROL,AIDE-HOURLY,,,,,,
8,STAFF,ASST,TO,THE,ALDERMAN,,,,
9,ELECTRICAL,MECHANIC,,,,,,,


In [118]:
# The expanded frame above gets one column per word of the longest title;
# n=1 limits the split to the first space: the first word and everything after it.
# Add the two pieces as new columns by passing a list of column names.
# Assign the complete result (no .head()) so every row is populated, not just the first 10.
chicago[["First Title Word", "Remaining Words"]] = chicago["Position Title"].str.split(" ", n=1, expand=True)

In [119]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name,First Title Word,Remaining Words
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,ELVIA J,AARON,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,JEFFERY M,AARON,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,KARINA,AARON,POLICE,OFFICER
