# Part 6: Working with data

In [2]:
import pandas as pd
import numpy as np

## Issues with the chicago_food_inspections data
* Whitespace surrounding the establishment names
* Inconsistent casing in the names: a mix of uppercase and lowercase

In [3]:
# Load the raw inspections file and look at the untouched establishment names.
inspections_csv = '/home/diego/Documents/Data/chicago_food_inspections.csv'
inspections = pd.read_csv(inspections_csv)
inspections.Name.values

array([' MARRIOT MARQUIS CHICAGO   ', ' JETS PIZZA ', '   ROOM 1520 ',
       ..., ' Cafe 608 ', "  mr.daniel's ", '   TEMPO CAFE '],
      dtype=object)

In [4]:
# Strip leading/trailing whitespace from each text column and write the cleaned
# values back over the original column.
# Columns that do not hold text are skipped: the `.str` accessor raises
# AttributeError on numeric or datetime data, which would abort the whole loop.
for column in inspections.columns:
    column_values = inspections[column]
    holds_text = (
        pd.api.types.is_object_dtype(column_values)
        or pd.api.types.is_string_dtype(column_values)
    )
    if holds_text:
        inspections[column] = column_values.str.strip()

In [5]:
# Display the frame to confirm the surrounding whitespace is gone.
inspections

Unnamed: 0,Name,Risk
0,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
1,JETS PIZZA,Risk 2 (Medium)
2,ROOM 1520,Risk 3 (Low)
3,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
4,CHARTWELLS,Risk 1 (High)
...,...,...
153805,WOLCOTT'S,Risk 1 (High)
153806,DUNKIN DONUTS/BASKIN-ROBBINS,Risk 2 (Medium)
153807,Cafe 608,Risk 1 (High)
153808,mr.daniel's,Risk 1 (High)


## How can we find all the distinct values a column contains?

With the `unique()` method.

In [6]:
# List the distinct values found in the Risk column (five here, counting NaN).
inspections['Risk'].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All', nan],
      dtype=object)

I will simply remove the rows with NaN values.

Of the original 153,810 rows, 153,744 remain after dropping the NaN rows.

In [7]:
# Keep only the rows that have a Risk value, then show the result.
inspections = inspections.dropna(axis='index', how='any', subset=['Risk'])
inspections

Unnamed: 0,Name,Risk
0,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
1,JETS PIZZA,Risk 2 (Medium)
2,ROOM 1520,Risk 3 (Low)
3,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
4,CHARTWELLS,Risk 1 (High)
...,...,...
153805,WOLCOTT'S,Risk 1 (High)
153806,DUNKIN DONUTS/BASKIN-ROBBINS,Risk 2 (Medium)
153807,Cafe 608,Risk 1 (High)
153808,mr.daniel's,Risk 1 (High)


Let's now check the values that the Risk column contains.

In [8]:
# Distinct Risk values after dropping the rows with a missing Risk.
inspections['Risk'].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All'],
      dtype=object)

Now replace 'All' with 'Risk 4 (Extreme)' to keep the format consistent.

In [9]:
# Rename the 'All' category so every Risk value follows the 'Risk N (Label)' format.
# The nested mapping limits the replacement to whole-cell matches in the Risk column:
# a frame-wide replace would also rewrite any other column holding exactly 'All',
# and `.str.replace` would substitute the substring wherever it occurs inside a value.
inspections = inspections.replace({'Risk': {'All': 'Risk 4 (Extreme)'}})

In [10]:
# Distinct Risk values after renaming the 'All' category.
inspections['Risk'].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)',
       'Risk 4 (Extreme)'], dtype=object)

In [17]:
# `.str.slice(start, stop)` is the method form of `.str[start:stop]`.
# The character at position 5 is the risk number.
inspections['Risk'].str.slice(5, 6)

0         1
1         2
2         3
3         1
4         1
         ..
153805    1
153806    2
153807    1
153808    1
153809    1
Name: Risk, Length: 153744, dtype: object

In [20]:
# Take the text between the parentheses: Low, Medium, High or Extreme.
# Position 8 is the first character after '(' and -1 drops the closing ')'.
inspections['Risk'].str.slice(8, -1)

0           High
1         Medium
2            Low
3           High
4           High
           ...  
153805      High
153806    Medium
153807      High
153808      High
153809      High
Name: Risk, Length: 153744, dtype: object

Finding the word 'pizza' in the Series values

In [25]:
# Case-insensitive search: lowercase the names first, then look for the substring.
# `regex=False` treats 'pizza' as plain text rather than a regular expression, and
# `na=False` maps missing names to False so the mask stays purely boolean
# (a mask containing NaN cannot be used to index the frame).
has_pizza = inspections.Name.str.lower().str.contains('pizza', regex=False, na=False)
inspections[has_pizza]

Unnamed: 0,Name,Risk
1,JETS PIZZA,Risk 2 (Medium)
19,NANCY'S HOME OF STUFFED PIZZA,Risk 1 (High)
27,"NARY'S GRILL & PIZZA ,INC.",Risk 1 (High)
29,NARYS GRILL & PIZZA,Risk 1 (High)
68,COLUTAS PIZZA,Risk 1 (High)
...,...,...
153756,ANGELO'S STUFFED PIZZA CORP,Risk 1 (High)
153764,COCHIAROS PIZZA #2,Risk 1 (High)
153772,FERNANDO'S MEXICAN GRILL & PIZZA,Risk 1 (High)
153788,REGGIO'S PIZZA EXPRESS,Risk 1 (High)


What if we want to be more precise in our targeting, perhaps extracting all establishments beginning with the string "tacos"?

In [28]:
# Lowercase first so the prefix test is case-insensitive.
# `na=False` maps missing names to False so the mask stays purely boolean
# (a mask containing NaN cannot be used to index the frame).
start_tacos = inspections.Name.str.lower().str.startswith('tacos', na=False)
inspections[start_tacos]

Unnamed: 0,Name,Risk
69,TACOS NIETOS,Risk 1 (High)
556,TACOS EL TIO 2 INC.,Risk 1 (High)
675,TACOS DON GABINO,Risk 1 (High)
958,TACOS EL TIO 2 INC.,Risk 1 (High)
1036,TACOS EL TIO 2 INC.,Risk 1 (High)
...,...,...
143587,TACOS DE LUNA,Risk 1 (High)
144026,TACOS GARCIA,Risk 1 (High)
146174,Tacos Place's 1,Risk 1 (High)
147810,TACOS MARIO'S LIMITED,Risk 1 (High)


Its complement is `endswith`.

In [30]:
# Lowercase first so the suffix test is case-insensitive.
# `na=False` maps missing names to False so the mask stays purely boolean
# (a mask containing NaN cannot be used to index the frame).
end_tacos = inspections.Name.str.lower().str.endswith('tacos', na=False)
inspections[end_tacos]

Unnamed: 0,Name,Risk
382,LAZO'S TACOS,Risk 1 (High)
569,LAZO'S TACOS,Risk 1 (High)
2652,FLYING TACOS,Risk 3 (Low)
3250,JONY'S TACOS,Risk 1 (High)
3812,PACO'S TACOS,Risk 1 (High)
...,...,...
151121,REYES TACOS,Risk 1 (High)
151318,EL MACHO TACOS,Risk 1 (High)
151801,EL MACHO TACOS,Risk 1 (High)
153087,RAYMOND'S TACOS,Risk 1 (High)


## Splitting strings

In [32]:
# Load the customers file and display it.
customers_csv = '/home/diego/Documents/Data/customers.csv'
customers = pd.read_csv(customers_csv)
customers

Unnamed: 0,Name,Address
0,Frank Manning,"6461 Quinn Groves, East Matthew, New Hampshire..."
1,Elizabeth Johnson,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,..."
2,Donald Stephens,"19120 Fleming Manors, Prestonstad, Montana, 23495"
3,Michael Vincent III,"441 Olivia Creek, Jimmymouth, Georgia, 82991"
4,Jasmine Zamora,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7..."
...,...,...
9956,Dana Browning,"762 Andrew Views Apt. 254, North Paul, New Mex..."
9957,Amanda Anderson,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ..."
9958,Eric Davis,"73015 Michelle Squares, Watsonville, West Virg..."
9959,Taylor Hernandez,"129 Keith Greens, Haleyfurt, Oklahoma, 98916"


Let's separate the first name and last name in the Name column using the pandas `str.split()` method.

The `pat`: the pattern (delimiter) that indicates where to split.

The `n`: the maximum number of splits to perform.

The `expand`: if False, returns a Series of lists containing the splits; if True, returns a DataFrame with one column per split.
 

In [41]:
# Preview the split: cut each name at the first space only, so everything after
# the first word stays together in the second column.
customers['Name'].str.split(' ', n=1, expand=True)

Unnamed: 0,0,1
0,Frank,Manning
1,Elizabeth,Johnson
2,Donald,Stephens
3,Michael,Vincent III
4,Jasmine,Zamora
...,...,...
9956,Dana,Browning
9957,Amanda,Anderson
9958,Eric,Davis
9959,Taylor,Hernandez


Now add the result of the split to the customers DataFrame as two new columns.

In [42]:
# Split each name at the first space and store the two pieces as new columns.
name_parts = customers['Name'].str.split(pat=' ', n=1, expand=True)
customers[['First Name', 'Last Name']] = name_parts

In [43]:
# Display the frame with the new 'First Name' and 'Last Name' columns.
customers

Unnamed: 0,Name,Address,First Name,Last Name
0,Frank Manning,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning
1,Elizabeth Johnson,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson
2,Donald Stephens,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens
3,Michael Vincent III,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III
4,Jasmine Zamora,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora
...,...,...,...,...
9956,Dana Browning,"762 Andrew Views Apt. 254, North Paul, New Mex...",Dana,Browning
9957,Amanda Anderson,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ...",Amanda,Anderson
9958,Eric Davis,"73015 Michelle Squares, Watsonville, West Virg...",Eric,Davis
9959,Taylor Hernandez,"129 Keith Greens, Haleyfurt, Oklahoma, 98916",Taylor,Hernandez


Deleting the Name column

In [45]:
# The combined Name column is redundant now that it has been split in two.
customers = customers.drop('Name', axis='columns')

In [46]:
# Display the frame after removing the Name column.
customers

Unnamed: 0,Address,First Name,Last Name
0,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning
1,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson
2,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens
3,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III
4,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora
...,...,...,...
9956,"762 Andrew Views Apt. 254, North Paul, New Mex...",Dana,Browning
9957,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ...",Amanda,Anderson
9958,"73015 Michelle Squares, Watsonville, West Virg...",Eric,Davis
9959,"129 Keith Greens, Haleyfurt, Oklahoma, 98916",Taylor,Hernandez
