# Lecture 5

 Fall 2023

A demonstration of advanced `pandas` syntax to accompany Lecture 5.

In [27]:
import numpy as np
import pandas as pd

In [28]:
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']

babynames = pd.read_csv('H:/Machine Learning/babynamesbystate/STATE.CA.TXT', header=None, names=field_names)

data = babynames.copy()

data.head(2)

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239


## Pivot Tables

### `Groupby` with multiple columns

We want to build a table showing the total number of babies born of each sex in each year. One way is to `groupby` using both columns of interest:

In [29]:
# Find total count of baby names for both female and Male for each year

s = data.groupby(["Year","Sex"])["Count"].sum()

s.to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Count
Year,Sex,Unnamed: 2_level_1
1910,F,5950
1910,M,3213
1911,F,6602
1911,M,3381
1912,F,9804
...,...,...
2020,M,189119
2021,F,173913
2021,M,188669
2022,F,172454


### `pivot_table`

In [30]:
# Find total count of baby names for both female and Male for each year using Pivot table

piv_table = pd.pivot_table(data, index='Year', columns='Sex', values='Count', aggfunc='sum')
piv_table

Sex,F,M
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1910,5950,3213
1911,6602,3381
1912,9804,8142
1913,11860,10234
1914,13815,13111
...,...,...
2018,189208,206228
2019,184228,202768
2020,173763,189119
2021,173913,188669


![pivot_picture.png](attachment:pivot_picture.png)

### `pivot_table` with Multiple values

In [31]:
# Form a pivot table as describr in Lecture Slides

multi_pivot_vals = data.pivot_table(index = "Year", columns = "Sex", 
                                    values = ["Count", "Name"],aggfunc = np.max)

multi_pivot_vals


  multi_pivot_vals = data.pivot_table(index = "Year", columns = "Sex",


Unnamed: 0_level_0,Count,Count,Name,Name
Sex,F,M,F,M
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1910,295,237,Yvonne,William
1911,390,214,Zelma,Willis
1912,534,501,Yvonne,Woodrow
1913,584,614,Zelma,Yoshio
1914,773,769,Zelma,Yoshio
...,...,...,...,...
2018,2751,2572,Zyra,Zyon
2019,2608,2681,Zyra,Zyon
2020,2353,2630,Zyrah,Zyon
2021,2402,2613,Zyra,Zyrus


---

## Join Tables

What if we want to know the popularity of presidential candidates' first names in California in 2022? What can we do?

In [32]:
elections = pd.read_csv("H:\Machine Learning\Excel files\elections 1.csv")
elections.head()

Unnamed: 0,Year,Candidate,Party,Popular vote,Result,%
0,1824,Andrew Jackson,Democratic-Republican,151271,loss,57.210122
1,1824,John Quincy Adams,Democratic-Republican,113142,win,42.789878
2,1828,Andrew Jackson,Democratic,642806,win,56.203927
3,1828,John Quincy Adams,National Republican,500897,loss,43.796073
4,1832,Andrew Jackson,Democratic,702735,win,54.574789


In [33]:
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']

baby = pd.read_csv('H:/Machine Learning/babynamesbystate/STATE.CA.TXT', header=None, names=field_names)

baby.head()

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134


In [34]:
# Collect baby names for 2022
baby_names = baby[baby["Year"] == 2022]
baby_names.head()

Unnamed: 0,State,Sex,Year,Name,Count
235835,CA,F,2022,Olivia,2178
235836,CA,F,2022,Emma,2080
235837,CA,F,2022,Camila,2046
235838,CA,F,2022,Mia,1882
235839,CA,F,2022,Sophia,1762


In [35]:
# Use split the candidate names in elections dataframe
elections["First Name"] = elections["Candidate"].str.split().str[0]

`join` in pandas

In [36]:
#Merge both elections and babynames and report your analysis
merged = pd.merge(left = elections, right = baby, left_on = "First Name", right_on = "Name")
merged.head(3)

Unnamed: 0,Year_x,Candidate,Party,Popular vote,Result,%,First Name,State,Sex,Year_y,Name,Count
0,1824,Andrew Jackson,Democratic-Republican,151271,loss,57.210122,Andrew,CA,F,1963,Andrew,5
1,1824,Andrew Jackson,Democratic-Republican,151271,loss,57.210122,Andrew,CA,F,1968,Andrew,7
2,1824,Andrew Jackson,Democratic-Republican,151271,loss,57.210122,Andrew,CA,F,1970,Andrew,5


In [37]:
# Sort using Count
merged.sort_values(by="Count",inplace=True)

merged

Unnamed: 0,Year_x,Candidate,Party,Popular vote,Result,%,First Name,State,Sex,Year_y,Name,Count
0,1824,Andrew Jackson,Democratic-Republican,151271,loss,57.210122,Andrew,CA,F,1963,Andrew,5
8596,1856,Millard Fillmore,American,873053,loss,21.554001,Millard,CA,M,1951,Millard,5
8588,1856,Millard Fillmore,American,873053,loss,21.554001,Millard,CA,M,1938,Millard,5
8586,1856,Millard Fillmore,American,873053,loss,21.554001,Millard,CA,M,1936,Millard,5
8584,1856,Millard Fillmore,American,873053,loss,21.554001,Millard,CA,M,1931,Millard,5
...,...,...,...,...,...,...,...,...,...,...,...,...
18344,1988,Michael Dukakis,Democratic,41809074,loss,45.770691,Michael,CA,M,1956,Michael,8258
18527,2004,Michael Badnarik,Libertarian,397265,loss,0.325108,Michael,CA,M,1956,Michael,8258
18345,1988,Michael Dukakis,Democratic,41809074,loss,45.770691,Michael,CA,M,1957,Michael,8260
18711,2004,Michael Peroutka,Constitution,143630,loss,0.117542,Michael,CA,M,1957,Michael,8260
