# Pandas More Utility Functions

A demonstration of advanced `pandas` syntax to accompany Lecture 4.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

## Dataset: California baby names

In today's lecture, we'll work with the `babynames` dataset, which contains information about the names of infants born in California.

The cell below pulls census data from a government website and then loads it into a usable form. The code shown here is outside of the scope of Data 100, but you're encouraged to dig into it if you are interested!

In [None]:
import urllib.request
import os.path
import zipfile

data_url = "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
local_filename = "D:\Workbook assignments\babynamesbystate.zip"
if not os.path.exists(local_filename): # If the data exists don't download again
    with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
        f.write(resp.read())

zf = zipfile.ZipFile(local_filename, 'r')

ca_name = 'CA.TXT'
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
with zf.open(ca_name) as fh:
    babynames = pd.read_csv(fh, header=None, names=field_names)

babynames.head()

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134


### Exercises
We want to obtain the first three baby names with `count > 250`.

1.Code this using head()

2.Code this using loc

3.Code this using iloc

4.Code this using []


In [None]:
# Answer Here
count_baby_names1 = babynames[babynames['Sex'] == 'F']
count_250_1 = count_baby_names1[count_baby_names1['Count'] > 250]
count_250_1.head(3)


Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
233,CA,F,1911,Mary,390
484,CA,F,1912,Mary,534


In [None]:
# Answer Here
count_baby_names_2 = babynames.loc[(babynames['Sex'] == 'F') & (babynames['Count'] > 250)]
count_baby_names_2.head(3)

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
233,CA,F,1911,Mary,390
484,CA,F,1912,Mary,534


In [None]:
# Answer Here
filtered_indices = babynames.loc[(babynames['Sex'] == 'F') & (babynames['Count'] > 250)].index
count_baby_names_3 = babynames.iloc[filtered_indices]
count_baby_names_3.head(3)

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
233,CA,F,1911,Mary,390
484,CA,F,1912,Mary,534


In [None]:
# Answer Here
count_baby_names_4 = babynames[(babynames['Sex'] == 'F') & (babynames['Count'] > 250)]
count_baby_names_4[:3]

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
233,CA,F,1911,Mary,390
484,CA,F,1912,Mary,534


### `.isin` for Selection based on a list, array, or `Series`

In [None]:
# Note: The parentheses surrounding the code make it possible to break the code into multiple lines for readability
babynames[(babynames["Name"] == "Bella") |
              (babynames["Name"] == "Alex") |
              (babynames["Name"] == "Narges") |
              (babynames["Name"] == "Lisa")]


Unnamed: 0,State,Sex,Year,Name,Count
6289,CA,F,1923,Bella,5
7512,CA,F,1925,Bella,8
12368,CA,F,1932,Lisa,5
14741,CA,F,1936,Lisa,8
17084,CA,F,1939,Lisa,5
...,...,...,...,...,...
399773,CA,M,2019,Alex,438
402648,CA,M,2020,Alex,379
405452,CA,M,2021,Alex,334
408335,CA,M,2022,Alex,345


In [None]:
# A more concise method to achieve the above: .isin
#Answer Here
name_list = ['Bella', 'Alex', 'Narges', 'Lisa']
concise_method = babynames[babynames['Name'].isin(name_list)]
concise_method


Unnamed: 0,State,Sex,Year,Name,Count
6289,CA,F,1923,Bella,5
7512,CA,F,1925,Bella,8
12368,CA,F,1932,Lisa,5
14741,CA,F,1936,Lisa,8
17084,CA,F,1939,Lisa,5
...,...,...,...,...,...
399773,CA,M,2019,Alex,438
402648,CA,M,2020,Alex,379
405452,CA,M,2021,Alex,334
408335,CA,M,2022,Alex,345


### `.str` Functions for Defining a Condition

In [None]:
# What if we only want names that start with "J"?
#Answer Here
name_start_j = babynames[babynames['Name'].str.startswith('J')]
name_start_j

Unnamed: 0,State,Sex,Year,Name,Count
16,CA,F,1910,Josephine,66
44,CA,F,1910,Jean,35
46,CA,F,1910,Jessie,32
59,CA,F,1910,Julia,28
66,CA,F,1910,Juanita,25
...,...,...,...,...,...
413714,CA,M,2023,Jj,5
413715,CA,M,2023,Johnathon,5
413716,CA,M,2023,Jorden,5
413717,CA,M,2023,Jozef,5


# Custom Sort

In [None]:
# Sort a Series Containing Names
sorted_by_name = babynames.sort_values(by="Name", ascending=True)
sorted_by_name

Unnamed: 0,State,Sex,Year,Name,Count
387660,CA,M,2014,Aadan,5
369654,CA,M,2008,Aadan,7
372774,CA,M,2009,Aadan,6
401876,CA,M,2019,Aadarsh,6
388799,CA,M,2015,Aaden,34
...,...,...,...,...,...
232190,CA,F,2020,Zyrah,5
220708,CA,F,2017,Zyrah,6
217445,CA,F,2016,Zyrah,5
197542,CA,F,2011,Zyrah,5


In [None]:
# Sort a DataFrame – there are lots of Michaels in California
micheal_names = babynames[babynames['Name'].str.contains('Michael')]
micheal_names

Unnamed: 0,State,Sex,Year,Name,Count
16200,CA,F,1938,Michael,7
16853,CA,F,1939,Michael,9
17559,CA,F,1940,Michael,12
18484,CA,F,1941,Michael,8
19060,CA,F,1942,Michael,25
...,...,...,...,...,...
408087,CA,M,2021,Michaelangelo,5
408242,CA,M,2022,Michael,1046
410648,CA,M,2022,Michaelangelo,6
411137,CA,M,2023,Michael,943


### Approach 1: Create a temporary column

In [None]:
# Create a Series of the length of each name
length = babynames['Name'].str.len()
# Add the Series as a new column to the DataFrame
babynames["name_lengths"] = length
# Sort the DataFrame by the new column
babynames_sorted = babynames.sort_values(by="name_lengths")
babynames_sorted

Unnamed: 0,State,Sex,Year,Name,Count,name_lengths
83016,CA,F,1979,Ji,5,2
331174,CA,M,1993,Vu,5,2
298821,CA,M,1978,Al,13,2
277555,CA,M,1962,Ty,55,2
404824,CA,M,2020,Jj,6,2
...,...,...,...,...,...,...
337819,CA,M,1996,Franciscojavier,8,15
325562,CA,M,1991,Franciscojavier,6,15
316193,CA,M,1987,Franciscojavier,5,15
317627,CA,M,1988,Franciscojavier,10,15


In [None]:
# drop new column
babynames = babynames_sorted.drop("name_lengths", axis="columns")
babynames

Unnamed: 0,State,Sex,Year,Name,Count
83016,CA,F,1979,Ji,5
331174,CA,M,1993,Vu,5
298821,CA,M,1978,Al,13
277555,CA,M,1962,Ty,55
404824,CA,M,2020,Jj,6
...,...,...,...,...,...
337819,CA,M,1996,Franciscojavier,8
325562,CA,M,1991,Franciscojavier,6
316193,CA,M,1987,Franciscojavier,5
317627,CA,M,1988,Franciscojavier,10


### Approach 2: Sorting using the `key` argument

---



In [32]:
# Answer Here
babynames.sort_values("Name", key=lambda x:x.str.len(), ascending=False)

Unnamed: 0,State,Sex,Year,Name,Count
102512,CA,F,1986,Mariadelosangel,5
340954,CA,M,1997,Franciscojavier,5
348093,CA,M,2000,Franciscojavier,6
325441,CA,M,1991,Ryanchristopher,7
343125,CA,M,1998,Franciscojavier,6
...,...,...,...,...,...
107301,CA,F,1988,An,13
253015,CA,M,1931,Ed,17
352911,CA,M,2002,An,7
258584,CA,M,1941,Ed,24


### Approach 3: Sorting Using the `map` Function

We can also use the Python map function if we want to use an arbitrarily defined function. Suppose we want to sort by the number of occurrences of "dr" plus the number of occurences of "ea".

In [38]:

# Define a function to count occurrences of 'dr' and 'ea'
def dr_ea_count(string):
 return string.count('dr') + string.count('ea')
# Apply the function to each name in the "Name" column and add as a new column
babynames["dr_ea_count"] = babynames["Name"].map(dr_ea_count)
# Sort the DataFrame by the new column in descending order
babynames = babynames.sort_values(by = "dr_ea_count", ascending=False)
# Display the top rows
babynames.head()

Unnamed: 0,State,Sex,Year,Name,Count,dr_ea_count
311780,CA,M,1985,Deandrea,6,3
108738,CA,F,1988,Deandrea,5,3
101982,CA,F,1986,Deandrea,6,3
115965,CA,F,1990,Deandrea,5,3
131037,CA,F,1994,Leandrea,5,3


In [39]:
# Drop the `dr_ea_count` column
babynames = babynames.drop("dr_ea_count", axis="columns")
babynames

Unnamed: 0,State,Sex,Year,Name,Count
311780,CA,M,1985,Deandrea,6
108738,CA,F,1988,Deandrea,5
101982,CA,F,1986,Deandrea,6
115965,CA,F,1990,Deandrea,5
131037,CA,F,1994,Leandrea,5
...,...,...,...,...,...
190289,CA,F,2010,Kaitlynn,43
333305,CA,M,1994,Manpreet,6
167938,CA,F,2004,Bayleigh,5
300795,CA,M,1979,Nicklaus,7


## Grouping

Group rows that share a common feature, then aggregate data across the group.

In this example, we count the total number of babies born in each year (considering only a small subset of the data, for simplicity).

<img src="images/groupby.png" width="800"/>

In [41]:
# DataFrame with baby girl names only
# Answer Here
f_baby_names = babynames[babynames['Sex'] == 'F']
f_baby_names

#Groupby similar features like year and apply aggregate
# Answer Here
f_baby_names = babynames[["Year", "Count"]].groupby("Year").agg(sum)

# Sort by Count
# Sort by Count in descending order
# Answer Here
f_baby_names.sort_values(by="Count", ascending=False)

  f_baby_names = babynames[["Year", "Count"]].groupby("Year").agg(sum)


Unnamed: 0_level_0,Count
Year,Unnamed: 1_level_1
1990,552669
1991,549339
1992,541091
1993,524993
1989,512613
...,...
1914,26926
1913,22094
1912,17946
1911,9983


In [42]:
# print first 10 entries
f_baby_names.head(10)

Unnamed: 0_level_0,Count
Year,Unnamed: 1_level_1
1910,9163
1911,9983
1912,17946
1913,22094
1914,26926
1915,35835
1916,37501
1917,39916
1918,44692
1919,45119


In [43]:
# the total baby count in each year
# Answer Here
total_baby_count_per_year = babynames.groupby('Year')['Count'].sum()
total_baby_count_per_year.head()

Unnamed: 0_level_0,Count
Year,Unnamed: 1_level_1
1910,9163
1911,9983
1912,17946
1913,22094
1914,26926


There are many different aggregation functions we can use, all of which are useful in different applications.

In [45]:
# What is the earliest year in which each name appeared?
# Answer Here
earliest_year_per_name = babynames.groupby('Name')['Year'].min()
earliest_year_per_name.head()

Unnamed: 0_level_0,Year
Name,Unnamed: 1_level_1
Aadan,2008
Aadarsh,2019
Aaden,2007
Aadhav,2014
Aadhini,2022


In [46]:
# What is the largest single-year count of each name?
# Answer Here
largest_count_per_name = babynames.groupby('Name')['Count'].max()
largest_count_per_name.head()

Unnamed: 0_level_0,Count
Name,Unnamed: 1_level_1
Aadan,7
Aadarsh,6
Aaden,158
Aadhav,8
Aadhini,6


In [47]:
#Can you find the most popular baby name in the state of California (CA) for each year? use idxmax function.
#Provide a list of years along with the corresponding most popular names."
result = babynames.groupby("Year")['Count'].idxmax()
#Answer Here
most_popular_names = babynames.loc[result, ['Year', 'Name', 'Count']]
most_popular_names.head()

Unnamed: 0,Year,Name,Count
0,1910,Mary,295
233,1911,Mary,390
484,1912,Mary,534
243717,1913,John,614
1120,1914,Mary,773


## Case Study: Name "Popularity"

In this exercise, let's find the name with sex "F" that has dropped most in popularity since its peak usage. We'll start by filtering `babynames` to only include names corresponding to sex "F".

In [None]:
#Answer Here
f_baby_names = babynames[babynames['Sex'] == 'F']
f_baby_names

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134
...,...,...,...,...,...
243185,CA,F,2023,Zeppelin,5
243186,CA,F,2023,Zhamira,5
243187,CA,F,2023,Zina,5
243188,CA,F,2023,Zooey,5


In [None]:
# We sort the data by year
f_baby_names_sorted = f_baby_names.sort_values(by='Year')
f_baby_names_sorted

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
148,CA,F,1910,Merle,9
149,CA,F,1910,Rosalie,9
150,CA,F,1910,Rosie,9
151,CA,F,1910,Teresa,9
...,...,...,...,...,...
240783,CA,F,2023,Zayna,22
240784,CA,F,2023,Aashvi,21
240785,CA,F,2023,Aida,21
240759,CA,F,2023,Eimy,22


To build our intuition on how to answer our research question, let's visualize the prevalence of the name "Jennifer" over time.

In [None]:
# We'll talk about how to generate plots in a later lecture
fig = px.line(f_baby_names_sorted[f_baby_names_sorted["Name"] == "Jennifer"],
              x = "Year", y = "Count")
fig.update_layout(font_size = 18,
                  autosize=False,
                 width=1000,
                  height=400)

We'll need a mathematical definition for the change in popularity of a name.

Define the metric "ratio to peak" (RTP). We'll calculate this as the count of the name in 2022 (the most recent year for which we have data) divided by the largest count of this name in *any* year.

A demo calculation for Jennifer:

In [None]:
# Find the highest Jennifer 'count'
highest_count = max(f_baby_names_sorted[f_baby_names_sorted["Name"] == "Jennifer"]["Count"])
highest_count

6065

In [None]:
# Remember that we sorted f_babynames by year.
# This means that grabbing the final entry gives us the most recent count of Jennifers: 114
# In 2022, the most recent year for which we have data, 114 Jennifers were born
curr_count_jennifer = f_baby_names_sorted[f_baby_names_sorted["Name"] == "Jennifer"].iloc[-1]
curr_count_jennifer

Unnamed: 0,239956
State,CA
Sex,F
Year,2023
Name,Jennifer
Count,88


We can also write a function that produces the `ratio_to_peak`for a given `Series`. This will allow us to use `.groupby` to speed up our computation for all names in the dataset.

In [None]:
# define the function for RTP
"""
Compute the RTP for a Series containing the counts per year for a single name
"""
def ratio_to_peak(series):
  return series.iloc[-1] / max(series)

In [None]:
# Construct a Series containing our Jennifer count data
count_ser_jenn = f_baby_names_sorted[f_baby_names_sorted["Name"] == "Jennifer"]["Count"]
# Then, find the RTP using the function define above
ratio_to_peak(count_ser_jenn)

0.014509480626545754

Now, let's use `.groupby` to compute the RTPs for *all* names in the dataset.

You may see a warning message when running the cell below. As discussed in lecture, `pandas` can't apply an aggregation function to non-numeric data (it doens't make sense to divide "CA" by a number). By default, `.groupby` will drop any columns that cannot be aggregated.

In [None]:
# Results in a TypeError
rtp_table = f_baby_names_sorted.groupby("Name").agg(ratio_to_peak)
rtp_table

TypeError: unsupported operand type(s) for /: 'str' and 'str'

In [None]:
# Find the RTP fro all names at once using groupby as describe in lec slides
rtp_table = f_baby_names_sorted.groupby("Name")[["Year","Count"]].agg(ratio_to_peak)
rtp_table

Unnamed: 0_level_0,Year,Count
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Aadhini,1.0,1.000000
Aadhira,1.0,0.500000
Aadhya,1.0,0.760000
Aadya,1.0,0.758621
Aahana,1.0,0.269231
...,...,...
Zyanya,1.0,0.800000
Zyla,1.0,1.000000
Zylah,1.0,1.000000
Zyra,1.0,1.000000


To avoid the warning message above, we explicitly extract only the columns relevant to our analysis before using `.agg`.

In [None]:
# Recompute the RTPs, but only performing the calculation on the "Count" column
rtp_table = f_baby_names_sorted.groupby("Name")[["Count"]].agg(ratio_to_peak)
rtp_table

Unnamed: 0_level_0,Count
Name,Unnamed: 1_level_1
Aadhini,1.000000
Aadhira,0.500000
Aadhya,0.760000
Aadya,0.758621
Aahana,0.269231
...,...
Zyanya,0.800000
Zyla,1.000000
Zylah,1.000000
Zyra,1.000000


In [None]:
# Rename "Count" to "Count RTP" for clarity
rtp_table = rtp_table.rename(columns={"Count":"Count_RTP"})
rtp_table

Unnamed: 0_level_0,Count_RTP
Name,Unnamed: 1_level_1
Aadhini,1.000000
Aadhira,0.500000
Aadhya,0.760000
Aadya,0.758621
Aahana,0.269231
...,...
Zyanya,0.800000
Zyla,1.000000
Zylah,1.000000
Zyra,1.000000


In [None]:
# What name has fallen the most in popularity?
#rtp_table
rtp_table_sorted = rtp_table.sort_values(by="Count_RTP", ascending=True)
name_fallen_most = rtp_table_sorted.head(1)
name_fallen_most


Unnamed: 0_level_0,Count_RTP
Name,Unnamed: 1_level_1
Debra,0.001512


We can visualize the decrease in the popularity of the name "?:"

In [None]:
def plot_name(*names):
    fig = px.line(f_baby_names_sorted[f_baby_names_sorted["Name"].isin(names)],
                  x = "Year", y = "Count", color="Name",
                  title=f"Popularity for: {names}")
    fig.update_layout(font_size = 18,
                  autosize=False,
                  width=1000,
                  height=400)
    return fig
# pass the name into plot_name
plot_name("Debra")

In [None]:
# Find the 10 names that have decreased the most in popularity
# Answer Here
rtp_table_sorted = rtp_table.sort_values(by="Count_RTP", ascending=True)
top10 = rtp_table_sorted.head(10).index
top10

Index(['Debra', 'Debbie', 'Tammy', 'Pamela', 'Cheryl', 'Michele', 'Susan',
       'Terri', 'Shannon', 'Kathy'],
      dtype='object', name='Name')

In [None]:
plot_name(*top10)

For fun, try plotting your name or your friends' names.