In [0]:
import pandas as pd

# Get the data

In [0]:
# Path of the baby-names CSV that the Spark reader below loads.
file_location = "/FileStore/tables/baby_names.csv"

The standard `pd.read_csv()` reads only from the local filesystem, so it does not work with a DBFS path. Therefore we will read the data with `spark.read` (which we will explore in subsequent chapters) and convert it into a pandas DataFrame using `toPandas()`.

In [0]:
# Load the CSV through Spark: the first row holds the column names, fields
# are comma-separated, and column types are inferred from the data.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ",")
    .load(file_location)
)

In [0]:
# Render the Spark DataFrame to check the load before converting it to pandas.
display(df)

In [0]:
# Convert the Spark DataFrame into a pandas DataFrame for the analysis below.
names = df.toPandas()
# Preview the first rows of the converted frame.
names.head()

# Preliminaries

Dropping the `soundex` column

In [0]:
# Remove the `soundex` column. Rebinding `names` (instead of mutating it in
# place) together with errors='ignore' keeps this cell idempotent: running it
# a second time is a no-op rather than a KeyError for the missing column.
names = names.drop(columns='soundex', errors='ignore')
# Show five random rows to confirm the column is gone.
names.sample(5)

Setting the `MultiIndex` to `year` and `name`

In [0]:
# Move `year` and `name` from columns into a two-level row index.
# The guard keeps the cell safe to re-run: once both columns live in the
# index, calling set_index on them again would raise a KeyError.
if list(names.index.names) != ['year', 'name']:
    names = names.set_index(['year', 'name'])
# Show five random rows to confirm the new index.
names.sample(5)

Splitting the data into separate `boys` and `girls` DataFrames

In [0]:
# Keep only the rows whose `sex` is 'boy'; after filtering, the `sex` column
# is constant, so it is dropped.
is_boy = names['sex'].eq('boy')
boys = names[is_boy].drop(columns='sex')
# Show five random rows of the result.
boys.sample(5)

In [0]:
# Keep only the rows whose `sex` is 'girl'; after filtering, the `sex` column
# is constant, so it is dropped.
is_girl = names['sex'].eq('girl')
girls = names[is_girl].drop(columns='sex')
# Show five random rows of the result.
girls.sample(5)

# Basic EDA

What is the sum of all `prop` values per year?

In [0]:
# Sum every column within each `year` index level and plot the yearly totals,
# one figure for boys and one for girls.
(
    boys
    .groupby(level='year')
    .sum()
    .plot(title='Boys names total proportion')
);
(
    girls
    .groupby(level='year')
    .sum()
    .plot(title='Girls names total proportion')
);

How does the trend of a specific name look through the years?

In [0]:
# Name whose trend is plotted; change it to inspect another boys' name.
NAME = 'Elvis'
# Take an explicit cross-section on the `name` index level. A tuple passed to
# `.loc` on a DataFrame can be read as (rows, columns), which would look NAME
# up among the columns instead of in the index. `xs` drops the `name` level,
# so the result is indexed by year and plots as a trend over the years.
boys.xs(NAME, level='name').plot(title=f'Boys named {NAME}: proportion per year');

In [0]:
# Name whose trend is plotted; change it to inspect another girls' name.
NAME = 'Margaret'
# Take an explicit cross-section on the `name` index level. A tuple passed to
# `.loc` on a DataFrame can be read as (rows, columns), which would look NAME
# up among the columns instead of in the index. `xs` drops the `name` level,
# so the result is indexed by year and plots as a trend over the years.
girls.xs(NAME, level='name').plot(title=f'Girls named {NAME}: proportion per year');

How many names are used for both boys and girls throughout the years?

In [0]:
# Inner-join the two frames on their shared (year, name) index, so only the
# pairs present for both boys and girls in the same year survive; the
# suffixes keep the overlapping column names apart.
shared = boys.join(girls, how='inner', lsuffix='_boys', rsuffix='_girls')
# Count the distinct names left in the joined index.
shared.index.get_level_values('name').nunique()