In [12]:
import pandas as pd
import numpy as np

import altair as alt 
# work-around to let Altair handle larger data sets
alt.data_transformers.enable('json')

import ipywidgets as widgets

## Loading and pre-processing the data set

First we'll load our data set and pre-process it in order to aggregate it and put it in the necessary format.

In [3]:
# Read the raw per-department file; names are title-cased while parsing.
df = pd.read_csv(
    "../data/dpt2020.csv",
    sep=';',
    header=0,
    names=['gender', 'name', 'year', 'dpt', 'births'],
    converters={'name': str.title},
)

# Cleaning: keep rows whose name is longer than one character, whose year is
# not the 'XXXX' placeholder and whose name does not start with an underscore.
df = df.loc[
    lambda raw: (raw['name'].str.len() > 1)
    & (raw['year'] != 'XXXX')
    & ~raw['name'].str.startswith('_')
].reset_index(drop=True)

# Strip accents: decompose the characters, then drop everything non-ASCII.
df['name'] = (
    df['name']
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)

# Year and department become integers once the placeholder rows are gone.
df = df.astype({'year': np.int64, 'dpt': np.int64})
df.head()

Unnamed: 0,gender,name,year,dpt,births
0,1,Aadil,1983,84,3
1,1,Aadil,1992,92,3
2,1,Aahil,2016,95,3
3,1,Aaron,1962,75,3
4,1,Aaron,1976,75,3


We can now aggregate our data set by year and by name to get rid of the per-department detail. We'll also add the percentage of births for each name. This percentage is the number of births for a given name in a given year divided by the total number of births that year.

In [4]:
# Sum the births over all departments for every (name, year) pair, then order
# the rows by most recent year first, most births first, and name alphabetically.
df_agg = (
    df[['name', 'year', 'births']]
    .groupby(['name', 'year'])
    .sum()
    .sort_values(by=['year', 'births', 'name'], ascending=[False, False, True])
    .reset_index()
)

# Total number of births per year (year -> total), kept for ad-hoc look-ups.
total_year = df_agg.groupby('year')['births'].sum().to_dict()

# Share of each name within its year. Computed column-wise: the former row-wise
# apply indexed the row Series with integer positions (row[2], row[1]), which is
# deprecated for label-indexed Series in recent pandas and is much slower.
df_agg['percentage'] = df_agg['births'] / df_agg.groupby('year')['births'].transform('sum')

# we can export our aggregated data set (the aggregated frame, not the raw one)
#df_agg.to_csv("../data/baby_years.csv", sep=';', index=False)

df_agg.head()

Unnamed: 0,name,year,births,percentage
0,Leo,2020,4659,0.009564
1,Gabriel,2020,4421,0.009075
2,Raphael,2020,4111,0.008439
3,Jade,2020,3815,0.007831
4,Louise,2020,3807,0.007815


## Loading the aggregated dataset

We can now load our aggregated data set. We also need to create dictionaries that let us easily build filters for our visualization.

In [13]:
# Load the aggregated data frame from its semicolon-separated CSV file.
# NOTE(review): this rebinds `df_agg`, replacing the frame computed earlier in
# the notebook; it presumably expects the file written by the (commented-out)
# export step -- confirm that the file exists and holds the aggregated columns.
df_agg=pd.read_csv("../data/baby_years.csv", sep=';')
# Display (number of rows, number of columns) of the loaded frame
df_agg.shape

(249113, 4)

Define our dictionaries to apply filters.

In [17]:
# Rank the names over all years so that the "All" year filter can pick a
# Top/Bottom N: for every name, sum its yearly percentages and its births.
d_percent = {}
d_births = {}
for key, gp in df_agg.groupby('name'):
    d_percent[key] = gp.percentage.sum()
    d_births[key] = gp.births.sum()

# Order both dicts by decreasing value; equal values are ordered by name.
# The ranked result must be stored back in `d_percent` (it used to be bound to
# `d_fpercent` only, leaving `d_percent` in alphabetical order), because the
# Top/Bottom filters slice `d_percent` directly.
d_percent = dict(sorted(d_percent.items(), key=lambda item: (-item[1], item[0])))
d_births = dict(sorted(d_births.items(), key=lambda item: (-item[1], item[0])))

# Former name of the ranked dict, kept as an alias for any code still using it.
d_fpercent = d_percent


## First graph
In this section we'll plot the graphs as we imagined them in our sketches.

In [19]:
# Dropdown filters; a value of None means "nothing chosen yet".
selection = widgets.Dropdown(
    options=[None, 'Top 5', 'Top 10', 'Bottom 5', 'Bottom 10'],
    value=None,
    description='Display :',
)
# Every year present in the data, plus 'All' for the whole period
year = widgets.Dropdown(
    options=[None, 'All', *sorted(df_agg['year'].unique())],
    value=None,
    description='Year:',
)
# Every distinct name present in the data
name = widgets.Dropdown(
    options=[None, *sorted(df_agg['name'].unique())],
    value=None,
    description='Name:',
)

# Drawing function
def make_chart(noms):
    """Build the two linked charts for the given names.

    Parameters
    ----------
    noms : iterable of names
        Rows of ``df_agg`` whose ``name`` is in this collection are plotted.

    Returns
    -------
    The vertical concatenation of a line chart (percentage per year) and a bar
    chart (births per year); the bar chart is filtered by the names selected
    in the line chart.
    """
    subset = df_agg[df_agg.name.isin(noms)]

    # Selection on the 'name' field, shared by both charts
    picked = alt.selection_multi(fields=['name'])

    # Point markers are drawn only when at most 10 distinct years are plotted
    show_points = subset.year.nunique() <= 10

    # Line chart: unselected names are drawn in light gray
    lines = (
        alt.Chart(
            subset,
            width=800,
            height=400,
            title='Percentage of each name per year',
        )
        .mark_line(point=show_points)
        .encode(
            x='year:Q',
            y='percentage:Q',
            tooltip=['name', 'year', 'births', 'percentage'],
            color=alt.condition(picked, 'name:N', alt.value('lightgray')),
        )
        .add_selection(picked)
    )

    # Bar chart of births, restricted to the selection made in the line chart
    bars = (
        alt.Chart(
            subset,
            width=800,
            height=200,
            title='Number of births for each name per year',
        )
        .mark_bar()
        .encode(
            x='year:Q',
            y='births:Q',
            tooltip=['name', 'year', 'births', 'percentage'],
            color='name:N',
            opacity=alt.value(0.7),
        )
        .transform_filter(picked)
    )

    return lines & bars


def visualisation(selection, annee, nom):
    """Widget callback: work out which names to plot and display the charts.

    Parameters
    ----------
    selection : str or None
        'Top N' / 'Bottom N' (direction and size separated by a space), or None.
    annee : 'All', a year value from ``df_agg.year``, or None
        Year used to rank or list the names; 'All' ranks over every year.
    nom : str or None
        A single name; when given it takes precedence over the other filters.

    Returns
    -------
    None. The charts (or an explanatory message) are emitted as output.
    """
    # Nothing chosen in any dropdown: leave the output empty.
    if selection is None and annee is None and nom is None:
        return None

    if nom is not None:
        # A name alone is a complete request (it used to be rejected with the
        # "No data" message whenever the two other dropdowns were left empty).
        l_noms = [nom]
    elif annee is None:
        # A Top/Bottom choice without a year cannot be resolved.
        l_noms = []
    else:
        if selection:
            direction, size = selection.split(' ')
            size = int(size)
        else:
            direction, size = None, None

        if annee == 'All':
            if direction in ('Top', 'Bottom'):
                # Rank explicitly (highest summed percentage first, ties by
                # name) instead of relying on the insertion order of d_percent.
                ranking = sorted(d_percent, key=lambda n: (-d_percent[n], n))
                l_noms = ranking[:size] if direction == 'Top' else ranking[-size:]
            else:
                l_noms = list(df_agg.name.unique())
        else:
            of_year = df_agg[df_agg.year == annee]
            if direction == 'Top':
                l_noms = of_year.nlargest(size, 'births', keep='all').name.values
            elif direction == 'Bottom':
                l_noms = of_year.nsmallest(size, 'births', keep='all').name.values
            else:
                l_noms = of_year.name.values

    if len(l_noms) == 0:
        print("No data to display, select a Top/Bottom and a year or a name")
    else:
        display(make_chart(l_noms))

    return None

#Running graph
# Bind the three dropdowns to the callback, then show them in a row above
# the area that receives the callback's output.
out = widgets.interactive_output(
    visualisation,
    {'selection': selection, 'annee': year, 'nom': name},
)
box = widgets.HBox([selection, year, name])
display(box, out)

HBox(children=(Dropdown(description='Display :', options=(None, 'Top 5', 'Top 10', 'Bottom 5', 'Bottom 10'), v…

Output()

## Second graph
In this section we'll plot the graphs made previously and add a graph with the total number of births per year and the number of distinct names given each year (diversity), as we noticed that the total number of births per name seemed to have been decreasing over the past few years. We suspected that this was due to the diversification of the names given (there are more and more distinct names, so each individual name is less and less represented). We added this graph to confirm our hypothesis.

In [17]:
# Dropdown filters; a value of None means "nothing chosen yet".
selection = widgets.Dropdown(
    options=[None, 'Top 5', 'Top 10', 'Bottom 5', 'Bottom 10'],
    value=None,
    description='Display :',
)
# Every year present in the data, plus 'All' for the whole period
year = widgets.Dropdown(
    options=[None, 'All', *sorted(df_agg['year'].unique())],
    value=None,
    description='Year:',
)
# Every distinct name present in the data
name = widgets.Dropdown(
    options=[None, *sorted(df_agg['name'].unique())],
    value=None,
    description='Name:',
)

# Drawing function
def make_chart(noms):
    """Build the three stacked charts for the given names.

    Parameters
    ----------
    noms : iterable of names
        Rows of ``df_agg`` whose ``name`` is in this collection feed the two
        name-level charts.

    Returns
    -------
    The vertical concatenation of: a line chart of the percentage per year, a
    bar chart of births per year filtered by the names selected in the line
    chart, and a layered chart built from the whole of ``df_agg`` showing the
    per-year row count and the per-year sum of births on independent y scales.
    """
    subset = df_agg[df_agg.name.isin(noms)]

    # Selection on the 'name' field, shared by the two name-level charts
    picked = alt.selection_multi(fields=['name'])

    # Point markers are drawn only when at most 10 distinct years are plotted
    show_points = subset.year.nunique() <= 10

    # Line chart: unselected names are drawn in light gray
    lines = (
        alt.Chart(
            subset,
            width=800,
            height=400,
            title='Percentage of each name per year',
        )
        .mark_line(point=show_points)
        .encode(
            x='year:Q',
            y='percentage:Q',
            tooltip=['name:N', 'year:Q', 'births:Q', 'percentage:Q'],
            color=alt.condition(picked, 'name:N', alt.value('lightgray')),
        )
        .add_selection(picked)
    )

    # Bar chart of births, restricted to the selection made in the line chart
    bars = (
        alt.Chart(
            subset,
            width=800,
            height=200,
            title='Number of births for each name per year',
        )
        .mark_bar()
        .encode(
            x='year:Q',
            y='births:Q',
            color='name:N',
            tooltip=['name', 'year', 'births', 'percentage'],
            opacity=alt.value(0.7),
        )
        .transform_filter(picked)
    )

    # Bars: count of `name` entries per year over the whole data set
    diversity_axis = alt.Axis(title='Name diversity per year', titleColor='darkblue')
    diversity = (
        alt.Chart(
            df_agg,
            width=800,
            height=200,
            title='Names diversity and natality per year',
        )
        .mark_bar(color='darkblue')
        .encode(
            x='year:Q',
            y=alt.Y('nbNameYear:Q', axis=diversity_axis),
            tooltip=['year', "nbNameYear:Q"],
        )
        .transform_aggregate(nbNameYear='count(name)', groupby=["year"])
    )

    # Line: sum of births per year over the whole data set
    births_axis = alt.Axis(title='sum births per year', titleColor='red')
    annual_births = (
        alt.Chart(df_agg, width=800, height=200)
        .mark_line(color='red')
        .encode(
            x='year:Q',
            y=alt.Y('birthsYear:Q', axis=births_axis),
            tooltip=['year', "birthsYear:Q"],
        )
        .transform_aggregate(birthsYear='sum(births)', groupby=["year"])
    )

    # Overlay the bars and the line, each keeping its own y scale
    overview = alt.layer(diversity, annual_births).resolve_scale(y='independent')

    return lines & bars & overview



def visualisation(selection, annee, nom):
    """Widget callback: work out which names to plot and display the charts.

    Parameters
    ----------
    selection : str or None
        'Top N' / 'Bottom N' (direction and size separated by a space), or None.
    annee : 'All', a year value from ``df_agg.year``, or None
        Year used to rank or list the names; 'All' ranks over every year.
    nom : str or None
        A single name; when given it takes precedence over the other filters.

    Returns
    -------
    None. The charts (or an explanatory message) are emitted as output.
    """
    # Nothing chosen in any dropdown: leave the output empty.
    if selection is None and annee is None and nom is None:
        return None

    if nom is not None:
        # A name alone is a complete request (it used to be rejected with the
        # "No data" message whenever the two other dropdowns were left empty).
        l_noms = [nom]
    elif annee is None:
        # A Top/Bottom choice without a year cannot be resolved.
        l_noms = []
    else:
        if selection:
            direction, size = selection.split(' ')
            size = int(size)
        else:
            direction, size = None, None

        if annee == 'All':
            if direction in ('Top', 'Bottom'):
                # Rank explicitly (highest summed percentage first, ties by
                # name) instead of relying on the insertion order of d_percent.
                ranking = sorted(d_percent, key=lambda n: (-d_percent[n], n))
                l_noms = ranking[:size] if direction == 'Top' else ranking[-size:]
            else:
                l_noms = list(df_agg.name.unique())
        else:
            of_year = df_agg[df_agg.year == annee]
            if direction == 'Top':
                l_noms = of_year.nlargest(size, 'births', keep='all').name.values
            elif direction == 'Bottom':
                l_noms = of_year.nsmallest(size, 'births', keep='all').name.values
            else:
                l_noms = of_year.name.values

    if len(l_noms) == 0:
        print("No data to display, select a Top/Bottom and a year or a name")
    else:
        display(make_chart(l_noms))

    return None

#Running graph
# Bind the three dropdowns to the callback, then show them in a row above
# the area that receives the callback's output.
out = widgets.interactive_output(
    visualisation,
    {'selection': selection, 'annee': year, 'nom': name},
)
box = widgets.HBox([selection, year, name])
display(box, out)

HBox(children=(Dropdown(description='Display :', options=(None, 'Top 5', 'Top 10', 'Bottom 5', 'Bottom 10'), v…

Output()