In [1]:
# Import the Altair library
import pandas as pd
import altair as alt
import numpy as np
# work-around to let Altair handle larger data sets
alt.data_transformers.enable('json')


# Reference to data
france = pd.read_csv('./src/assets/dpt2020.csv', sep=',')

print(france.columns)

Index(['sexe', 'preusuel', 'annais', 'dpt', 'nombre'], dtype='object')


In [8]:
import csv

def optimize(csv_data):
    if csv_data is None:
        raise Exception("CSV was not loaded when optimize was called")
    else:
        print("Optimizing dataset")

    filteredByNameByYear = {}
     
    for _,row in csv_data.iterrows():
        annais = row['annais']
        preusuel = row['preusuel']
        if preusuel not in filteredByNameByYear:
            filteredByNameByYear[preusuel] = {}
        if row.annais not in filteredByNameByYear[preusuel]:
            filteredByNameByYear[preusuel][annais] = []
        filteredByNameByYear[preusuel][annais].append(row)

    print("Optimization done")
    return filteredByNameByYear

opti_france = optimize(france)


Optimizing dataset
Optimization done


In [12]:
opti_france['CLAUDE']['1900']

[sexe             1
 preusuel    CLAUDE
 annais        1900
 dpt             01
 nombre          41
 Name: 312150, dtype: object,
 sexe             1
 preusuel    CLAUDE
 annais        1900
 dpt             03
 nombre          44
 Name: 312151, dtype: object,
 sexe             1
 preusuel    CLAUDE
 annais        1900
 dpt             13
 nombre          12
 Name: 312152, dtype: object,
 sexe             1
 preusuel    CLAUDE
 annais        1900
 dpt             17
 nombre           4
 Name: 312153, dtype: object,
 sexe             1
 preusuel    CLAUDE
 annais        1900
 dpt             21
 nombre           9
 Name: 312154, dtype: object,
 sexe             1
 preusuel    CLAUDE
 annais        1900
 dpt             25
 nombre           6
 Name: 312155, dtype: object,
 sexe             1
 preusuel    CLAUDE
 annais        1900
 dpt             26
 nombre           3
 Name: 312156, dtype: object,
 sexe             1
 preusuel    CLAUDE
 annais        1900
 dpt             29
 nombre   

In [9]:
# Display the evolution of the proportion of male 'CLAUDE' in France over the years

selected_name = 'CLAUDE'

filtered_name = france[(france['preusuel'] == selected_name) & (france['annais'] != 'XXXX')]
search_input = alt.selection_point(
    fields = ["preusuel"],
    value = selected_name,
    bind = alt.binding(
        input="search",
        placeholder="Name",
        name='Search',
    )
)
filtered_france =  france[(france['annais'] != 'XXXX')]

gender_proportion_graph = alt.Chart(opti_france).transform_filter(
    search_input
).transform_aggregate(
    sum_selection='sum(nombre)',
    groupby=['annais', 'sexe']
).transform_joinaggregate(
    total_sum='sum(sum_selection)',
    groupby=['annais']
).transform_calculate(
    proportion='datum.sum_selection / datum.total_sum*100',
    sex_label='datum.sexe === 1 ? "Male" : "Female"'
).mark_bar().encode(
    x=alt.X('annais:O', title='Year', scale=alt.Scale(domain=[str(year) for year in range(1900, 2021)])),
    y=alt.Y('proportion:Q', title='Percentage by sex'),
    color=alt.Color('sex_label:N',scale=alt.Scale(domain=['Male', 'Female'],range=['red', 'blue']),legend=alt.Legend(title='Sexe')),
    tooltip=[
        alt.Tooltip('annais', title='Year'),
        alt.Tooltip('proportion:Q', title='Percentage'),
        alt.Tooltip("sum_selection:Q", title="Number of births"),
        alt.Tooltip("sex_label:N", title="Sex")
    ]
).properties(
    width=1000,
    height=400,
    title="Births for searched name"
).add_params(
    search_input
)
### this is a bar chart
gender_proportion_graph

KeyboardInterrupt: 

In [9]:
selected_name = 'CLAUDE'

filtered_F_selected_name = france[(france['sexe'] == 2) & (
    france['preusuel'] == 'CLAUDE') & (france['annais'] != 'XXXX')]
filtered_M_selected_name = france[(france['sexe'] == 1) & (
    france['preusuel'] == 'CLAUDE')& (france['annais'] != 'XXXX')]

print(sum(filtered_F_selected_name['nombre']))

54031


In [57]:
# Display the evolution of the proportion of male 'CLAUDE' in France over the years

selected_name = 'MARIE'
filtered_name = france[(france['preusuel'] == selected_name) & (france['annais'] != 'XXXX')]

gender_proportion_graph = alt.Chart(filtered_name).transform_aggregate(
    sum_selection='sum(nombre)',
    groupby=['annais', 'sexe']
).transform_calculate(
    sex_label='datum.sexe === 1 ? "Male" : "Female"'
).mark_circle().encode(
    x=alt.X('annais:O', title='Year', scale=alt.Scale(domain=[str(year) for year in range(1900, 2021)])),
    y=alt.Y('sum_selection:Q', title='Percentage by sex'),
    color=alt.Color('sex_label:N',scale=alt.Scale(domain=['Male', 'Female'],range=['red', 'blue']),legend=alt.Legend(title='Sexe')),
    tooltip=[
        alt.Tooltip('annais', title='Year'),
        alt.Tooltip("sum_selection:Q", title="Number of births"),
        alt.Tooltip("sex_label:N", title="Sex")
    ]

).properties(
    width=1000,
    height=500,
    title="Births for the name "+selected_name
)


###this is a scatter plot
gender_proportion_graph

Le but c'est de trouver les proportions de prénoms selon les genres, par année. Donc j'essaie de faire un sorte d'avoir le nombre dans 2 tables différentes pour pouvoir faire le rapport.



In [2]:
# Display the evolution of the proportion of male 'CLAUDE' in France over the years

selected_name = 'CLAUDE'

filtered_name = france[(france['preusuel'] == selected_name) & (france['annais'] != 'XXXX')]
search_input = alt.selection_point(
    fields = ["preusuel"],
    value = selected_name,
    bind = alt.binding(
        input="search",
        placeholder="Name",
        name='Search',
    )
)
filtered_france =  france[(france['annais'] != 'XXXX')]

gender_proportion_graph = alt.Chart(filtered_france).transform_filter(
    search_input
).transform_aggregate(
    sum_selection='sum(nombre)',
    groupby=['annais', 'sexe']
).transform_joinaggregate(
    total_sum='sum(sum_selection)',
    groupby=['annais']
).transform_calculate(
    proportion='datum.sum_selection / datum.total_sum*100',
    sex_label='datum.sexe === 1 ? "Male" : "Female"'
).mark_bar().encode(
    x=alt.X('annais:O', title='Year', scale=alt.Scale(domain=[str(year) for year in range(1900, 2021)])),
    y=alt.Y('proportion:Q', title='Percentage by sex'),
    color=alt.Color('sex_label:N',scale=alt.Scale(domain=['Male', 'Female'],range=['red', 'blue']),legend=alt.Legend(title='Sexe')),
    tooltip=[
        alt.Tooltip('annais', title='Year'),
        alt.Tooltip('proportion:Q', title='Percentage'),
        alt.Tooltip("sum_selection:Q", title="Number of births"),
        alt.Tooltip("sex_label:N", title="Sex")
    ]
).properties(
    width=1000,
    height=400,
    title="Births for searched name"
).add_params(
    search_input
)
### this is a bar chart
gender_proportion_graph

In [5]:
# Display the evolution of the proportion of male 'CLAUDE' in France over the years

selected_name = 'CAMILLE'
filtered_name = france[(france['preusuel'] == selected_name) & (france['annais'] != 'XXXX')]

gender_proportion_graph = alt.Chart(filtered_name).transform_aggregate(
    sum_selection_M='sum(nombre)',
    groupby=['annais', 'sexe']
).transform_joinaggregate(
    total_sum='sum(sum_selection_M)',
    groupby=['annais']
).transform_calculate(
    proportion='datum.sum_selection_M / datum.total_sum*100',
    sex_label='datum.sexe === 1 ? "Male" : "Female"'
).transform_filter(
    (alt.datum.sexe == 1) | (alt.datum.sexe == 2)
).mark_circle().encode(
    x=alt.X('annais:O', title='Year', scale=alt.Scale(domain=[str(year) for year in range(1900, 2021)])),
    y=alt.Y('proportion:Q', title='Percentage by sex'),
    color=alt.Color('sex_label:N',scale=alt.Scale(domain=['Male', 'Female'],range=['red', 'blue']),legend=alt.Legend(title='Sexe')),
    tooltip=[
        alt.Tooltip('annais', title='Year'),
        alt.Tooltip('proportion:Q', title='Percentage'),
        alt.Tooltip("sum_selection_M:Q", title="Number of births"),
        alt.Tooltip("sex_label:N", title="Sex")
    ]
).properties(
    width=1000,
    height=400,
    title="Births for the name "+selected_name
)


###this is a scatter plot
gender_proportion_graph