In [1]:
import pandas as pd
import os

# Load Data

In [2]:
# Relative location of the raw name/gender/country-code CSV.
path = os.path.join(os.pardir, "raw_data", "wgnd_2_0_name-gender-code.csv")

In [3]:
# Load the raw name/gender table from the CSV at `path`.
# NOTE(review): read_csv's default NA handling turns literal strings such as "NA",
# "nan" or "null" into missing values - confirm no country code or name is lost.
df = pd.read_csv(path)

In [4]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0,name,code,gender,wgt
0,"""baby""",AU,F,1.0
1,'aisyah,AU,F,1.0
2,'anela,CA,F,1.0
3,'fiyinfoluwa,CA,F,1.0
4,'olioni,AU,M,1.0


In [5]:
# Row counts per gender label ("M", "F" and "?").
df["gender"].value_counts()

M    2544944
F    2032646
?     392706
Name: gender, dtype: int64

## Clean Data

In [6]:
# Keep only the rows whose gender label is not "?".
df = df.loc[df["gender"].ne("?")]

In [7]:
# Number of rows per distinct name, most frequent first.
df["name"].value_counts()

alexis              164
marion              159
florence            158
sara                138
lee                 137
                   ... 
kurubal               1
kurubala              1
kurubalakkappaga      1
kurubalakota          1
凉翼                    1
Name: name, Length: 3781203, dtype: int64

In [8]:
# Look at the first 20 rows; the leading names carry quote and punctuation characters.
df.head(20)

Unnamed: 0,name,code,gender,wgt
0,"""baby""",AU,F,1.0
1,'aisyah,AU,F,1.0
2,'anela,CA,F,1.0
3,'fiyinfoluwa,CA,F,1.0
4,'olioni,AU,M,1.0
5,'piaf',AU,F,1.0
6,'s,BE,F,1.0
7,'true,GB,F,1.0
8,'uheina-'i-langima'a,AU,F,1.0
9,.alexandra,CA,F,1.0


### Check data

In [9]:
# (rows, columns) remaining after the "?" gender rows were removed.
df.shape

(4577590, 4)

In [10]:
# All rows recorded for the example name "andrea".
df.loc[df["name"].eq("andrea")]

Unnamed: 0,name,code,gender,wgt
159181,andrea,AG,F,1.00000
159182,andrea,AL,F,0.50000
159183,andrea,AL,M,0.50000
159184,andrea,AT,F,1.00000
159185,andrea,AU,F,0.93062
...,...,...,...,...
159263,andrea,VU,F,1.00000
159264,andrea,WS,F,1.00000
159265,andrea,ZA,F,1.00000
159266,andrea,ZM,F,1.00000


In [11]:
# Total weight per gender for "andrea". Only the numeric "wgt" column is summed, so the
# string columns ("name", country code) are never passed to sum().
df[df["name"] == "andrea"].groupby("gender")[["wgt"]].sum()

  df[df["name"] == "andrea"].groupby("gender").sum()


Unnamed: 0_level_0,wgt
gender,Unnamed: 1_level_1
F,73.771014
M,3.228986


## Add country names and regions

In [70]:
# Relative location of the country-code reference table (ISO codes, region, sub-region).
path_cc = os.path.join(
    os.pardir,
    "raw_data",
    "country-code",
    "all-country-codes_continent_subregion.csv",
)

In [98]:
# Read the country-code reference table ("," is read_csv's default separator) and preview it.
# NOTE(review): with default NA handling an alpha-2 value of "NA" would be read as
# missing - confirm this is acceptable for the later merge.
cc_all = pd.read_csv(path_cc)
cc_all.head(4)

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,


In [72]:
# Keep only the lookup columns and call the country-name column "country" right away,
# so this cell alone yields the final column names and the column cannot collide with
# the person-name column "name" of the names table. rename() returns a new frame, so
# cc_all itself is left untouched.
cc_short = cc_all[["alpha-2", "name", "region", "sub-region"]].rename(
    columns={"name": "country"}
)

In [79]:
# Inspect the reduced country lookup table.
cc_short

Unnamed: 0,alpha-2,country,region,sub-region
0,AF,Afghanistan,Asia,Southern Asia
1,AX,Åland Islands,Europe,Northern Europe
2,AL,Albania,Europe,Southern Europe
3,DZ,Algeria,Africa,Northern Africa
4,AS,American Samoa,Oceania,Polynesia
...,...,...,...,...
247,WF,Wallis and Futuna,Oceania,Polynesia
248,EH,Western Sahara,Africa,Northern Africa
249,YE,Yemen,Asia,Western Asia
250,ZM,Zambia,Africa,Sub-Saharan Africa


In [80]:
# Name the country column "country"; reassign instead of mutating in place so that
# re-running the cell is harmless (a missing "name" column is simply ignored).
cc_short = cc_short.rename(columns={"name": "country"})

In [81]:
# Give the country-code column the same name as in the lookup table ("alpha-2") so both
# frames share a merge key; reassign rather than mutate the filtered frame in place.
df = df.rename(columns={"code": "alpha-2"})

In [82]:
# Left-join the country lookup onto the names table via the shared "alpha-2" key;
# rows without a matching code are kept and get missing country/region values.
data = df.merge(cc_short, how="left", on="alpha-2")

In [83]:
# Preview the merged table (names plus country, region and sub-region).
data.head()

Unnamed: 0,name,alpha-2,gender,wgt,country,region,sub-region
0,"""baby""",AU,F,1.0,Australia,Oceania,Australia and New Zealand
1,'aisyah,AU,F,1.0,Australia,Oceania,Australia and New Zealand
2,'anela,CA,F,1.0,Canada,Americas,Northern America
3,'fiyinfoluwa,CA,F,1.0,Canada,Americas,Northern America
4,'olioni,AU,M,1.0,Australia,Oceania,Australia and New Zealand


In [110]:
# Distinct region values present in the merged table.
data["region"].unique()

array(['Oceania', 'Americas', 'Europe', 'Asia', 'Africa'], dtype=object)

In [87]:
# Which country codes found no match in the lookup table (country is missing)?
data.loc[data["country"].isna(), "alpha-2"].unique()

array(['??'], dtype=object)

In [88]:
# Discard the rows carrying the placeholder country code "??".
data = data.loc[data["alpha-2"].ne("??")]

In [99]:
# Re-check the merged table after the "??" country-code rows were dropped.
data.head()

Unnamed: 0,name,alpha-2,gender,wgt,country,region,sub-region
0,"""baby""",AU,F,1.0,Australia,Oceania,Australia and New Zealand
1,'aisyah,AU,F,1.0,Australia,Oceania,Australia and New Zealand
2,'anela,CA,F,1.0,Canada,Americas,Northern America
3,'fiyinfoluwa,CA,F,1.0,Canada,Americas,Northern America
4,'olioni,AU,M,1.0,Australia,Oceania,Australia and New Zealand


In [100]:
# Distinct region values left after the "??" country-code rows were dropped.
data["region"].unique()

array(['Oceania', 'Americas', 'Europe', 'Asia', 'Africa'], dtype=object)

In [265]:
def share_male_female(result):
    """Convert per-gender weight totals into percentage shares.

    Parameters
    ----------
    result : mapping-like
        Object offering ``.get(key, default)`` (e.g. a dict or a pandas Series
        indexed by gender label) that holds the summed weights under the keys
        "M" and/or "F". A missing key counts as a weight of 0.

    Returns
    -------
    tuple
        ``(male_p, female_p)``: male and female share in percent, each rounded
        to two decimals. ``(0.0, 0.0)`` is returned when the total weight is
        zero (e.g. no matching rows), instead of failing.
    """
    male = result.get("M", 0)
    female = result.get("F", 0)
    total = male + female

    # Nothing to split: avoid dividing by zero and report "no evidence" for both.
    if total == 0:
        return 0.0, 0.0

    male_p = round(male * 100 / total, 2)
    female_p = round(female * 100 / total, 2)
    return male_p, female_p

In [266]:
def precit_from_data(name, country, continent):
    """Estimate the male/female share for a first name from the merged ``data`` table.

    The most specific filter available is applied: name + country if a country
    is given, otherwise name + continent if a continent is given, otherwise the
    name alone. The name is lower-cased before it is compared.

    Parameters
    ----------
    name : str
        First name to look up (required).
    country : str or None
        Value compared against the "country" column.
    continent : str or None
        Value compared against the "region" column.

    Returns
    -------
    tuple
        ``(male_p, female_p)`` as produced by ``share_male_female``.

    Raises
    ------
    ValueError
        If no name is given.
    """
    if not name:
        # Previously this path printed a message and then failed on an undefined variable.
        raise ValueError("no data given: a name is required")

    selection = data["name"] == name.lower()
    if country:
        print("name and country")
        selection = selection & (data["country"] == country)
    elif continent:
        print("name and continent")
        selection = selection & (data["region"] == continent)
    else:
        print("name")

    # Sum the weights per gender for the selected rows and turn them into percentages.
    df_name = data[selection].groupby("gender")["wgt"].sum()
    male_p, female_p = share_male_female(df_name)
    return male_p, female_p
        

In [267]:

# Example query: a name together with a country, no continent.
name = "Dieter"
country = "Germany"
continent = None

In [269]:
# Run the lookup for the example inputs and report both shares.
male_p, female_p = precit_from_data(name, country, continent)
for label, share in (("Male", male_p), ("Female", female_p)):
    print(f"{label}: {share} %")

name and country
Male: 100.0 %
Female: 0.0 %


In [187]:
# Show the female share returned by the prediction call.
female_p

0

{'M': 1.0}

In [None]:
# name and continent

In [None]:
# Per-gender weight totals for "dieter" restricted to the region "Europe".
data[(data["name"] == "dieter") & (data["region"] == "Europe")].groupby("gender")["wgt"].sum()

745824    1.0
Name: wgt, dtype: float64

Unnamed: 0,name,alpha-2,gender,wgt,country,region,sub-region
745819,dieter,AT,M,1.0,Austria,Europe,Western Europe
745821,dieter,BE,M,1.0,Belgium,Europe,Western Europe
745823,dieter,CH,M,1.0,Switzerland,Europe,Western Europe
745824,dieter,DE,M,1.0,Germany,Europe,Western Europe
745825,dieter,ES,M,1.0,Spain,Europe,Southern Europe


In [108]:
# Sum the weights per gender for the rows held in `test_name`.
# NOTE(review): `test_name` is not defined in any visible cell - this cell relies on
# leftover kernel state and will fail on a fresh run; confirm where it should come from.
prediction = test_name.groupby("gender")["wgt"].sum()
prediction

gender
F    1.835673
M    6.164327
Name: wgt, dtype: float64

In [138]:
# Summed weight of the rows labelled "M" in the prediction.
prediction["M"]

6.1643269599999995