In [1]:
import os
import numpy as np
import pandas as pd

# Load Data

## Load Uploaded Data to classify

In [2]:
# Choose file
filename = "german-500.csv"
path = os.path.join("..","raw_data","test",filename)

In [3]:
path

'../raw_data/test/german-500.csv'

In [4]:
df = pd.read_csv(path, sep=";", encoding='latin-1')

In [5]:
df.head()

Unnamed: 0,name,country,continent
0,Marie,Austria,Europe
1,Sophie,Germany,Europe
2,Maximilian,Germany,Europe
3,Maria,Brazil,Americas
4,Alexander,,


## Load Name Database

In [6]:
def load_data():
    path = os.path.join("..","raw_data","wgnd_2_0_name-gender-code.csv")
    df = pd.read_csv(path)

    # Clean data
    # kick out where gender is not defined
    df = df[df["gender"] !="?"]

    # add country names and regions
    path_cc = os.path.join("..","raw_data","country-code","all-country-codes_continent_subregion.csv")
    cc_all = pd.read_csv(path_cc, sep=",")
    cc_short = cc_all[["alpha-2", "name", "region", "sub-region"]].copy()
    cc_short.rename(columns={"name":"country"}, inplace=True)
    df.rename(columns={"code":"alpha-2" }, inplace=True)

    # merge data and country info
    data = pd.merge(df, cc_short, how="left", on="alpha-2")
    continent_list = data["region"].unique()
    # clean merged data
    data = data[data["alpha-2"] != "??"] #filter out names where there is no country assigned

    return data

In [7]:
data = load_data()

In [8]:
data.sample(5)

Unnamed: 0,name,alpha-2,gender,wgt,country,region,sub-region
1099690,haribhaskararao,IN,M,1.0,India,Asia,Southern Asia
2985469,sushaalamma,IN,F,1.0,India,Asia,Southern Asia
1044383,gugulotupranin,IN,M,1.0,India,Asia,Southern Asia
3445980,zakia,EG,F,1.0,Egypt,Africa,Northern Africa
3298254,vidhyasagara,IN,M,1.0,India,Asia,Southern Asia


# Clean and convert Data

## Add country and continent list in smal letters

In [9]:
def lower_word(word):
    return word.lower()

In [10]:
# Country list
country_list = data["country"].unique()
lower_vectorized = np.vectorize(lower_word)
country_list =lower_vectorized(country_list)

In [11]:
# Continent List
continents_list = data["region"].unique()
continents_list = lower_vectorized(continents_list)

## Convert input to string

In [12]:
df["name"] = df["name"].astype(str)

In [13]:
df["country"] = df["country"].astype(str)

In [14]:
df["continent"] = df["continent"].astype(str)

# Test for single name

In [15]:
# For name and country
df_test = data[(data["name"] == "paul") & (data["country"] == "Brazil")]
df_test

Unnamed: 0,name,alpha-2,gender,wgt,country,region,sub-region


In [16]:
# For name only
df_test = data[(data["name"] == "paul")]
df_test

Unnamed: 0,name,alpha-2,gender,wgt,country,region,sub-region
2269067,paul,AG,M,1.000000,Antigua and Barbuda,Americas,Latin America and the Caribbean
2269068,paul,AT,M,1.000000,Austria,Europe,Western Europe
2269069,paul,AU,F,0.000153,Australia,Oceania,Australia and New Zealand
2269070,paul,AU,M,0.999847,Australia,Oceania,Australia and New Zealand
2269071,paul,BB,M,1.000000,Barbados,Americas,Latin America and the Caribbean
...,...,...,...,...,...,...,...
2269152,paul,VU,M,1.000000,Vanuatu,Oceania,Melanesia
2269153,paul,WS,M,1.000000,Samoa,Oceania,Polynesia
2269154,paul,ZA,M,1.000000,South Africa,Africa,Sub-Saharan Africa
2269155,paul,ZM,M,1.000000,Zambia,Africa,Sub-Saharan Africa


# Main functions

In [17]:
def share_male_female(result):
    """
    Takes a dataframe and calculates the probability of femal and male
    """
    male = 0
    female = 0
    male_p = 0
    female_p = 0
    # Make a list out of the grouped table results
    result_list = []
    try: result_list.append(["M", result["M"] ])
    except: pass
    try: result_list.append(["F", result["F"] ])
    except: pass

    # Calculate percentage results
    for res in result_list:
        if res[0] == "M":
            male = res[1]
        elif res[0] == "F":
            female = res[1]
        male_p = round(male*100/(male+female),2)
        female_p = round(female*100/(female+male),2)
    
    if male_p > female_p:
        gender = "m"
        return gender, male_p
    else:
        gender = "f"
        return gender, female_p


In [18]:
def iterrows_predict_from_data(df, data):
    """
    Create a temporary dataframe with name and gender in differnt countries
    Adds rows with gender and percentage to the input dataframe 
    """
    # Still takes to long - need to be converted into apply or applymap
    for index, row in df.iterrows():

        if df.loc[index, "name"] != "nan" and df.loc[index, "country"].lower() in country_list :
            print("name and country")
            df_name = data[(data["name"] == df.loc[index, "name"].lower()) & (data["country"] == df.loc[index, "country"])].groupby("gender")["wgt"].sum()
        elif df.loc[index, "name"] != "nan" and df.loc[index, "continent"] in continents_list:
            print("name and continent")
            df_name = data[(data["name"] == df.loc[index,"name"].lower()) & (data["region"] == df.loc[index, "continent"])].groupby("gender")["wgt"].sum()
        elif df.loc[index,"name"] != "nan":
            print("only name")
            df_name = data[data["name"] == df.loc[index, "name"].lower()].groupby("gender")["wgt"].sum()
        else:
            print("no data given")
            df.loc[index,"gender"] = "Name not found"

        if df_name.empty:
            df.loc[index, "gender"] = "No Data"
            df.loc[index, "percentage"] = "No Data"
        else:
            gender, perc = share_male_female(df_name)
            df.loc[index, "gender"] = gender
            df.loc[index, "percentage"] = perc


    return  df
    

In [19]:
# Calculate for uploaded Datafram


In [20]:
%%time
df_new = iterrows_predict_from_data(df, data)

name and country
name and country
name and country
name and country
only name
name and country
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only name
only 

In [21]:
df_new

Unnamed: 0,name,country,continent,gender,percentage
0,Marie,Austria,Europe,f,100.0
1,Sophie,Germany,Europe,f,100.0
2,Maximilian,Germany,Europe,m,100.0
3,Maria,Brazil,Americas,f,100.0
4,Alexander,,,m,99.99
...,...,...,...,...,...
494,Dennis,,,m,99.94
495,Elijah,,,m,99.91
496,Emanuel,,,m,99.99
497,Ensar,,,m,100.0


# Save Result


In [24]:
# Choose file
path_save = os.path.join("..","raw_data","test",f"genderized-{filename}")
path_save

'../raw_data/test/genderized-german-500.csv'

In [27]:
df_new.to_csv(path_save, index=False)