# Dataset simulation

In [1]:
import pandas as pd 
import numpy as np 
import random

from sklearn.utils import shuffle

In [2]:
n_students = 3000
np.random.seed(30)  # fixed seed so every run draws the same students

# Possibilities: the categories each simulated attribute can take.
gender = ['M', 'F']
nationality = ['Dutch', 'English', 'Germany', 'Belgium']
education = ['Bachelor', 'Master', 'PhD']
age = [23, 24, 25, 26, 27, 28, 29]
highschool = ['VMBO', 'HAVO', 'VWO']
studyfield = ['Science', 'Math', 'Business', 'Economics', 'Psychology', 'Linguistics']
ethnicity = ['White', 'Black', 'Asian']
exchange = ['Yes', 'No']
grade = [6, 6.5, 7, 7.5, 8, 8.5, 9, 9.5, 10]
english = ['A', 'B', 'C']


# Probabilities: one weight per category, in the same order as the lists above.
# Gender is the only attribute whose distribution differs per study group.
p_gender_beta = [0.75, 0.25]
p_gender_gamma = [0.5, 0.5]
p_gender_alpha = [0.25, 0.75]

p_nationalities = [0.7, 0.1, 0.1, 0.1]
p_education = [0.15, 0.75, 0.1]
p_age = [0.1, 0.2, 0.3, 0.2, 0.1, 0.05, 0.05]
p_highschool = [0.1, 0.2, 0.7]
p_studyfield = [0.2, 0.1, 0.3, 0.2, 0.1, 0.1]
p_ethnicity = [0.4, 0.3, 0.3]
p_exchange = [0.5, 0.5]
p_grade = [0.2, 0.2, 0.3, 0.1, 0.1, 0.05, 0.02, 0.02, 0.01]
p_english = [0.1, 0.5, 0.4]


def _simulate_group(field_indices, p_gender):
    """Draw the students of one study group.

    Parameters
    ----------
    field_indices : sequence of int
        Positions in ``studyfield`` / ``p_studyfield`` of the study fields
        that make up the group.
    p_gender : sequence of float
        Probabilities for the entries of ``gender`` within this group.

    Returns
    -------
    pandas.DataFrame
        One row per simulated student. The number of rows is the group's
        combined share of ``p_studyfield`` times ``n_students``.
    """
    fields = [studyfield[i] for i in field_indices]
    weights = np.array([p_studyfield[i] for i in field_indices])
    group_share = weights.sum()
    # round() rather than a bare int(): the float product can land a hair
    # below the intended whole number, and int() would then drop a student.
    n_rows = int(round(n_students * group_share))

    def draw(options, probs):
        return np.random.choice(options, n_rows, replace=True, p=probs)

    # The dict is evaluated top to bottom, so the columns consume the random
    # stream in exactly this order; keep the order stable for reproducibility.
    return pd.DataFrame({
        'Gender': draw(gender, p_gender),
        'Nationality': draw(nationality, p_nationalities),
        'Education': draw(education, p_education),
        'Age': draw(age, p_age),
        'Highschool': draw(highschool, p_highschool),
        # Split inside the group follows p_studyfield (renormalised to 1)
        # instead of a fixed 0.66 / 0.34 for every group.
        'Studyfield': draw(fields, weights / group_share),
        'Ethnicity': draw(ethnicity, p_ethnicity),
        'Exchange': draw(exchange, p_exchange),
        'Grade': draw(grade, p_grade),
        'English': draw(english, p_english),
    })


# Beta studies (Science, Math)
df_b = _simulate_group([0, 1], p_gender_beta)

# Gamma studies (Business, Economics)
df_g = _simulate_group([2, 3], p_gender_gamma)

# Alpha studies (Psychology, Linguistics)
df_a = _simulate_group([4, 5], p_gender_alpha)
 
# Create DataFrame
# Stack the three study groups into one frame. ignore_index=True gives every
# student a unique row label; without it each group keeps its own 0..n-1
# index and the combined frame carries duplicate labels.
df = pd.concat([df_b, df_g, df_a], axis=0, ignore_index=True)
# Randomise the row order so the groups are no longer stored back to back.
df = shuffle(df)

The probability of being targeted is constructed from four variables:
- Studyfield 
- Grade
- English 
- Education

Two forms of bias: 
- Sample bias: men are over- or underrepresented in some study fields
- Recruitment bias: men are more likely to be targeted than women

In [3]:
# There are five categories of target probability:
VERY_LOW = 0.05
LOW = 0.1
NEUTRAL = 0.15
HIGH = 0.3
VERY_HIGH = 0.7

# Float from the start: the column only ever holds fractional values, and
# writing floats into an integer column is deprecated in recent pandas.
df['Prob'] = 0.0

# Only Studyfield, Education, Grade and English influence the probability.
# Gender, Nationality, Age, Highschool, Ethnicity and Exchange do not matter.
# The rules are applied in sequence; a later rule overwrites an earlier one.

is_relevant = df['Studyfield'].isin(["Math", "Science"])
is_semirelevant = df['Studyfield'].isin(["Business", "Economics"])
is_other = df['Studyfield'].isin(["Psychology", "Linguistics"])
is_phd = df['Education'] == "PhD"
is_bachelor = df['Education'] == "Bachelor"
has_low_grade = df['Grade'] < 7.5

#### Studyfield ####
# Most relevant. Sets the default probability per study field.
df.loc[is_other, "Prob"] = VERY_LOW
df.loc[is_semirelevant, "Prob"] = NEUTRAL
df.loc[is_relevant, "Prob"] = VERY_HIGH

#### Education (PhD) ####
# PhD: becomes 'Neutral' for studies with 'Very low' and 'High' for studies
# with 'Neutral'. Master: keeps the study-field default.
df.loc[is_other & is_phd, "Prob"] = NEUTRAL
df.loc[is_semirelevant & is_phd, "Prob"] = HIGH

#### Grade ####
# Relevant studies: Very high --> High if grade < 7.5
# Semirelevant studies: Neutral --> Low if grade < 7.5
df.loc[is_semirelevant & has_low_grade, "Prob"] = LOW
df.loc[is_relevant & has_low_grade, "Prob"] = HIGH

#### Education (Bachelor) ####
# Bachelor: Very low, regardless of study field. Applied after the grade
# rules so that a low grade cannot lift a Bachelor student back up to
# 'Low' or 'High'.
df.loc[is_bachelor, "Prob"] = VERY_LOW

#### English ####
# Regardless of the study, no English means basically no chance.
# NOTE(review): level 'A' is treated as "no English" here - confirm the scale.
df.loc[(df['English'] == 'A'), 'Prob'] = VERY_LOW


#### Introduce Bias ####
# Disabled step, kept for reference. If re-enabled, the lines below leave the
# probability of women unchanged and multiply that of men by 1.11 (capped at
# 1 and rounded to two decimals), storing the result in 'Prob_Target'.
#Downweigh women with 5 - 10%
#Upweigh men with 5 - 10%
#df['Prob_Target'] = df.apply(lambda row: row['Prob']   if row['Gender'] == 'F'  else round(min(row['Prob'] * 1.11,1),2), axis = 1 )
#del df['Prob']

### Probabilities --> Binary ###
# One Bernoulli trial per student, drawn in row order: 1 = targeted,
# 0 = not targeted, with success probability taken from 'Prob'.
Target = [np.random.binomial(1, prob) for prob in df['Prob']]
df['Target'] = Target


# Bare expression: evaluates the finished frame.
df
## See ##
# df.head()

## Write to CSV ##
# NOTE(review): absolute path on one specific Windows profile - adjust before
# running on another machine.
csv_path = r"C:\Users\Jasper Rouschop\Documents\Internal Projects\Project Albatros\Case study - AI Recruiter\student_dataset.csv"
df.to_csv(csv_path)