In this notebook we describe the code use to produce the baseline. 

# Data and Libraries

In [1]:
import pandas as pd
import pickle
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import pandas as pd
from sklearn.metrics import f1_score
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.pipeline import Pipeline
import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
# Mettre ici le PATH vers le répertoire contenant le fichier Train.json
DATA_PATH = "/Users/Ben/Jupyter_Notebooks/Defi_IA/Data"

train_df = pd.read_json(DATA_PATH+"/train.json")
train_label = pd.read_csv(DATA_PATH+"/train_label.csv")

# test_df = pd.read_json(DATA_PATH+"/test.json") : inutile ici car pas de label associé

In [3]:
train_df

Unnamed: 0,Id,description,gender
0,0,She is also a Ronald D. Asmus Policy Entrepre...,F
1,1,He is a member of the AICPA and WICPA. Brent ...,M
2,2,Dr. Aster has held teaching and research posi...,M
4,3,He runs a boutique design studio attending cl...,M
5,4,"He focuses on cloud security, identity and ac...",M
...,...,...,...
271492,217192,A member of the UWA Cultural Collections Boar...,M
271493,217193,Kelly has worked globally leading teams of co...,F
271494,217194,He's the lead author of a recent study that f...,M
271495,217195,She specializes in the theoretical and pedago...,F


In [4]:
train_label

Unnamed: 0,Id,Category
0,0,19
1,1,9
2,2,19
3,3,24
4,4,24
...,...,...
217192,217192,19
217193,217193,22
217194,217194,19
217195,217195,19


# Cleaning

The only cleaning transformation applied here is that we `lower` the data so that all words are lower case. 
Hence `research`and `Research` will be considered as similar word.

You might want to look at other cleaning step such that removing stopwords, stemming words, etc.

In [5]:
train_df["description_clean"] = [x.lower() for x in train_df.description]

# test_df["description_clean"] = [x.lower() for x in test_df.description]

In [6]:
train_df

Unnamed: 0,Id,description,gender,description_clean
0,0,She is also a Ronald D. Asmus Policy Entrepre...,F,she is also a ronald d. asmus policy entrepre...
1,1,He is a member of the AICPA and WICPA. Brent ...,M,he is a member of the aicpa and wicpa. brent ...
2,2,Dr. Aster has held teaching and research posi...,M,dr. aster has held teaching and research posi...
4,3,He runs a boutique design studio attending cl...,M,he runs a boutique design studio attending cl...
5,4,"He focuses on cloud security, identity and ac...",M,"he focuses on cloud security, identity and ac..."
...,...,...,...,...
271492,217192,A member of the UWA Cultural Collections Boar...,M,a member of the uwa cultural collections boar...
271493,217193,Kelly has worked globally leading teams of co...,F,kelly has worked globally leading teams of co...
271494,217194,He's the lead author of a recent study that f...,M,he's the lead author of a recent study that f...
271495,217195,She specializes in the theoretical and pedago...,F,she specializes in the theoretical and pedago...


# Fairness

In [7]:
names = pd.read_csv(DATA_PATH+'/categories_string.csv')['0'].to_dict()
names  # dictionnaire entre numéro de catégorie et job string

{0: 'pastor',
 1: 'model',
 2: 'yoga_teacher',
 3: 'teacher',
 4: 'personal_trainer',
 5: 'painter',
 6: 'journalist',
 7: 'interior_designer',
 8: 'surgeon',
 9: 'accountant',
 10: 'dj',
 11: 'physician',
 12: 'comedian',
 13: 'software_engineer',
 14: 'nurse',
 15: 'poet',
 16: 'dentist',
 17: 'chiropractor',
 18: 'filmmaker',
 19: 'professor',
 20: 'photographer',
 21: 'rapper',
 22: 'psychologist',
 23: 'paralegal',
 24: 'architect',
 25: 'composer',
 26: 'attorney',
 27: 'dietitian'}

In [8]:
# A chaque index, fait correspondre le job en clair
jobs = train_label['Category']
jobs = jobs.map(names)
jobs = jobs.rename('job')

In [9]:
jobs

0            professor
1           accountant
2            professor
3            architect
4            architect
              ...     
217192       professor
217193    psychologist
217194       professor
217195       professor
217196           model
Name: job, Length: 217197, dtype: object

In [10]:
genders = train_df.set_index('Id')['gender']

In [11]:
genders

Id
0         F
1         M
2         M
3         M
4         M
         ..
217192    M
217193    F
217194    M
217195    F
217196    F
Name: gender, Length: 217197, dtype: object

In [12]:
people = pd.concat((jobs, genders), axis='columns')

In [13]:
people

Unnamed: 0,job,gender
0,professor,F
1,accountant,M
2,professor,M
3,architect,M
4,architect,M
...,...,...
217192,professor,M
217193,psychologist,F
217194,professor,M
217195,professor,F


In [14]:
counts = people.groupby(['job', 'gender']).size().unstack('gender')
counts

gender,F,M
job,Unnamed: 1_level_1,Unnamed: 2_level_1
accountant,1129,1992
architect,1314,4527
attorney,7106,11714
chiropractor,391,1015
comedian,345,1294
composer,553,2842
dentist,1895,3555
dietitian,2120,168
dj,125,706
filmmaker,1394,2730


In [15]:
counts['disparate_impact'] = counts[['M', 'F']].max(axis='columns') / counts[['M', 'F']].min(axis='columns')
counts.sort_values('disparate_impact', ascending=False)

gender,F,M,disparate_impact
job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dietitian,2120,168,12.619048
rapper,64,719,11.234375
nurse,11493,1129,10.179805
surgeon,890,5726,6.433708
yoga_teacher,803,141,5.695035
dj,125,706,5.648
software_engineer,613,3447,5.623165
paralegal,814,153,5.320261
composer,553,2842,5.139241
model,3398,717,4.739191


In [16]:
def macro_disparate_impact(people):
    counts = people.groupby(['job', 'gender']).size().unstack('gender')
    counts['disparate_impact'] = counts[['M', 'F']].max(axis='columns') / counts[['M', 'F']].min(axis='columns')
    return counts

In [17]:
counts = macro_disparate_impact(people)
print(counts)
print ("BASELINE Disparate Impact = ", counts['disparate_impact'].mean())

gender                 F      M  disparate_impact
job                                              
accountant          1129   1992          1.764393
architect           1314   4527          3.445205
attorney            7106  11714          1.648466
chiropractor         391   1015          2.595908
comedian             345   1294          3.750725
composer             553   2842          5.139241
dentist             1895   3555          1.875989
dietitian           2120    168         12.619048
dj                   125    706          5.648000
filmmaker           1394   2730          1.958393
interior_designer    694    164          4.231707
journalist          6123   6172          1.008003
model               3398    717          4.739191
nurse              11493   1129         10.179805
painter             2131   2490          1.168466
paralegal            814    153          5.320261
pastor               360   1137          3.158333
personal_trainer     364    443          1.217033


# Préparation Data

In [18]:
descriptions = train_df.set_index('Id')['description_clean']

In [19]:
descriptions

Id
0          she is also a ronald d. asmus policy entrepre...
1          he is a member of the aicpa and wicpa. brent ...
2          dr. aster has held teaching and research posi...
3          he runs a boutique design studio attending cl...
4          he focuses on cloud security, identity and ac...
                                ...                        
217192     a member of the uwa cultural collections boar...
217193     kelly has worked globally leading teams of co...
217194     he's the lead author of a recent study that f...
217195     she specializes in the theoretical and pedago...
217196     since she was 10 years old she has become a m...
Name: description_clean, Length: 217197, dtype: object

In [20]:
X_train, X_test, y_train, y_test, gender_train, gender_test = model_selection.train_test_split(
    descriptions,
    jobs,
    genders,
    test_size=0.9,    # ratio du set utilisé pour le test
    random_state=42
)

In [21]:
X_train

Id
123299     he is regarded as the “king of the college ci...
186376     he leads a research group that uses numerical...
87971      he graduated cum laude with a b.s. in biology...
151965     she is the director of the cancer rehabilitat...
172959     currently, she is producing a documentary tha...
                                ...                        
119879     she was born in walsall on june 6, 1992. she ...
103694     prior to this role, elika worked from 2012 to...
131932     he was visiting scholar at indiana university...
146867     he w\has a net worth of $100 million. in addi...
121958     gallagher school. this is her 24th year teach...
Name: description_clean, Length: 21719, dtype: object

In [22]:
y_train

123299     comedian
186376    professor
87971       dentist
151965    professor
172959         poet
            ...    
119879        model
103694     attorney
131932    professor
146867       pastor
121958      teacher
Name: job, Length: 21719, dtype: object

In [23]:
X_test

Id
83232      her research area is engineering education. s...
19036      he joined the faculty at isu in 2007 followin...
35087      he likes to shoot unposed photographs of inte...
86945      he received a b.s. with honors in electrical ...
79762      in 2003 she began writing and editing for loc...
                                ...                        
80570      he has 22 years of experience. his specialtie...
180603     he has over 4.5 years of consulting experienc...
202523     she is passionate about holistic health and n...
25236      she serves as the washington bureau chief for...
163471     she is the author of two books, the woodcarve...
Name: description_clean, Length: 195478, dtype: object

In [24]:
y_test

83232        professor
19036        professor
35087     photographer
86945        professor
79762        architect
              ...     
80570          dentist
180603       professor
202523    photographer
25236       journalist
163471            poet
Name: job, Length: 195478, dtype: object

In [25]:
gender_test

Id
83232     F
19036     M
35087     M
86945     M
79762     F
         ..
80570     M
180603    M
202523    F
25236     F
163471    F
Name: gender, Length: 195478, dtype: object

## Fonction évaluation d'un modèle

In [26]:
def eval(modele, nom = "inconnu"):
    # fit et mesure temps
    print("évaluation modèle : " + nom)
    print("----------------------")
    print("start training")
    start = time.perf_counter()
    modele = modele.fit(X_train, y_train)
    end = time.perf_counter()
    duree_min = int((end - start)/60)
    print("fin training en "+str(duree_min)+" minutes")
    print("----------------------")
    print(" ")
    # prédictions
    y_pred = modele.predict(X_test)
    y_pred = pd.Series(y_pred, name='job', index=X_test.index)
    # F1-scores et Macro-F1
    scores = f1_score(y_test, y_pred, average=None, sample_weight=None, zero_division='warn')
    print("F1 scores = ", scores)
    print ("Macro-F1 = ", scores.mean())
    print(" ")
    # Disparate Impact
    test_people = pd.concat((y_pred, gender_test), axis='columns')
    counts = macro_disparate_impact(test_people)
    print("----------------------")
    print(counts)
    print(" ")
    print("Score disparate impact = ", counts['disparate_impact'].mean())
    print("----------------------")

# BASELINE - Modèle Regression Logistique

We use TfidfVectorizer to transform words from text to numerical vector data.  

More vectorize are available on scikit-learn -> https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text

You also may want to have a look at words embedding methods (Word2vec, Glove, etc..)

In [27]:
vect = TfidfVectorizer()  # Vectorisation 'basique'

In [28]:
norm = preprocessing.Normalizer()   # Normalisation

In [29]:
core = LogisticRegression(multi_class='multinomial',
                          tol = 0.1, # pour une convergence rapide
                          max_iter = 1000)    # Regression logistique basique

In [30]:
liste_composants = [('Vectorisation', vect),
                    ('Normalisation', norm),
                    ('Coeur', core)]

In [31]:
modele = Pipeline(liste_composants)

In [32]:
# eval(modele,"regresion logistique")

# MODELE 1 - Regression Logistique avec Weighted Classes

In [33]:
vect = TfidfVectorizer()  # Vectorisation 'basique'
norm = preprocessing.Normalizer()   # Normalisation
core = LogisticRegression(multi_class='multinomial',
                          tol = 0.1, # pour une convergence rapide
                          max_iter = 1000,
                          class_weight = 'balanced',   # classes pondérées
                         )   

liste_composants = [('Vectorisation', vect),
                    ('Normalisation', norm),
                    ('Coeur', core)]

modele = Pipeline(liste_composants)

In [34]:
# eval(modele,"regresion logistique avec weighted classes")

# MODELE 2 - SVM weighted classes

In [35]:
vect = TfidfVectorizer()  # Vectorisation 'basique'
norm = preprocessing.Normalizer()   # Normalisation
core = SVC(C = 1.0, # coeff régularisation L2
           kernel = 'rbf', # Gaussian Kernel, classique, 
           # https://en.wikipedia.org/wiki/Radial_basis_function_kernel
           gamma = 'scale', # choix paramètre du Gaussian Kernel
           tol = 0.1,  # tolérance basse pour ne pas trop attendre
           class_weight = 'balanced',  # permet d'affecter des poids en fonction de la fréquence des classes.
           max_iter = 10000,   # on augmente le nombre d'itérations pour avoir la convergence
           decision_function_shape = 'ovr', # one-versus-rest classifiers en multi-classes
           verbose = True
           )

liste_composants = [('Vectorisation', vect),
                    ('Normalisation', norm),
                    ('Coeur', core)]

modele = Pipeline(liste_composants)

In [36]:
# eval(modele,"SVM weighted classes")

# MODELE 3 - Decision Tree

In [37]:
vect = TfidfVectorizer()  # Vectorisation 'basique'
# norm = preprocessing.Normalizer()   # Normalisation
core = DecisionTreeClassifier(max_depth = None,  # profondeur max de l'arbre
                              min_samples_split = 2,  # taille minimum pour splitter une branche : 2 points
                              random_state = 42,
                              # class_weight = 'balanced',
                             )

liste_composants = [('Vectorisation', vect),
                    ('Normalisation', norm),
                    ('Coeur', core)]

modele = Pipeline(liste_composants)

In [38]:
eval(modele,"Decision Tree")

évaluation modèle : Decision Tree
----------------------
start training
fin training en 0 minutes
----------------------
 
F1 scores =  [0.45844749 0.37347095 0.71165255 0.40589474 0.50164799 0.46412519
 0.82246781 0.59583122 0.32770745 0.53825575 0.28496389 0.44384597
 0.48198395 0.75320261 0.50909535 0.25091799 0.19582665 0.25849213
 0.65454684 0.54767079 0.4969945  0.7461078  0.44780355 0.26596675
 0.33480047 0.52747253 0.32686742 0.54044549]
Macro-F1 =  0.47380377917416505
 
----------------------
gender                 F      M  disparate_impact
job                                              
accountant          1009   1679          1.664024
architect           1436   3771          2.626045
attorney            6911  10888          1.575459
chiropractor         211    901          4.270142
comedian             460   1108          2.408696
composer             596   2335          3.917785
dentist             1755   3220          1.834758
dietitian           1669    199          8.

# MODELE 4 - Decision Tree weighted classes

In [39]:
vect = TfidfVectorizer()  # Vectorisation 'basique'
# norm = preprocessing.Normalizer()   # Normalisation
core = DecisionTreeClassifier(max_depth = None,  # profondeur max de l'arbre
                              min_samples_split = 2,  # taille minimum pour splitter une branche : 2 points
                              random_state = 42,
                              # class_weight = 'balanced',
                             )

liste_composants = [('Vectorisation', vect),
                    ('Normalisation', norm),
                    ('Coeur', core)]

modele = Pipeline(liste_composants)

In [40]:
# eval(modele,"Decision Tree weighted classes")

# MODELE 5 - Random Forest

In [41]:
vect = TfidfVectorizer()  # Vectorisation 'basique'
norm = preprocessing.Normalizer()   # Normalisation
core = RandomForestClassifier(n_estimators=1000, # nombre de Decision Trees
                              max_depth = None,  # profondeur max de l'arbre
                              min_samples_split = 2,  # taille minimum pour splitter une branche
                              random_state = 42,
                              # class_weight = 'balanced',
                             )

liste_composants = [('Vectorisation', vect),
                    ('Normalisation', norm),
                    ('Coeur', core)]

modele = Pipeline(liste_composants)

In [42]:
eval(modele,"Random Forest")

évaluation modèle : Random Forest
----------------------
start training
fin training en 11 minutes
----------------------
 
F1 scores =  [0.35010297 0.24346141 0.82005027 0.25342466 0.62773072 0.45208518
 0.86703057 0.37697953 0.14105793 0.59839423 0.04342273 0.52660934
 0.58139974 0.77570917 0.57310897 0.0045403  0.07223796 0.01861702
 0.75767773 0.61218711 0.50893647 0.68295771 0.42975614 0.11827957
 0.18929845 0.60712292 0.24510188 0.31658768]
Macro-F1 =  0.42120958491666766
 
----------------------
gender                   F        M  disparate_impact
job                                                  
accountant           274.0    338.0          1.233577
architect            206.0    731.0          3.548544
attorney            6497.0  10395.0          1.599969
chiropractor          22.0    175.0          7.954545
comedian             166.0    611.0          3.680723
composer             138.0   1294.0          9.376812
dentist             1510.0   2762.0          1.829139
dietit

# MODELE 6 - Random Forest weighted classes

In [43]:
vect = TfidfVectorizer()  # Vectorisation 'basique'
norm = preprocessing.Normalizer()   # Normalisation
core = RandomForestClassifier(n_estimators=1000, # nombre de Decision Trees
                              max_depth = None,  # profondeur max de l'arbre
                              min_samples_split = 2,  # taille minimum pour splitter une branche
                              random_state = 42,
                              class_weight = 'balanced',
                             )

liste_composants = [('Vectorisation', vect),
                    ('Normalisation', norm),
                    ('Coeur', core)]

modele = Pipeline(liste_composants)

In [44]:
# eval(modele,"Random Forest weighted classes")