In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk

In [3]:
# Importation des jeux de données de CV et d'étiquettes
import pandas as pd
df = pd.read_json('data.json')
df.description
label = pd.read_csv('label.csv')

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score
from nltk.corpus import stopwords
from sklearn import svm
from nltk.tokenize import word_tokenize
from sklearn import metrics
import numpy as np

In [70]:
# Voilà comment choisir un échantillon. On sélectionne un nombre plus petit par soucis de performances. Optionnel
df = df.loc[0:10000]
print(df)

         id                                        description gender
0         0   She is also a Ronald D. Asmus Policy Entrepre...      F
1         1   He is a member of the AICPA and WICPA. Brent ...      M
2         2   Dr. Aster has held teaching and research posi...      M
4         3   He runs a boutique design studio attending cl...      M
5         4   He focuses on cloud security, identity and ac...      M
...     ...                                                ...    ...
9995   8011   His most recent poetry collection is The Name...      M
9996   8012   He is a past president of the Houston Psychol...      M
9998   8013   He also holds an appointment in Molecular Phy...      M
9999   8014   She has been scrapbooking since 2002, mostly ...      F
10000  8015   She has worked in the field of disaster manag...      F

[8016 rows x 3 columns]


In [71]:
# Voilà comment choisir un échantillon. On sélectionne un nombre plus petit par soucis de performances. Optionnel
label = label.loc[0:8015]
print(label)

        Id  Category
0        0        19
1        1         9
2        2        19
3        3        24
4        4        24
...    ...       ...
8011  8011        15
8012  8012        22
8013  8013        19
8014  8014        11
8015  8015        19

[8016 rows x 2 columns]


In [6]:
# Phase de préprocessing : on met en minuscule pour que les mots qui commencent
# par une majuscule et une miniscule ne soit pas compter différemment.
df['description'] = df['description'].str.lower()
print(df)

            Id                                        description gender
0            0   she is also a ronald d. asmus policy entrepre...      F
1            1   he is a member of the aicpa and wicpa. brent ...      M
2            2   dr. aster has held teaching and research posi...      M
4            3   he runs a boutique design studio attending cl...      M
5            4   he focuses on cloud security, identity and ac...      M
...        ...                                                ...    ...
271492  217192   a member of the uwa cultural collections boar...      M
271493  217193   kelly has worked globally leading teams of co...      F
271494  217194   he's the lead author of a recent study that f...      M
271495  217195   she specializes in the theoretical and pedago...      F
271496  217196   since she was 10 years old she has become a m...      F

[217197 rows x 3 columns]


In [7]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [8]:
# On supprime les virgules, points...
df['description'] = df['description'].str.replace('[^\w\s]','')
print(df['description'])

0          she is also a ronald d asmus policy entrepren...
1          he is a member of the aicpa and wicpa brent g...
2          dr aster has held teaching and research posit...
4          he runs a boutique design studio attending cl...
5          he focuses on cloud security identity and acc...
                                ...                        
271492     a member of the uwa cultural collections boar...
271493     kelly has worked globally leading teams of co...
271494     hes the lead author of a recent study that fo...
271495     she specializes in the theoretical and pedago...
271496     since she was 10 years old she has become a m...
Name: description, Length: 217197, dtype: object


In [37]:
# On isole chaque mot de chaque colonne afin de pouvoir les compter. Nous n'avons finalement pas exécuter cette cellule car on ne peut pas passer de colonnes tokeniser
# en paramètre de la fonction train_test_split.
# resumeExtractToken = resumeExtract.apply(word_tokenize)
# print(resumeExtractToken)

0        [she, is, also, a, ronald, d, asmus, policy, e...
1        [he, is, a, member, of, the, aicpa, and, wicpa...
2        [dr, aster, has, held, teaching, and, research...
4        [he, runs, a, boutique, design, studio, attend...
5        [he, focuses, on, cloud, security, identity, a...
                               ...                        
9995     [his, most, recent, poetry, collection, is, th...
9996     [he, is, a, past, president, of, the, houston,...
9998     [he, also, holds, an, appointment, in, molecul...
9999     [she, has, been, scrapbooking, since, 2002, mo...
10000    [she, has, worked, in, the, field, of, disaste...
Name: description, Length: 8016, dtype: object


In [38]:
# On applique d'autres préprocessing pour voir leur influence sur le f1 score
# On commence par enlever les stopwords : on récupère la liste de stopwords en 
# anglais. Nous n'avons finalement pas exécuter cette cellule car on ne peut pas passer de colonnes tokeniser en paramètre de la fonction train_test_split.
#nltk.download('stopwords')
#stop_words = stopwords.words('english')
#print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\remiv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
# Nous n'avons finalement pas exécuter cette cellule car on ne peut pas passer de colonnes tokeniser en paramètre de la fonction train_test_split.
#resumeExtractToken = resumeExtractToken.apply(lambda x: [item for item in x if item not in stop_words])
#print(resumeExtractToken)

0        [also, ronald, asmus, policy, entrepreneur, fe...
1        [member, aicpa, wicpa, brent, graduated, unive...
2        [dr, aster, held, teaching, research, position...
4        [runs, boutique, design, studio, attending, cl...
5        [focuses, cloud, security, identity, access, m...
                               ...                        
9995     [recent, poetry, collection, names, things, ne...
9996     [past, president, houston, psychological, asso...
9998     [also, holds, appointment, molecular, physiolo...
9999     [scrapbooking, since, 2002, mostly, european, ...
10000    [worked, field, disaster, management, 20, year...
Name: description, Length: 8016, dtype: object


In [9]:
# On sépare nos extraits de CV et leurs catégories en données d'entraînement  
# et de tests de manière aléatoire.
X_train,X_test,Y_train,Y_test = train_test_split(df['description'],label.Category.values, test_size=0.30)

In [10]:
X_test_save = X_test

In [None]:
# On utilise TfidfVectorizer pour repérer la fréquence d'apparition de chaque
# mots présents dans nos données d'entraînements.
transformer = TfidfVectorizer().fit(X_train.values)
X_train = transformer.transform(X_train.values)
X_test = transformer.transform(X_test.values)

In [11]:
# On utilise le modèle d'apprentissage automatique régression linéaire
model = LogisticRegression(max_iter=2000)
model.fit(X_train,Y_train)
y_pred = model.predict(X_test)



In [12]:
# On calcule le f1_score (indicateur de précision, + on est proche de 1 + on est précis)
# La valeur f1_score est la moyenne du f1_score pour chaque métier.
# On effectue une pondération en fonction du nombre d'occurences d'un métier.
f1 = f1_score(Y_test,y_pred,average='macro')
print(f1)

0.7152675060837723


In [60]:
# On affiche les paramètres de linear regression.
model.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 2000,
 'multi_class': 'warn',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'warn',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [58]:
model.set_params(fit_intercept=False)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [59]:
model.fit(X_train,Y_train)
y_pred = model.predict(X_test)
f1 = f1_score(Y_test,y_pred,average='macro')
print(f1)

0.338321859674915


In [62]:
# On a vu que mettre la valeur de fit_intercept à False diminue le f1 score, donc je vais la remettre.
model.set_params(fit_intercept=True)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
model.set_params(class_weight='balanced')

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=2000, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
model.fit(X_train,Y_train)
y_pred = model.predict(X_test)
f1 = f1_score(Y_test,y_pred,average='macro')
print(f1)

0.7294220452769585


In [65]:
# On a vu que le poids augmente le f1Score. On essaye maintenant d'augmenter le nombre d'itérations.
model.set_params(max_iter=4000)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=4000, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [66]:
model.fit(X_train,Y_train)
y_pred = model.predict(X_test)
f1 = f1_score(Y_test,y_pred,average='macro')
print(f1)

0.5885925404245664


In [67]:
# Avoir doubler le nombre d'itérations n'a eu aucun importance. Mme LACLAU nous a conseillé de tester le paramètre C. C doit être un bon compromis pour lutter contre
# l'underfitting et l'overfitting. Et de ne prendre qu'un seul autre modèle de classification. SVM par exemple ?
model.set_params(max_iter=2000)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=2000, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [63]:
model.set_params(C=0.5)

LogisticRegression(C=0.5, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=2000, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [64]:
model.fit(X_train,Y_train)
y_pred = model.predict(X_test)
f1 = f1_score(Y_test,y_pred,average='macro')
print(f1)

0.5841838844100915


In [15]:
# Diminuer C a réduit le f1score, essayons de l'augmenter.
model.set_params(C=10)

LogisticRegression(C=10, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=2000, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
model.fit(X_train,Y_train)
y_pred = model.predict(X_test)
f1 = f1_score(Y_test,y_pred,average='macro')
print(f1)

0.7455537959317455


In [84]:
print(metrics.classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.25      0.38        20
           1       0.66      0.61      0.63        41
           2       0.73      0.73      0.73        11
           3       0.50      0.47      0.48       111
           4       1.00      0.40      0.57        10
           5       0.89      0.66      0.76        38
           6       0.61      0.85      0.71       210
           7       1.00      0.18      0.31        11
           8       0.73      0.63      0.68        57
           9       0.74      0.57      0.64        30
          10       1.00      0.18      0.31        11
          11       0.42      0.47      0.45        59
          12       0.54      0.54      0.54        24
          13       0.56      0.67      0.61        54
          14       0.61      0.71      0.66        49
          15       0.76      0.65      0.70        52
          16       0.62      0.38      0.48        13
          17       0.71    

In [88]:
model.set_params(solver='liblinear')

LogisticRegression(C=10, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=2000, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [89]:
model.fit(X_train,Y_train)
y_pred = model.predict(X_test)
f1 = f1_score(Y_test,y_pred,average='macro')
print(f1)

0.5914102436779226


In [None]:
model.set_params(solver='newton-cg')

In [None]:
model.fit(X_train,Y_train)
y_pred = model.predict(X_test)
f1 = f1_score(Y_test,y_pred,average='macro')
print(f1)



In [21]:
# Creation d'un fichier csv des prédictions
predict = pd.DataFrame(X_test, y_pred).to_csv('predict.csv')

In [85]:
df_learning = pd.DataFrame({
    "Description": X_test_save,
    "Metier_Reel": Y_test,
    "Metier_Predit": y_pred,
})

In [86]:
predict = df_learning.to_csv('predict.csv',sep=',')

In [21]:
predict = pd.DataFrame(X_test, Y_test).to_csv('predict3.csv')

In [67]:
# Les tests suivant n'ont pas donnés de meilleurs résultats. Toutes les cellules suivantes sont optionnelles. Elles contiennent par contre notre test avec svm.
# Vous pouvez exécuter les deux dernières cellules si vous voulez comparer le f1 score d svm avec celui de la regréssion logistique.
# Testons pour une valeur plus grande de C.
#model.set_params(C=100)

LogisticRegression(C=100, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=2000, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [68]:
#model.fit(X_train,Y_train)
#y_pred = model.predict(X_test)
#f1 = f1_score(Y_test,y_pred,average='macro')
#print(f1)

0.5748537249477804


In [19]:
# Testons pour une valeur plus grande de C.
#model.set_params(C=40)

LogisticRegression(C=40, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=2000, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
#model.fit(X_train,Y_train)
#y_pred = model.predict(X_test)
#f1 = f1_score(Y_test,y_pred,average='macro')
#print(f1)

0.5936290856584262


In [32]:
# On va garder une valeur de 10 pour le paramètre C.
#model.set_params(C=10)

LogisticRegression(C=10, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=2000, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
modelSvm = svm.SVC(kernel='linear')
modelSvm.set_params(C=10)
modelSvm.set_params(class_weight='balanced')
modelSvm.fit(X_train, Y_train)
y_pred_svm = modelSvm.predict(X_test)

In [72]:
# le f1 score est inférieur à celui de la régression logistique.
f1 = f1_score(Y_test,y_pred_svm,average='macro')
print(f1)

0.5239676653528541
