In [None]:
import re

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from xgboost import XGBClassifier

from nltk import word_tokenize
from nltk.corpus import stopwords

import unidecode

In [None]:
# Load the job postings; the relative path expects the CSV in the notebook's working directory.
df = pd.read_csv("salary_indeed.csv")

# Split Salary

In [None]:
# Wording removed from each salary bound before it is parsed as a number.
_SALARY_NOISE = ("€", "par an", "par mois", "par semaine", "par jour", "par heure")

# (marker searched in the raw salary text, period label stored in the row).
_SALARY_PERIODS = (
    ("par an", "year"),
    ("par mois", "month"),
    ("par semaine", "week"),
    ("par jour", "day"),
    ("par heure", "hour"),
)


def _parse_salary_amount(raw):
    """Convert the text of one salary bound to a float.

    Removes the euro sign and the "par ..." period wording, drops every
    whitespace character and accepts a decimal comma.

    Raises
    ------
    ValueError
        If what remains is not a number.
    """
    for noise in _SALARY_NOISE:
        raw = raw.replace(noise, "")
    # str.split() without arguments splits on any Unicode whitespace, which
    # covers the (narrow) non-breaking spaces used as thousands separators.
    compact = "".join(raw.split())
    return float(compact.replace(",", "."))


def split_salary(row, monthly_threshold=1500):
    """Parse the free-text "Salary" field of a row into numeric bounds and a period.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        Must contain a string under the key "Salary". A range is written
        "<min> - <max>"; a single amount is used for both bounds.
    monthly_threshold : float, optional
        Monthly salaries whose lower bound is below this value stay monthly;
        the others are multiplied by 12 and reported as yearly.
        NOTE(review): the reason for the 1500 cut-off is not documented in the
        notebook -- confirm it is intended.

    Returns
    -------
    The same ``row`` object, with three keys added:
    "salary_min" and "salary_max" (floats) and "salary_period"
    ("year", "month", "week", "day", "hour", or None when the text contains
    none of the known "par ..." markers).

    Raises
    ------
    ValueError
        If a bound cannot be parsed as a number.
    """
    salary = row["Salary"]

    bounds = salary.split("-")
    low_text = bounds[0]
    high_text = bounds[1] if len(bounds) > 1 else bounds[0]

    salary_min = _parse_salary_amount(low_text)
    salary_max = _parse_salary_amount(high_text)

    period = None
    for marker, label in _SALARY_PERIODS:
        if marker in salary:
            period = label
            break

    # Monthly amounts at or above the threshold are converted to a yearly figure.
    if period == "month" and salary_min >= monthly_threshold:
        salary_min *= 12
        salary_max *= 12
        period = "year"

    row["salary_min"] = salary_min
    row["salary_max"] = salary_max
    row["salary_period"] = period
    return row

# Preprocess text

In [None]:
# NLTK's French stop-word list; passed to preprocessing_text when the text columns are cleaned.
stop_words = stopwords.words('french')

1. Mettre en minuscule
2. Remplacer les ponctuations (sauf '+') par des espaces : `[^\w|\s|+]` mais aussi les '|' et '\_' : `[_|\|]`
3. Remplacer les lettres accentuées par des lettres sans accents
4. Remplacer les lettres seules (sauf les lettres c et r (langages de programmation)) par des espaces : `\b[abd-qs-z]\b`
5. Remplacer les nombres qui ont 2 chiffres ou plus par des espaces : `\d{2,}`
6. Splitter la chaîne de caractères en une liste de mots
7. Créer une nouvelle liste sans les stopwords

In [None]:
def preprocessing_text(text, stopwords, prefix=''):
    """Normalise a free-text field into a space-separated string of tokens.

    Steps: lower-case; replace every character that is not a word character,
    whitespace or '+' (and also every '_') with a space; strip accents with
    unidecode; drop isolated single letters other than 'c' and 'r'; drop runs
    of two or more digits; tokenise with NLTK; remove stop words; prepend the
    prefix to each remaining token.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN for a missing cell) yields ''.
    stopwords : iterable of str
        Words to discard. They are lower-cased and accent-stripped before
        matching, because the text has been accent-stripped by that point.
        (Inside this function the name shadows the module-level NLTK import.)
    prefix : str, optional
        When non-empty, each token is returned as '<prefix.lower()>_<token>'.

    Returns
    -------
    str
        The kept tokens joined by single spaces.
    """
    # Missing cells arrive as float NaN / None; treat them as empty text.
    if not isinstance(text, str):
        return ""

    if prefix:
        prefix = prefix.lower() + '_'

    # Without this normalisation, accented stop words such as "été" or "même"
    # would never match the accent-stripped tokens. A set also makes each
    # membership test constant-time.
    stopword_set = {unidecode.unidecode(word.lower()) for word in stopwords}

    text = text.lower()
    # One pass: punctuation (everything but word chars, whitespace and '+'),
    # including '|', plus the underscore that \w would otherwise keep.
    text = re.sub(r'[^\w\s+]|_', ' ', text)
    text = unidecode.unidecode(text)
    # Isolated letters are noise, except 'c' and 'r' (programming languages).
    text = re.sub(r'\b[abd-qs-z]\b', ' ', text)
    text = re.sub(r'\d{2,}', ' ', text)

    tokens = word_tokenize(text)
    kept = [prefix + token for token in tokens if token not in stopword_set]

    return " ".join(kept)

In [None]:
# Keep only the postings that state a salary. The explicit copy makes df_salary
# an independent frame, so the column assignments in later cells neither
# trigger pandas' SettingWithCopyWarning nor depend on df.
df_salary = df[df.Salary.notna()].copy()

In [None]:
# Disabled alternative: clean 'Title' and 'Description' in place, without a
# per-column token prefix. The loop in the next cell is the active version.
#df_salary = df_salary.apply(lambda column: column.apply(preprocessing_text, args=(stop_words,))
#                                          if column.name in ['Title', 'Description']
#                                          else column)

In [None]:
# Add a "<name>_clean" column for each free-text column; the lower-cased column
# name is also used as the token prefix, so title and description tokens stay distinct.
for column in ('Title', 'Description'):
    col_lower = column.lower()
    new_col = col_lower + '_clean'

    cleaned = df_salary[column].apply(
        preprocessing_text, stopwords=stop_words, prefix=col_lower
    )
    df_salary[new_col] = cleaned

In [None]:
# Parse every "Salary" string row by row, then make sure both bounds are numeric.
df_salary = df_salary.apply(split_salary, axis=1)
for bound_col in ("salary_min", "salary_max"):
    df_salary[bound_col] = pd.to_numeric(df_salary[bound_col])

In [None]:
# Column dtypes and non-null counts, to check the parsed salary columns.
df_salary.info()

## Création colonne salary mean

In [None]:
# Midpoint of the advertised range; equals the single amount when the posting gives no range.
df_salary["salary_mean"] = (df_salary["salary_min"] + df_salary["salary_max"]) / 2

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df_salary.head()

# Je ne prends que les salaires qui sont 'par an' 

In [None]:
# Keep yearly salaries only. This includes monthly amounts that split_salary
# converted to a yearly figure; remaining month/week/day/hour rows are dropped.
df_salary = df_salary[df_salary.salary_period == 'year']

## Quantiles salary mean

In [None]:
# Summary statistics of the mean salary for the yearly postings.
df_salary.salary_mean.describe()

In [None]:
# Lower tercile (1/3 quantile) of the mean salary, displayed for inspection.
tercile_1 = np.quantile(df_salary.salary_mean, 1/3)
tercile_1

In [None]:
# Upper tercile (2/3 quantile) of the mean salary, displayed for inspection.
tercile_2 = np.quantile(df_salary.salary_mean, 2/3)
tercile_2

# Création colonne class label par rapport aux quantiles salary mean

In [None]:
# Disabled: tercile-based labelling (labels 1-3 split at tercile_1 / tercile_2).
# The active `classification` in the next cell uses fixed salary thresholds instead.
#def classification(x):
#    if x <= tercile_1:
#        label = 1
#    elif x <= tercile_2:
#        label = 2
#    else:
#        label = 3
#    return label

#df_salary["salary_label"] = df_salary["salary_mean"].apply(classification)

In [None]:
# Keep mean salaries in the (28000, 65000] band. The explicit copy makes the
# result an independent frame, so adding the "salary_label" column to it
# afterwards does not trigger pandas' SettingWithCopyWarning.
df_salary = df_salary[(df_salary.salary_mean > 28000)
                      & (df_salary.salary_mean <= 65000)].copy()

def classification(x):
    """Map a mean salary to an ordinal class label from 0 to 4.

    Each of the first four classes has an inclusive upper bound; any value
    above the last bound (and NaN, which fails every comparison) gets label 4.
    """
    upper_bounds = (36000.0, 42000.0, 47000.0, 53000.0)
    for label, bound in enumerate(upper_bounds):
        if x <= bound:
            return label
    return len(upper_bounds)

# Label each posting with its salary class (0-4), derived from the mean salary.
df_salary["salary_label"] = df_salary["salary_mean"].apply(classification)

In [None]:
# Preview the labelled rows.
df_salary.head()

In [None]:
# Cleaned, prefix-tagged title tokens: the text input for the title TF-IDF features.
# Note: X_title is rebound to the TF-IDF matrix in the next cell.
X_title = df_salary.title_clean

In [None]:
# Tokens are whitespace-separated (the text was already tokenised and prefixed);
# keep the 2500 top uni- to tri-grams.
# Raw string: '\S' in a normal literal is an invalid escape sequence, which
# Python reports with a DeprecationWarning (a SyntaxWarning from 3.12).
# NOTE(review): the vectorizer is fitted on all rows before the train/test split,
# so the vocabulary and IDF weights also see the test rows.
vectorizer = TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 3), max_features=2500)
# Read from the source column rather than from X_title so the cell can be re-run.
X_title = vectorizer.fit_transform(df_salary.title_clean)

In [None]:
# Number of title features. vocabulary_ (term -> column index) exists in every
# scikit-learn version, unlike get_feature_names(), which 1.2 removed.
len(vectorizer.vocabulary_)

In [None]:
# Column labels in matrix order: vocabulary_ maps each term to its column index.
# This is a version-independent replacement for get_feature_names(), which
# scikit-learn 1.2 removed.
title_terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
# toarray() gives a plain ndarray (todense() returns the discouraged np.matrix).
X_title_df = pd.DataFrame(X_title.toarray(), columns=title_terms)
X_title_df.head()

In [None]:
# Despite the name, these are summed TF-IDF weights per title term (not raw
# counts); show the 20 terms with the largest totals.
word_counts = X_title_df.sum(axis=0)
word_counts.sort_values(ascending = False).head(20)

In [None]:
# Cleaned, prefix-tagged description tokens: the text input for the description TF-IDF features.
# Note: X_desc is rebound to the TF-IDF matrix in the next cell.
X_desc = df_salary.description_clean

In [None]:
# Same TF-IDF settings as for the titles; this rebinds `vectorizer` to a new
# object fitted on the descriptions.
# Raw string: '\S' in a normal literal is an invalid escape sequence, which
# Python reports with a DeprecationWarning (a SyntaxWarning from 3.12).
# NOTE(review): fitted on all rows before the train/test split, so the
# vocabulary and IDF weights also see the test rows.
vectorizer = TfidfVectorizer(token_pattern=r'\S+', ngram_range=(1, 3), max_features=2500)
# Read from the source column rather than from X_desc so the cell can be re-run.
X_desc = vectorizer.fit_transform(df_salary.description_clean)

In [None]:
# Number of description features. vocabulary_ (term -> column index) exists in
# every scikit-learn version, unlike get_feature_names(), which 1.2 removed.
len(vectorizer.vocabulary_)

In [None]:
# Column labels in matrix order: vocabulary_ maps each term to its column index.
# This is a version-independent replacement for get_feature_names(), which
# scikit-learn 1.2 removed.
desc_terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
# toarray() gives a plain ndarray (todense() returns the discouraged np.matrix).
X_desc_df = pd.DataFrame(X_desc.toarray(), columns=desc_terms)
X_desc_df.head()

In [None]:
# Despite the name, these are summed TF-IDF weights per description term (not
# raw counts); show the 20 terms with the largest totals.
word_counts = X_desc_df.sum(axis=0)
word_counts.sort_values(ascending = False).head(20)

In [None]:
# One-hot encode the search department. The index is reset to 0..n-1 so these
# rows line up with the TF-IDF frames (built without an index) in the later
# column-wise concat.
X_dpt_df = pd.get_dummies(df_salary.Department_Search).reset_index(drop=True)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
X_dpt_df.head()

In [None]:
# Feature matrix: title TF-IDF + description TF-IDF + department dummies,
# joined column-wise on their shared 0..n-1 index.
X = pd.concat([X_title_df, X_desc_df, X_dpt_df], axis=1)
# Give the target the same 0..n-1 index as X; without the reset, y keeps the
# original row labels and only matches X by position.
y = df_salary.salary_label.reset_index(drop=True)

In [None]:
# Stratified split with a fixed seed, so the split (and every score computed
# from it) is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [None]:
# 'saga' supports both the L1 and L2 penalties searched below. It shuffles the
# data, so a fixed seed is needed for repeatable results.
logreg = LogisticRegression(solver='saga', n_jobs=-1, random_state=42)

In [None]:
# Search space: L1/L2 penalty, seven values of C from 1e-3 to 1e3, with and
# without an intercept (28 combinations, 3-fold cross-validation).
params = dict(
    penalty=['l1', 'l2'],
    C=np.logspace(-3, 3, 7),
    fit_intercept=[True, False],
)

logreg_gs = GridSearchCV(logreg, params, cv=3, n_jobs=-1)

logreg_gs.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best logistic-regression candidate.
logreg_gs.best_score_

In [None]:
# Base support-vector classifier; kernel, C and gamma are set by the grid search below.
svm = SVC()

In [None]:
# Two sub-grids: RBF kernel (C x gamma) and linear kernel (C only).
# Every value in a parameter grid must be a list or array of candidates; a bare
# string such as 'rbf' is rejected by GridSearchCV, so single choices are
# wrapped in one-element lists.
params = [{'kernel': ['rbf'],
           'gamma': np.logspace(-2, 2, 5),
           'C': [0.01, 0.1, 1, 10, 100, 1000]},
          {'kernel': ['linear'],
           'C': [0.01, 0.1, 1, 10, 100, 1000]}]

svm_gs = GridSearchCV(estimator=svm, param_grid=params, n_jobs=-1, cv=3)

svm_gs.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
svm_gs.best_params_

In [None]:
# Mean cross-validated score of the best SVM candidate.
svm_gs.best_score_

In [None]:
# Random forests are randomised (bootstrap samples, feature subsets); a fixed
# seed makes the grid-search results repeatable.
forest = RandomForestClassifier(random_state=42)

In [None]:
# Candidate values for each random-forest hyper-parameter
# (5 x 5 x 5 x 4 = 500 combinations, each fitted on 3 folds).
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

hyperF = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

gridF = GridSearchCV(estimator=forest, param_grid=hyperF,
                     cv=3, verbose=1, n_jobs=-1)
# fit() returns the search object itself, so bestF is the same object as gridF.
bestF = gridF.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
bestF.best_params_

In [None]:
# Score of the grid search's best forest on the held-out test split.
bestF.score(X_test, y_test)

In [None]:
# Base XGBoost classifier (multi-class softmax objective) used by the grid search below.
xgb = XGBClassifier(objective='multi:softmax', n_jobs=-1)

In [None]:
# A parameter grid for XGBoost
# Parameter grid for XGBoost: 3 x 4 x 3 x 3 x 3 = 324 combinations.
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 2, 5],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'max_depth': [2, 3, 5]
        }

In [None]:
# 3-fold grid search over `params`. The name is shared with the earlier
# searches, so the XGBoost grid cell must be the last one run before this.
xgb_gs = GridSearchCV(estimator=xgb, param_grid=params, n_jobs=-1, cv=3)

In [None]:
# Run the XGBoost grid search on the training split.
xgb_gs.fit(X_train, y_train)

In [None]:
# Only a cell's last expression is displayed, so the four figures are gathered
# in one mapping instead of evaluating (and discarding) the first three.
{
    'best_params': xgb_gs.best_params_,
    'cv_score': xgb_gs.best_score_,
    'test_score': xgb_gs.score(X_test, y_test),
    'train_score': xgb_gs.score(X_train, y_train),
}

In [None]:
# Single model with fixed hyper-parameters; this rebinds `xgb`, which until now
# was the base estimator of the grid search.
# The arguments `silent`, `nthread`, `seed` and `missing` were all passed as
# None (i.e. "use the default") and are dropped: the first three are deprecated
# in favour of `verbosity`, `n_jobs` and `random_state`, which are already set
# here. Every remaining value is kept explicit so the model does not depend on
# the installed xgboost version's defaults.
xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                    colsample_bynode=1, colsample_bytree=0.8, gamma=2,
                    learning_rate=0.1, max_delta_step=0, max_depth=3,
                    min_child_weight=1, n_estimators=100, n_jobs=-1,
                    objective='multi:softmax', random_state=42,
                    reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                    subsample=1.0, verbosity=1)

In [None]:
# Fit the fixed-parameter model on the training split.
xgb.fit(X_train, y_train)

In [None]:
# Pair each feature name with the importance reported by the fitted model and
# list the 30 most important ones.
importance_columns = {
    'feature': X.columns,
    'importance': xgb.feature_importances_,
}
feature_importances = pd.DataFrame(importance_columns)

ranked_importances = feature_importances.sort_values('importance', ascending=False)
ranked_importances.head(30)

In [None]:
# Score of the fixed-parameter model on the held-out test split.
xgb.score(X_test, y_test)

In [None]:
# Score on the training split; compare with the test score to gauge overfitting.
xgb.score(X_train, y_train)