In [None]:
from collections import Counter
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from nltk import word_tokenize
from nltk.corpus import stopwords

from wordcloud import WordCloud

import unidecode

In [None]:
# Load the raw postings table; later cells read its Salary, Title,
# Description, Department_Search and Job_Search columns.
df = pd.read_csv("salary_indeed.csv")

# Création Fonction Split Salary

In [None]:
def _parse_amount(raw):
    """Turn one side of a salary string into a float, or NaN if it is not a number."""
    cleaned = raw
    for token in ("€", "par an", "par mois", "par semaine", "par jour", "par heure"):
        cleaned = cleaned.replace(token, "")
    # Drop every whitespace character (thousands separators, including the
    # non-breaking space "\xa0") and accept the French decimal comma.
    cleaned = "".join(cleaned.split()).replace(",", ".")
    try:
        return float(cleaned)
    except ValueError:
        return float("nan")


def split_salary(row, monthly_threshold=1500):
    """Split the free-text ``Salary`` field of a row into numeric bounds and a period.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series or a dict)
        Must contain the key ``"Salary"``. It is updated in place.
    monthly_threshold : float, default 1500
        Monthly salaries whose lower bound is at or above this value are
        converted to a yearly amount (multiplied by 12) and labelled "year";
        lower monthly salaries are left as-is and labelled "month".

    Returns
    -------
    The same ``row`` with three keys set:
    ``salary_min`` and ``salary_max`` (floats, NaN when the text cannot be
    parsed) and ``salary_period`` ("year", "month", "week", "day", "hour",
    or None when no period keyword is found).
    """
    salary = row["Salary"]

    # Missing / non-text salaries cannot be parsed: emit empty values
    # instead of raising on the substring checks below.
    if not isinstance(salary, str):
        row["salary_min"] = float("nan")
        row["salary_max"] = float("nan")
        row["salary_period"] = None
        return row

    # "<min> - <max> <period>" is a range; a single figure is both bounds.
    if "-" in salary:
        parts = salary.split("-")
        raw_min, raw_max = parts[0], parts[1]
    else:
        raw_min = raw_max = salary

    low = _parse_amount(raw_min)
    high = _parse_amount(raw_max)

    # The checks are independent (not elif) so that, as before, the last
    # matching keyword decides the period.
    period = None
    if "an" in salary:
        period = "year"
    if "mois" in salary:
        if low >= monthly_threshold:
            low, high = low * 12, high * 12
            period = "year"
        else:
            # Also reached when the amount is NaN: an unparseable monthly
            # salary is not promoted to a yearly one.
            period = "month"
    if "semaine" in salary:
        period = "week"
    if "jour" in salary:
        period = "day"
    if "heure" in salary:
        period = "hour"

    row["salary_min"] = low
    row["salary_max"] = high
    row["salary_period"] = period
    return row

# Création des colonnes salary max et salary min

In [None]:
# French stop-word list from NLTK; passed to preprocessing_text below to
# filter tokens out of the Title and Description columns.
stop_words = stopwords.words('french')

1. Mettre en minuscule
2. Remplacer les ponctuations (sauf '+') par des espaces : `[^\w|\s|+]` mais aussi les '|' et '\_' : `[_|\|]`
3. Remplacer les lettres accentuées par des lettres sans accents
4. Remplacer les lettres seules (sauf les lettres c et r (langages de programmation)) par des espaces : `\b[abd-qs-z]\b`
5. Remplacer les nombres qui ont 2 chiffres ou plus par des espaces : `\d{2,}`
6. Splitter la chaîne de caractères en une liste de mots
7. Créer une nouvelle liste sans les stopwords

In [None]:
def preprocessing_text(text, stopwords):
    """Normalise a raw text and return its tokens without stop words.

    Steps: lowercase; replace punctuation (except '+'), '|' and '_' with
    spaces; strip accents; blank out isolated letters other than 'c' and 'r';
    blank out numbers of two or more digits; tokenise; drop stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stopwords : iterable of str
        Words to remove. They are lowercased and unaccented before the
        comparison, because the text itself has already been unaccented:
        otherwise accented stop words could never match a token.

    Returns
    -------
    list of str
        The remaining tokens, in order of appearance.
    """
    text = text.lower()
    text = re.sub(r'[^\w|\s|+]', ' ', text)
    text = re.sub(r'[_|\|]', ' ', text)
    text = unidecode.unidecode(text)
    text = re.sub(r'\b[abd-qs-z]\b', ' ', text)
    text = re.sub(r'\d{2,}', ' ', text)

    # Normalise the stop words the same way as the text and use a set for
    # constant-time membership tests.
    normalized_stopwords = {unidecode.unidecode(word.lower()) for word in stopwords}

    return [token for token in word_tokenize(text) if token not in normalized_stopwords]

In [None]:
# Keep only the postings that state a salary.
df_salary = df.loc[df["Salary"].notna()]

# Tokenise the two free-text columns; every other column passes through unchanged.
df_salary = df_salary.apply(
    lambda column: column.map(lambda text: preprocessing_text(text, stop_words))
    if column.name in ('Title', 'Description')
    else column
)

In [None]:
# Parse the Salary text row by row, then force both bounds to numeric;
# anything that still is not a number becomes NaN.
df_salary = df_salary.apply(split_salary, axis=1)
df_salary[["salary_min", "salary_max"]] = df_salary[["salary_min", "salary_max"]].apply(
    pd.to_numeric, errors="coerce"
)

In [None]:
# Column dtypes and non-null counts after the salary parsing.
df_salary.info()

## Création colonne salary mean

In [None]:
# Midpoint of the advertised range (NaN if either bound is NaN).
df_salary["salary_mean"] = df_salary["salary_min"].add(df_salary["salary_max"]).div(2)

In [None]:
# Display the frame to check the new salary_min / salary_max / salary_mean columns.
df_salary

# Je ne prends que les salaires qui sont 'par an' 

In [None]:
# Keep yearly salaries only. Rows whose amount could not be parsed
# (salary_mean is NaN after the 'coerce' conversion) are dropped as well,
# otherwise the quantiles computed below would be NaN. The explicit copy
# lets later cells add columns without a SettingWithCopyWarning.
df_salary = (
    df_salary[df_salary.salary_period == 'year']
    .dropna(subset=["salary_mean"])
    .copy()
)

## Quantiles salary min

In [None]:
# Summary statistics of the lower salary bound.
df_salary["salary_min"].describe()

## Quantiles salary max

In [None]:
# Summary statistics of the upper salary bound.
df_salary["salary_max"].describe()

## Quantiles salary mean

In [None]:
# Summary statistics of the range midpoint.
df_salary["salary_mean"].describe()

In [None]:
# First tercile of the mean salary. nanquantile ignores NaN values, whereas
# np.quantile would return NaN as soon as one unparsed salary is present.
tercile_1 = np.nanquantile(df_salary.salary_mean, 1/3)
tercile_1

In [None]:
# Second tercile of the mean salary. nanquantile ignores NaN values, whereas
# np.quantile would return NaN as soon as one unparsed salary is present.
tercile_2 = np.nanquantile(df_salary.salary_mean, 2/3)
tercile_2

# Création colonne class label par rapport aux quantiles salary mean

In [None]:
def classification(x):
    """Return the salary class of ``x``.

    1 if ``x`` is at most ``tercile_1``, 2 if it is at most ``tercile_2``,
    3 otherwise. Both thresholds are read from the notebook's global scope.
    """
    if x <= tercile_1:
        return 1
    if x <= tercile_2:
        return 2
    return 3

# Label every posting with its salary class (1, 2 or 3).
df_salary["salary_label"] = df_salary["salary_mean"].map(classification)

In [None]:
# Preview the first rows with their salary_label.
df_salary.head(10)

# Création des df par tranches label

In [None]:
# One sub-frame per salary class.
label_1, label_2, label_3 = (
    df_salary.loc[df_salary["salary_label"] == tier] for tier in (1, 2, 3)
)

In [None]:
# Share (in %) of each Department_Search value over all yearly postings.
df_salary["Department_Search"].value_counts(normalize=True).mul(100)

In [None]:
# Share (in %) of each Job_Search value over all yearly postings.
df_salary["Job_Search"].value_counts(normalize=True).mul(100)

In [None]:
# Share (in %) of each Department_Search value within salary class 1.
label_1["Department_Search"].value_counts(normalize=True).mul(100)

In [None]:
# Share (in %) of each Job_Search value within salary class 1.
label_1["Job_Search"].value_counts(normalize=True).mul(100)

In [None]:
# Share (in %) of each Department_Search value within salary class 2.
label_2["Department_Search"].value_counts(normalize=True).mul(100)

In [None]:
# Share (in %) of each Job_Search value within salary class 2.
label_2["Job_Search"].value_counts(normalize=True).mul(100)

In [None]:
# Share (in %) of each Department_Search value within salary class 3.
label_3["Department_Search"].value_counts(normalize=True).mul(100)

In [None]:
# Share (in %) of each Job_Search value within salary class 3.
label_3["Job_Search"].value_counts(normalize=True).mul(100)

In [None]:
# Tokenised titles of each salary class.
tag_title_1, tag_title_2, tag_title_3 = (
    subset["Title"] for subset in (label_1, label_2, label_3)
)

In [None]:
# For each salary class, build the list of (word, count) pairs found in the
# tokenised titles, ordered from the most to the least frequent word.
# Counting in a single pass avoids adding one Counter per row (which copies
# the running total at every step) and also works when a class is empty,
# where summing an empty Series would return 0 instead of a Counter.

result_1 = Counter(word for tokens in tag_title_1 for word in tokens).most_common()

result_2 = Counter(word for tokens in tag_title_2 for word in tokens).most_common()

result_3 = Counter(word for tokens in tag_title_3 for word in tokens).most_common()

### Je crée le dictionnaire qui associe à chaque mot sa fréquence, pour chaque df

In [None]:
# Word -> frequency mapping for each salary class (keeps the sorted order).
result_series_1 = {word: count for word, count in result_1}
result_series_2 = {word: count for word, count in result_2}
result_series_3 = {word: count for word, count in result_3}

In [None]:
# Inspect the title word frequencies of salary class 1.
result_series_1

# Visualisation du wordcloud title de chaque df

In [None]:
# One word cloud (50 most frequent title words) per salary class.
wordcloud_1, wordcloud_2, wordcloud_3 = (
    WordCloud(max_words=50).generate_from_frequencies(frequencies)
    for frequencies in (result_series_1, result_series_2, result_series_3)
)

In [None]:
# Stack the three title word clouds vertically, class 1 on top.
plt.figure(figsize=(25, 30))

for position, cloud in enumerate((wordcloud_1, wordcloud_2, wordcloud_3), start=1):
    plt.subplot(3, 1, position)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")

plt.show()

In [None]:
# Tokenised descriptions of each salary class.
tag_desc_1, tag_desc_2, tag_desc_3 = (
    subset["Description"] for subset in (label_1, label_2, label_3)
)

In [None]:
# Inspect the tokenised descriptions of salary class 3.
tag_desc_3

# Création des dictionnaires qui associent à chaque mot sa fréquence

In [None]:
# For each salary class: (word, count) pairs of the tokenised descriptions,
# most frequent first, plus the matching word -> count dictionary.
# Counting in a single pass avoids adding one Counter per row and also works
# when a class is empty (summing an empty Series would return 0).
desc_1 = Counter(word for tokens in tag_desc_1 for word in tokens).most_common()
dict_desc_1 = dict(desc_1)

desc_2 = Counter(word for tokens in tag_desc_2 for word in tokens).most_common()
dict_desc_2 = dict(desc_2)

desc_3 = Counter(word for tokens in tag_desc_3 for word in tokens).most_common()
dict_desc_3 = dict(desc_3)

# Visualisation du wordcloud job_desc pour chaque df

In [None]:
# One word cloud (50 most frequent description words) per salary class.
wordcloud6 = WordCloud(max_words=50).generate_from_frequencies(dict_desc_1)
wordcloud7 = WordCloud(max_words=50).generate_from_frequencies(dict_desc_2)
wordcloud8 = WordCloud(max_words=50).generate_from_frequencies(dict_desc_3)

# Draw on the axes of the figure created here. The previous plt.figure(1)
# call re-selected figure number 1, which is only this figure when every
# earlier figure has already been closed.
figure, axes = plt.subplots(3, 1, figsize=(25, 30))
for ax, cloud in zip(axes, (wordcloud6, wordcloud7, wordcloud8)):
    ax.imshow(cloud, interpolation="bilinear")
    ax.axis("off")

plt.show()

## Prédiction du salaire avec la description du poste (job desc) brute seulement

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Re-join the description tokens into one string per posting; the target is the salary class.
X_desc = df_salary["Description"].map(" ".join)
y = df_salary["salary_label"]

In [None]:
# Inspect the re-joined description strings.
X_desc

In [None]:
# TF-IDF over word n-grams of length 1 to 4, fitted on the descriptions.
vectorizer = TfidfVectorizer(ngram_range=(1, 4))
vectorizer.fit(raw_documents=X_desc)

In [None]:
# Number of n-gram features learned by the vectorizer. vocabulary_ maps each
# term to its column index, so its size is the feature count; unlike
# get_feature_names() (removed in scikit-learn 1.2) it exists in every version.
len(vectorizer.vocabulary_)

In [None]:
# Dense TF-IDF table: one row per posting, one column per n-gram.
# - toarray() returns a plain ndarray (todense() returns the discouraged np.matrix).
# - Column names are the vocabulary terms ordered by their column index; this
#   matches what get_feature_names() returned, but that method was removed in
#   scikit-learn 1.2.
X_desc_trans = pd.DataFrame(
    vectorizer.transform(X_desc).toarray(),
    columns=sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get),
)
X_desc_trans

In [None]:
import numpy as np
# Show the TF-IDF values as a plain ndarray (np.matrix is discouraged by NumPy).
np.asarray(X_desc_trans)

In [None]:
# Total TF-IDF weight of each n-gram over all descriptions; show the 20 heaviest.
word_counts = X_desc_trans.sum(axis=0)
word_counts.sort_values(ascending=False).iloc[:20]

In [None]:
# Stratified train/test split on the description features.
# - A plain ndarray is passed: scikit-learn >= 1.2 estimators reject np.matrix.
# - random_state is fixed (same seed as the title split) so the scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_desc_trans.to_numpy(), y, random_state=42, stratify=y
)

In [None]:
# Small random forest (5 trees) on the description features; show test accuracy.
rfc = RandomForestClassifier(n_estimators=5, random_state=42).fit(X_train, y_train)

rfc.score(X_test, y_test)

In [None]:
# Accuracy on the training set, to compare with the test accuracy above.
rfc.score(X_train, y_train)

In [None]:
# Pair every n-gram with its importance in the forest; show the top 30.
feature_importances = pd.DataFrame({
    'feature': X_desc_trans.columns,
    'importance': rfc.feature_importances_,
})

feature_importances.sort_values(by='importance', ascending=False).head(30)

# Prédiction salaire avec Title seulement

In [None]:
# Re-join the title tokens into one string per posting; the target is the salary class.
X_title = df_salary["Title"].map(" ".join)
y = df_salary["salary_label"]

In [None]:
# TF-IDF over word n-grams of length 1 to 4, fitted on the titles.
vectorizer_2 = TfidfVectorizer(ngram_range=(1, 4))
vectorizer_2.fit(raw_documents=X_title)

In [None]:
# Number of n-gram features learned from the titles. vocabulary_ maps each
# term to its column index, so its size is the feature count; unlike
# get_feature_names() (removed in scikit-learn 1.2) it exists in every version.
len(vectorizer_2.vocabulary_)

In [None]:
# Dense TF-IDF table: one row per posting, one column per title n-gram.
# - toarray() returns a plain ndarray (todense() returns the discouraged np.matrix).
# - Column names are the vocabulary terms ordered by their column index; this
#   matches what get_feature_names() returned, but that method was removed in
#   scikit-learn 1.2.
X_title_trans = pd.DataFrame(
    vectorizer_2.transform(X_title).toarray(),
    columns=sorted(vectorizer_2.vocabulary_, key=vectorizer_2.vocabulary_.get),
)
X_title_trans

In [None]:
# Total TF-IDF weight of each n-gram over all titles; show the 20 heaviest.
word_counts2 = X_title_trans.sum(axis=0)
word_counts2.sort_values(ascending=False).iloc[:20]

In [None]:
# Stratified, seeded train/test split on the title features. A plain ndarray
# is passed because scikit-learn >= 1.2 estimators reject np.matrix.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_title_trans.to_numpy(), y, random_state=42, stratify=y
)

In [None]:
# Small random forest (7 trees) on the title features; show test accuracy.
# Note: this rebinds `rfc`, replacing the description model.
rfc = RandomForestClassifier(n_estimators=7, random_state=42).fit(X_train2, y_train2)

rfc.score(X_test2, y_test2)

In [None]:
# Accuracy on the training set, to compare with the test accuracy above.
rfc.score(X_train2,y_train2)

In [None]:
# Pair every title n-gram with its importance in the forest; show the top 20.
feature_importances = pd.DataFrame({
    'feature': X_title_trans.columns,
    'importance': rfc.feature_importances_,
})

feature_importances.sort_values(by='importance', ascending=False).head(20)