In [1]:
from collections import Counter
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from nltk import word_tokenize
from nltk.corpus import stopwords

from wordcloud import WordCloud

import unidecode

In [31]:
# Load the dataset from the CSV file located one directory above the notebook.
df=pd.read_csv("../df_annually.csv")

# Création Fonction Split Salary

In [32]:
def _parse_salary_amount(text):
    """Convert one side of a salary string (e.g. "30 000 € par an") to a float.

    The currency sign, the "par <period>" suffix and every whitespace
    character (ordinary or non-breaking) are removed, and a decimal comma is
    turned into a dot. Returns NaN when what is left is not a number, which
    matches the notebook's later ``pd.to_numeric(..., errors='coerce')``.
    """
    # Normalise all whitespace to single ordinary spaces first so that the
    # "par an" / "par mois" / ... suffixes match even when the source text
    # uses non-breaking spaces.
    cleaned = " ".join(text.split())
    for fragment in ("€", "par an", "par mois", "par semaine", "par jour", "par heure"):
        cleaned = cleaned.replace(fragment, "")
    cleaned = cleaned.replace(" ", "").replace(",", ".")
    try:
        return float(cleaned)
    except ValueError:
        return float("nan")


def split_salary(row, monthly_threshold=1500):
    """Derive ``salary_min``, ``salary_max`` and ``salary_period`` from ``row["Salary"]``.

    Parameters
    ----------
    row : mapping (pandas Series or dict)
        Must contain a string under the key ``"Salary"``. A range is written
        "<min> - <max>"; a single amount is used for both bounds.
    monthly_threshold : float, default 1500
        A monthly amount at or above this value is multiplied by 12 and
        labelled "year"; a smaller one is kept as-is and labelled "month".

    Returns
    -------
    The same ``row`` object, modified in place, with ``salary_min`` and
    ``salary_max`` as floats (NaN when not parseable) and ``salary_period``
    set to "year", "month", "week", "day" or "hour" when the matching French
    keyword is found in the salary text. When no keyword is found,
    ``salary_period`` is left untouched.
    """
    salary = row["Salary"]

    if "-" in salary:
        parts = salary.split("-")
        min_text, max_text = parts[0], parts[1]
    else:
        min_text = max_text = salary

    amount_min = _parse_salary_amount(min_text)
    amount_max = _parse_salary_amount(max_text)

    # The keyword checks are deliberately independent `if`s, evaluated in the
    # same order as before, so a later keyword still overrides an earlier one.
    if "an" in salary:
        row["salary_period"] = "year"

    if "mois" in salary:
        # `not >=` (rather than `<`) keeps an unparseable NaN amount labelled
        # "month" instead of annualising it.
        if not amount_min >= monthly_threshold:
            row["salary_period"] = "month"
        else:
            amount_min *= 12
            amount_max *= 12
            row["salary_period"] = "year"

    if "semaine" in salary:
        row["salary_period"] = "week"

    if "jour" in salary:
        row["salary_period"] = "day"

    if "heure" in salary:
        row["salary_period"] = "hour"

    row["salary_min"] = amount_min
    row["salary_max"] = amount_max
    return row

# Création des colonnes salary max et salary min

In [33]:
# French stop-word list from NLTK; passed to preprocessing_text further down.
stop_words = stopwords.words('french')

1. Mettre en minuscule
2. Remplacer les ponctuations (sauf '+') par des espaces : `[^\w|\s|+]` mais aussi les '|' et '\_' : `[_|\|]`
3. Remplacer les lettres accentuées par des lettres sans accents
4. Remplacer les lettres seules (sauf les lettres c et r (langages de programmation)) par des espaces : `\b[abd-qs-z]\b`
5. Remplacer les nombres qui ont 2 chiffres ou plus par des espaces : `\d{2,}`
6. Splitter la chaîne de caractères en une liste de mots
7. Créer une nouvelle liste sans les stopwords

In [34]:
def preprocessing_text(text, stopwords):
    """Normalise a free-text field and return its tokens without stop words.

    Steps, in order:
      1. lower-case the text;
      2. replace punctuation with spaces, keeping '+' (so "c++" survives);
      3. strip accents with ``unidecode``;
      4. blank out isolated single letters except 'c' and 'r';
      5. blank out numbers of two or more digits;
      6. tokenise with ``word_tokenize``;
      7. drop tokens found in ``stopwords``.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stopwords : iterable of str
        Words to discard. They are lower-cased and de-accented before the
        comparison, because the text has already been de-accented at that
        point; otherwise accented entries (e.g. "été", "même") could never
        match their de-accented tokens.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    text = text.lower()
    # Inside a character class '|' is a literal, so this first pass keeps word
    # characters, whitespace, '+' and '|'; the second pass removes '_' and '|'.
    text = re.sub(r'[^\w|\s|+]', ' ', text)
    text = re.sub(r'[_|\|]', ' ', text)
    text = unidecode.unidecode(text)
    text = re.sub(r'\b[abd-qs-z]\b', ' ', text)
    text = re.sub(r'\d{2,}', ' ', text)

    # Put the stop words in the same normal form as the text, in a set for
    # constant-time membership tests.
    normalized_stopwords = {unidecode.unidecode(word.lower()) for word in stopwords}

    return [word for word in word_tokenize(text) if word not in normalized_stopwords]

In [35]:
# Download the NLTK 'punkt' resource (presumably the tokenizer data that
# word_tokenize relies on — confirm). The call is skipped by NLTK when the
# package is already up to date.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/fabi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [36]:
# Keep only the offers that state a salary. The explicit copy makes df_salary
# an independent frame, so the column assignments below never act on a slice
# of `df`.
df_salary = df[df.Salary.notna()].copy()

# Tokenise the two free-text columns directly instead of routing every
# column of the frame through DataFrame.apply.
for text_column in ['Title', 'Description']:
    df_salary[text_column] = df_salary[text_column].apply(preprocessing_text, args=(stop_words,))

In [37]:
# Parse the salary text row by row, then force both bounds to numeric values;
# anything that cannot be converted becomes NaN.
df_salary = df_salary.apply(split_salary, axis=1)
for bound_column in ("salary_min", "salary_max"):
    df_salary[bound_column] = pd.to_numeric(df_salary[bound_column], errors='coerce')

In [38]:
# Summarise columns, non-null counts and dtypes after the salary parsing.
df_salary.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 675 entries, 0 to 674
Data columns (total 18 columns):
Unnamed: 0           675 non-null int64
_id                  675 non-null object
Title                675 non-null object
Company              675 non-null object
Location             675 non-null object
Salary               675 non-null object
Description          675 non-null object
Date                 546 non-null object
Job_Search           675 non-null object
Department_Search    675 non-null object
city                 675 non-null object
cp                   675 non-null object
salary_min           675 non-null float64
salary_max           675 non-null float64
salary_type          675 non-null object
salary_mean          675 non-null float64
salary_label         675 non-null int64
salary_period        675 non-null object
dtypes: float64(3), int64(2), object(13)
memory usage: 100.2+ KB


## Création colonne salary mean

In [39]:
# Midpoint of the advertised salary range.
df_salary["salary_mean"] = df_salary["salary_min"].add(df_salary["salary_max"]).div(2)

In [40]:
# Preview the first rows only rather than rendering the whole frame.
df_salary.head(10)

Unnamed: 0.1,Unnamed: 0,_id,Title,Company,Location,Salary,Description,Date,Job_Search,Department_Search,city,cp,salary_min,salary_max,salary_type,salary_mean,salary_label,salary_period
0,8,p_008d77a008590232,"[ingenieur, developpeur, c++, java]",ALTEN,Toulouse (31),30 000 € - 45 000 € par an,"[partenaire, technologique, reference, plus, g...",08/09/2019,Développeur,Haute-Garonne,Toulouse,31,30000.0,45000.0,annually,37500.0,0,year
1,15,p_00f2ad8db3c30b8e,[devops],Kent FR,Bordeaux (33),35 000 € - 40 000 € par an,"[kent, marque, groupagora, dediee, recrutement...",09/03/2019,Développeur,Gironde,Bordeaux,33,35000.0,40000.0,annually,37500.0,0,year
2,17,p_00fa03a21ad52a71,"[data, scientist]",QYSY,Paris (75),40 000 € - 65 000 € par an,"[qysy, cabinet, conseil, recrutementqui, propo...",29/09/2019,Data,75,Paris,75,40000.0,65000.0,annually,52500.0,3,year
3,19,p_0130bef769eef3cc,"[business, developer, btob, data, intelligence]",L.I.P,Lyon 3e (69),35 000 € - 60 000 € par an,"[lyon, 3e, arrondissement, cdi, bac+5, plus, i...",07/10/2019,Data,Rhône,Lyon,69,35000.0,60000.0,annually,47500.0,2,year
4,31,p_019d01b7494dc632,"[referent, fonctionnel, donnees, data, manager]",GARCIN FINANCES ET PARTICIPATIONS,Corbas (69),40 000 € - 45 000 € par an,"[souhaitez, participer, structuration, donnees...",10/09/2019,Data,Rhône,Corbas,69,40000.0,45000.0,annually,42500.0,1,year
5,44,p_02074799b0dbc42e,"[developpeur, php, laravel]",Sept Lieues,Paris (75),32 000 € - 37 000 € par an,"[pme, pres, ans, existence, gestion, situation...",08/04/2019,Développeur,75,Paris,75,32000.0,37000.0,annually,34500.0,0,year
6,49,p_02270c340e21e7ab,"[developpeur, windev]",Axemploi Recrutement,Nantes (44),30 000 € - 40 000 € par an,"[developpeur, windev, interviendrez, developpe...",18/09/2019,Développeur,Loire-Atlantique,Nantes,44,30000.0,40000.0,annually,35000.0,0,year
7,66,p_033110c9c03332f9,"[developpeur, java]",WOM,Lyon (69),30 000 € - 40 000 € par an,"[client, specialise, edition, logiciel, rse, r...",03/10/2019,Développeur,Rhône,Lyon,69,30000.0,40000.0,annually,35000.0,0,year
8,87,p_046189897fdb70ed,"[controleur, gestion, data, analyst]",SH Conseils,Paris 8e (75),35 000 € - 38 000 € par an,"[compte, operateur, telecoms, premier, plan, f...",18/09/2019,Data,75,Paris,75,35000.0,38000.0,annually,36500.0,0,year
9,105,p_05b4d08c39d1a13c,"[developpeur, mobile, senior, ios, swift]",EXTERNATIC,Bordeaux (33),40 000 € - 50 000 € par an,"[description, offre, externatic, hub, opportun...",07/09/2019,Développeur,Gironde,Bordeaux,33,40000.0,50000.0,annually,45000.0,1,year


# Je ne prends que les salaires qui sont 'par an' 

In [41]:
# Keep yearly salaries only. The copy makes the result an independent frame so
# that adding columns to it later does not raise SettingWithCopyWarning.
df_salary = df_salary[df_salary.salary_period == 'year'].copy()

## Quantiles salary min

In [16]:
# Descriptive statistics (count, mean, std, quartiles) of the lower salary bound.
df_salary.salary_min.describe()

count       85.000000
mean     40917.647059
std       9823.697981
min      28000.000000
25%      33000.000000
50%      38000.000000
75%      50000.000000
max      68000.000000
Name: salary_min, dtype: float64

## Quantiles salary max

In [None]:
# Descriptive statistics (count, mean, std, quartiles) of the upper salary bound.
df_salary.salary_max.describe()

## Quantiles salary mean

In [None]:
# Descriptive statistics (count, mean, std, quartiles) of the salary midpoint.
df_salary.salary_mean.describe()

In [None]:
# First tercile of the salary midpoint. nanquantile ignores NaN midpoints
# (left by the 'coerce' conversion) instead of returning NaN for the whole
# column, which would push every offer into the top class.
tercile_1 = np.nanquantile(df_salary.salary_mean, 1/3)
tercile_1

In [None]:
# Second tercile of the salary midpoint. nanquantile ignores NaN midpoints
# (left by the 'coerce' conversion) instead of returning NaN for the whole
# column.
tercile_2 = np.nanquantile(df_salary.salary_mean, 2/3)
tercile_2

# Création colonne class label par rapport aux quantiles salary mean

In [None]:
def classification(x):
    """Return the salary class of ``x``: 1, 2 or 3.

    Uses the notebook-level thresholds ``tercile_1`` and ``tercile_2``:
    1 when ``x <= tercile_1``, 2 when ``x <= tercile_2``, otherwise 3.
    """
    if x <= tercile_1:
        return 1
    if x <= tercile_2:
        return 2
    return 3

df_salary["salary_label"] = df_salary["salary_mean"].apply(classification)

In [None]:
# Check the freshly computed salary_label column on the first ten rows.
df_salary.head(10)

# Création des df par tranches label

In [None]:
# One sub-frame per salary class.
label_1 = df_salary.loc[df_salary["salary_label"].eq(1)]
label_2 = df_salary.loc[df_salary["salary_label"].eq(2)]
label_3 = df_salary.loc[df_salary["salary_label"].eq(3)]

In [None]:
# Share (in %) of each searched department across all yearly offers.
df_salary["Department_Search"].value_counts(normalize=True).mul(100)

In [None]:
# Share (in %) of each searched job across all yearly offers.
df_salary["Job_Search"].value_counts(normalize=True).mul(100)

In [None]:
# Share (in %) of each searched department within salary class 1.
label_1["Department_Search"].value_counts(normalize=True).mul(100)

In [None]:
# Share (in %) of each searched job within salary class 1.
label_1["Job_Search"].value_counts(normalize=True).mul(100)

In [None]:
# Share (in %) of each searched department within salary class 2.
label_2["Department_Search"].value_counts(normalize=True).mul(100)

In [None]:
# Share (in %) of each searched job within salary class 2.
label_2["Job_Search"].value_counts(normalize=True).mul(100)

In [None]:
# Share (in %) of each searched department within salary class 3.
label_3["Department_Search"].value_counts(normalize=True).mul(100)

In [None]:
# Share (in %) of each searched job within salary class 3.
label_3["Job_Search"].value_counts(normalize=True).mul(100)

In [None]:
# Tokenised titles of each salary class.
tag_title_1, tag_title_2, tag_title_3 = (
    class_frame["Title"] for class_frame in (label_1, label_2, label_3)
)

In [None]:
# Count how often each word appears in the titles of every salary class and
# list the (word, count) pairs from most to least frequent.
# A single Counter is fed every token once: this is linear in the number of
# tokens and also yields an empty list for a class without any offer, where
# summing per-row Counters would fail.

result_1 = Counter(word for title in tag_title_1 for word in title).most_common()

result_2 = Counter(word for title in tag_title_2 for word in title).most_common()

result_3 = Counter(word for title in tag_title_3 for word in title).most_common()

### Je crée le dictionnaire qui associe à chaque mot sa fréquence, pour chaque df

In [None]:
# Word -> frequency mappings, keeping the most-frequent-first order.
result_series_1 = {word: frequency for word, frequency in result_1}
result_series_2 = {word: frequency for word, frequency in result_2}
result_series_3 = {word: frequency for word, frequency in result_3}

In [None]:
# Inspect the word -> frequency mapping built for salary class 1.
result_series_1

# Visualisation du wordcloud title de chaque df

In [None]:
# One word cloud per salary class, limited to the 50 most frequent title words.
wordcloud_1, wordcloud_2, wordcloud_3 = (
    WordCloud(max_words=50).generate_from_frequencies(frequencies)
    for frequencies in (result_series_1, result_series_2, result_series_3)
)

In [None]:
# Stack the three title word clouds vertically, class 1 on top.
plt.figure(figsize=(25, 30))

for position, cloud in enumerate((wordcloud_1, wordcloud_2, wordcloud_3), start=1):
    plt.subplot(3, 1, position)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")

plt.show()

In [None]:
# Tokenised descriptions of each salary class.
tag_desc_1, tag_desc_2, tag_desc_3 = (
    class_frame["Description"] for class_frame in (label_1, label_2, label_3)
)

In [None]:
# Inspect the tokenised descriptions of salary class 3.
tag_desc_3

# création des dictionnaires qui à chaque mot associe sa frequence

In [None]:
# For every salary class: (word, count) pairs of the description tokens from
# most to least frequent, and the matching word -> count dictionary.
# A single Counter is fed every token once: this is linear in the number of
# tokens and also yields an empty result for a class without any offer, where
# summing per-row Counters would fail.
desc_1 = Counter(word for description in tag_desc_1 for word in description).most_common()
dict_desc_1 = dict(desc_1)

desc_2 = Counter(word for description in tag_desc_2 for word in description).most_common()
dict_desc_2 = dict(desc_2)

desc_3 = Counter(word for description in tag_desc_3 for word in description).most_common()
dict_desc_3 = dict(desc_3)

# Visualisation du wordcloud job_desc pour chaque df

In [None]:
# One word cloud per salary class, limited to the 50 most frequent description words.
wordcloud6 = WordCloud(max_words=50).generate_from_frequencies(dict_desc_1)
wordcloud7 = WordCloud(max_words=50).generate_from_frequencies(dict_desc_2)
wordcloud8 = WordCloud(max_words=50).generate_from_frequencies(dict_desc_3)

# Draw on the figure created here. Re-selecting "figure number 1" could target
# a different figure (with the default size) whenever another figure is open.
figure = plt.figure(figsize=(25, 30))
for position, cloud in enumerate((wordcloud6, wordcloud7, wordcloud8), start=1):
    axis = figure.add_subplot(3, 1, position)
    axis.imshow(cloud, interpolation="bilinear")
    axis.axis("off")

plt.show()

## prediction salaire avec job desc brut seulement

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Features: the description tokens re-joined into one space-separated string.
# Target: the salary class.
X_desc = df_salary["Description"].map(" ".join)
y = df_salary["salary_label"]

In [None]:
# Inspect the re-joined description strings.
X_desc

In [None]:
# TF-IDF over word n-grams of length 1 to 4, fitted on every description.
vectorizer = TfidfVectorizer(ngram_range=(1, 4)).fit(X_desc)
vectorizer

In [None]:
# Number of n-gram features learned from the descriptions. `vocabulary_` is
# used because `get_feature_names` no longer exists in recent scikit-learn.
len(vectorizer.vocabulary_)

In [None]:
# Dense TF-IDF matrix with one column per n-gram.
# - The column names come from `vocabulary_` (term -> column index), ordered
#   by column index, because `get_feature_names` no longer exists in recent
#   scikit-learn.
# - `toarray()` returns a plain ndarray rather than the discouraged np.matrix.
desc_vocabulary = vectorizer.vocabulary_
X_desc_trans = pd.DataFrame(
    vectorizer.transform(X_desc).toarray(),
    columns=sorted(desc_vocabulary, key=desc_vocabulary.get),
)
X_desc_trans

In [None]:
# Display the TF-IDF frame converted to a NumPy matrix.
# (numpy is re-imported here under the same alias `np`.)
import numpy as np
np.asmatrix(X_desc_trans)

In [None]:
# Column-wise total of the TF-IDF weights; show the 20 largest totals.
word_counts = X_desc_trans.sum()
word_counts.sort_values(ascending=False).iloc[:20]

In [None]:
# Stratified train/test split of the description features.
# - random_state makes the split, and the scores below, reproducible; it
#   matches the seed used for the title model.
# - `.values` passes a plain ndarray; recent scikit-learn estimators reject
#   np.matrix inputs.
X_train, X_test, y_train, y_test = train_test_split(
    X_desc_trans.values, y, random_state=42, stratify=y
)

In [None]:
# Small random forest (5 trees) on the description features; the cell shows
# its accuracy on the test split.
rfc = RandomForestClassifier(n_estimators=5, random_state=42).fit(X_train, y_train)
rfc.score(X_test, y_test)

In [None]:
# Accuracy on the training split, to compare against the test accuracy.
rfc.score(X_train, y_train)

In [None]:
# Pair every n-gram with its importance in the fitted forest and list the 30
# most important ones.
feature_importances = pd.DataFrame({
    'feature': X_desc_trans.columns,
    'importance': rfc.feature_importances_,
})

feature_importances.sort_values('importance', ascending=False).head(30)

# Prédiction salaire avec Title seulement

In [None]:
# Features: the title tokens re-joined into one space-separated string.
# Target: the salary class.
X_title = df_salary["Title"].map(" ".join)
y = df_salary["salary_label"]

In [None]:
# TF-IDF over word n-grams of length 1 to 4, fitted on every title.
vectorizer_2 = TfidfVectorizer(ngram_range=(1, 4)).fit(X_title)
vectorizer_2

In [None]:
# Number of n-gram features learned from the titles. `vocabulary_` is used
# because `get_feature_names` no longer exists in recent scikit-learn.
len(vectorizer_2.vocabulary_)

In [None]:
# Dense TF-IDF matrix with one column per n-gram.
# - The column names come from `vocabulary_` (term -> column index), ordered
#   by column index, because `get_feature_names` no longer exists in recent
#   scikit-learn.
# - `toarray()` returns a plain ndarray rather than the discouraged np.matrix.
title_vocabulary = vectorizer_2.vocabulary_
X_title_trans = pd.DataFrame(
    vectorizer_2.transform(X_title).toarray(),
    columns=sorted(title_vocabulary, key=title_vocabulary.get),
)
X_title_trans

In [None]:
# Column-wise total of the TF-IDF weights; show the 20 largest totals.
word_counts2 = X_title_trans.sum()
word_counts2.sort_values(ascending=False).iloc[:20]

In [None]:
# Stratified, seeded train/test split of the title features. `.values` passes
# a plain ndarray; recent scikit-learn estimators reject np.matrix inputs.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_title_trans.values, y, random_state=42, stratify=y
)

In [None]:
# Small random forest (7 trees) on the title features; the cell shows its
# accuracy on the test split. Note that `rfc` now refers to this title model.
rfc = RandomForestClassifier(n_estimators=7, random_state=42).fit(X_train2, y_train2)
rfc.score(X_test2, y_test2)

In [None]:
# Accuracy on the training split, to compare against the test accuracy.
rfc.score(X_train2,y_train2)

In [None]:
# Pair every title n-gram with its importance in the fitted forest and list
# the 20 most important ones.
feature_importances = pd.DataFrame({
    'feature': X_title_trans.columns,
    'importance': rfc.feature_importances_,
})

feature_importances.sort_values('importance', ascending=False).head(20)