# LIBRAIRIES

In [1]:
import datetime
import re
import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException

from bs4 import BeautifulSoup
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

import unidecode

In [2]:
df_salary=pd.read_csv("/Users/fabi/Documents/Data AI Ecouen/PROJECTS/salary_indeed.csv")

In [3]:
df_salary.shape

(4884, 9)

In [4]:
df_salary.head()

Unnamed: 0,_id,Title,Company,Location,Salary,Description,Date,Job_Search,Department_Search
0,p_00198c04b7c86195,Développeuse / Développeur Stack Elastic Search,LexisNexis,Paris (75),,Développeur Sénior Java Elastic Search Respon...,07/09/2019,Développeur,75
1,p_0020213df500f7d8,Développeur Front End F/H,KP2I,Paris 1er (75),,Dans le cadre de la refonte du site e-commerce...,01/10/2019,Développeur,75
2,p_0030c303451b137b,Data Analyst (H/F) - Alternance,Nexity,Paris 8e (75),,Job Description Dans le cadre de son programm...,24/09/2019,Data,75
3,p_003ef0f6960ff453,"INTERNSHIP POSITION, Evaluation of the aircraf...",Airbus,Toulouse (31),,AIRBUS SAS Airbus is a global leader in aerona...,25/09/2019,Data,Haute-Garonne
4,p_005290833c76d0fe,Des Développeurs Java JEE (H/F),FrançAsie,Nantes (44),,FrançAsie recrute pour notre client une Sociét...,07/09/2019,Développeur,Loire-Atlantique


# PRE-PROCESSING

### DATA RECOVERY

#### EXPERIENCES

In [1]:
#import re
#import numpy as np
#import pandas as pd
#from nltk import word_tokenize
#from nltk.corpus import stopwords
#import unidecode

#stopwords
stop_words = stopwords.words('french')

def preprocessing_xp(text, stopwords):
    text = text.lower()
    text = re.sub(r'bac\s?\+\s?\d\/?\d?', ' ', text)
    text = re.sub(r'[^\w|\s|+]', ' ', text) #take off punctuation sign, keep space and plus sign
    text = re.sub(r'[_|\|]', ' ', text) #take off _ and |
    text = unidecode.unidecode(text) #take off accent on letters
    text = re.sub(r'\b[abd-qs-z]\b', ' ', text) #for single letters : keep only C and R (programming languages)
    text = re.sub(r'\d{3,}|[2-9][0-9]|\b0+\b', ' ', text) #take off number from 3 character (100 and more), take off number between 20-99, and all number with only 0 (000)
    
    # STOPWORDS
    tokenized_words = word_tokenize(text) #put words in a list without space
    tokenized_words = [word for word in tokenized_words if word not in stopwords] 

    return " ".join(tokenized_words) #put the list back as phrases

#call preprocess skills functions
df_salary["description_clean"] = df_salary.Description.apply(preprocessing_xp, args=(stop_words,))

#keep 3 words before "annee(s)" --- annee(s) experience(s)
df_salary["experience_left"] = df_salary.description_clean\
                                    .apply(lambda x: re.findall(r'((?:\S+\s+){0,3}\ban(?:nee)?s?\b experience)', x))

#keep everything between : experience(s) annee(s), limit 150 characters
df_salary["experience_right"] = df_salary.description_clean\
                                    .apply(lambda x: re.findall(r'experience.{,150}?\ban(?:nee)?s?\b', x))

#create list of numbers (experience_left)
df_salary["experience_left_list"] = df_salary.experience_left\
                                        .apply(lambda x: re.findall(r'\b\d+\b', repr(x)))

#create list of numbers (experience_right)
df_salary["experience_right_list"] = df_salary.experience_right\
                                            .apply(lambda x: re.findall(r'\b\d+\b', repr(x)))

#link left-right columns
df_salary["experience_total"] = (df_salary["experience_left_list"]
                                        + df_salary["experience_right_list"])

#b convert column in float and if empty put NaN in
df_salary["experience_clean"] = df_salary.experience_total\
                                    .apply(lambda y: np.array(y).astype(np.float)
                                           if len(y)>0 else np.nan)

# create min and max experiences columns
df_salary["experience_min"] = df_salary["experience_clean"].apply(np.amin)
df_salary["experience_max"] = df_salary["experience_clean"].apply(np.amax)

df_salary = df_salary.drop(['description_clean', 'experience_left',
                                    'experience_right', 'experience_left_list',
                                    'experience_right_list', 'experience_total',
                                    'experience_clean'], axis=1)

NameError: name 'stopwords' is not defined

#### QUALIFICATIONS

In [6]:
df_salary["BTS"] = df_salary.Description.str.lower().str.contains(r'\bbts\b', regex=True).astype(int)

df_salary["Deug"] = df_salary.Description.str.lower().str.contains(r'\bdeug\b|\bniveaux?iii+\b|\bbac[\s\S]?[+]?[\s\S]2\b', regex=True).astype(int)

df_salary["DUT"] = df_salary.Description.str.lower().str.contains(r'\bdut\b', regex=True).astype(int)

df_salary["IUT"] =df_salary.Description.str.lower().str.contains(r'\biut\b', regex=True).astype(int)

df_salary["Licence"] = df_salary.Description.str.lower().str.contains(r'\blicence\b|\bniveaux?ii+\b|\bbac[\s\S]?[+]?[\s\S]3\b|\b\(?bac\s?\+\s?2\s?[\/àeo][tu]?\s?(bac)?\s?\+?\s?5\)?\b|(\b\(?bac\s?\+\s?2\s?[\/àeo][tu]?\s?(bac)?\s?\+?\s?3\)?\b)', regex=True).astype(int)

df_salary["Master"] = df_salary.Description.str.lower().str.contains(r'\bmaster\b|\bniveaux?i+\b|\bbac[\s\S]?[+]?[\s\S]5\b|\bmba\b|(\bbac\s?\+\s?4\s?[\/à]\s?(bac)?\+?5\b)|(\bbac\s?\+\s?4\s?(ou)\s?(bac)?\s?\+?\s?5\b)', regex=True).astype(int)

df_salary["Doctorat"] = df_salary.Description.str.lower().str.contains(r'\bdoctorat\b|\bbac[\s\S]?[+]?[\s\S]8\b', regex=True).astype(int)

df_salary["Ecole_Ingénieure"] = df_salary.Description.str.lower().str.contains(r'\b[eé]cole\sing[ée]nieur[e]?\b|\bing[ée]nieur[e]\b', regex=True).astype(int)

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


#### CONTRACTS

In [7]:
#import pandas as pd
#import numpy as np
#import matplotlib.pyplot as plt
#import nltk
#from nltk import word_tokenize
#from nltk.corpus import stopwords
#import re

def type_de_contrat(row):
    title = row["Title"].lower()
    desc = row["Description"].lower()
    #search for CDI in title and desc
    if bool(re.search('cdi',title)):
        row["cdi"]=1
    elif bool(re.search('cdi',desc)) and not bool(re.search('stage',title)):
        row["cdi"]=1
    else:
        row["cdi"]=0
    #search for stage in title and desc
    if bool(re.search(r'stage|intern',title)):
        row["stage"]=1 
    else:
        row["stage"]=0
    #search for freelance in title and desc
    if bool(re.search('freelance',title)):
        row["freelance"]=1
    elif bool(re.search(r'type\sd(\\?\'emploi\s?|e\scontrat):?.{0,26}freelance',desc)) and not bool(re.search('freelance',title)):
        row["freelance"]=1
    else:
        row["freelance"]=0
    #search for CDD in title and desc
    if bool(re.search('cdd',title)):
        row["cdd"]=1
    elif bool(re.search(r'type\sd(\\?\'emploi\s?|e\scontrat):?.{0,26}cdd',desc)) and not bool(re.search('cdd',title)):
        row["cdd"]=1 
    else:
        row["cdd"]=0
        
    return row

#apply function on dataframe
df_salary= df_salary.apply(type_de_contrat, axis=1)

#get no restriction on table vizualization on jupyter notebook
pd.set_option('display.max_columns', None)

#### COMPETENCES & KEYS WORDS

1. Mettre en minuscule
2. Remplacer les ponctuations (sauf '+') par des espaces : `[^\w|\s|+]` mais aussi les '|' et '\_' : `[_|\|]`
3. Remplacer les lettres accentuées par des lettres sans accents
4. Remplacer les lettres seules (sauf les lettres c et r (langages de programmation)) par des espaces : `\b[abd-qs-z]\b`
5. Remplacer les nombres qui ont 2 chiffres ou plus par des espaces : `\d{2,}`
6. Splitter la chaîne de caractères en une liste de mots
7. Créer une nouvelle liste sans les stopwords
8. Retourne la liste de mots en chaîne de caractères

In [8]:
def preprocessing_skills(text, stopwords, prefix=''):
    if prefix:
        prefix = prefix.lower() + '_'

    text = text.lower()
    text = re.sub(r'[^\w|\s|+]', ' ', text)
    text = re.sub(r'[_|\|]', ' ', text)
    text = unidecode.unidecode(text)
    text = re.sub(r'\b[abd-qs-z]\b', ' ', text)
    text = re.sub(r'\d{2,}', ' ', text)

    # STOPWORDS
    tokenized_words = word_tokenize(text)
    tokenized_words = [prefix + word for word in tokenized_words if word not in stopwords]

    return " ".join(tokenized_words)

#call preprocess skills functions
df_salary["description_clean"] = df_salary.Description.apply(preprocessing_skills, args=(stop_words,))


df_salary['Python'] =  df_salary.description_clean.str.contains(r'python', regex=True).astype(int)
df_salary['Java'] = df_salary.description_clean.str.contains(r'java', regex=True).astype(int)
df_salary['Machine Learning'] = df_salary.description_clean.str.contains(r'machine learning|scikit learn|sklearn', regex=True).astype(int)
df_salary['Deep Learning'] = df_salary.description_clean.str.contains(r'deep learning|keras', regex=True).astype(int)
df_salary['Javascript'] = df_salary.description_clean.str.contains(r'javascript|angular|react|js|jquery', regex=True).astype(int)
df_salary['Swift'] = df_salary.description_clean.str.contains(r'swift|ios', regex=True).astype(int)
df_salary['NoSql'] = df_salary.description_clean.str.contains(r'mongodb|no sql|nosql', regex=True).astype(int)
df_salary['SQL'] = df_salary.description_clean.str.contains(r'\bsql\b|mysql|postgresql', regex=True).astype(int)
df_salary['Agile'] = df_salary.description_clean.str.contains(r'agile|scrum', regex=True).astype(int)
df_salary['J2ee'] = df_salary.description_clean.str.contains(r'j2ee', regex=True).astype(int)
df_salary['Jee'] = df_salary.description_clean.str.contains(r'jee', regex=True).astype(int)
df_salary['Ruby'] = df_salary.description_clean.str.contains(r'ruby|rails', regex=True).astype(int)

df_salary['HTML_CSS'] = df_salary.description_clean.str.contains(r'html|css', regex=True).astype(int)

df_salary['Php'] = df_salary.description_clean.str.contains(r'php|symfony', regex=True).astype(int)
df_salary['Spark'] = df_salary.description_clean.str.contains(r'spark', regex=True).astype(int)
df_salary['Big_data'] = df_salary.description_clean.str.contains(r'bigdata|big data', regex=True).astype(int)
df_salary['Scala'] = df_salary.description_clean.str.contains(r'scala', regex=True).astype(int)
df_salary['Back_end'] = df_salary.description_clean.str.contains(r'back end|backend', regex=True).astype(int)
df_salary['Git'] = df_salary.description_clean.str.contains(r'git|github', regex=True).astype(int)
df_salary['R'] = df_salary.description_clean.str.contains(r'\br\b', regex=True).astype(int)
df_salary['Powerbi'] = df_salary.description_clean.str.contains(r'powerbi|power bi', regex=True).astype(int)
df_salary['Cloud'] = df_salary.description_clean.str.contains(r'googlecloud|google cloud|aws|azure|\bcloud\b', regex=True).astype(int)
df_salary['C'] = df_salary.description_clean.str.contains(r'\bc\b|\bc+\b', regex=True).astype(int)
df_salary['Docker'] = df_salary.description_clean.str.contains(r'\bdocker\b', regex=True).astype(int)
df_salary['ABAP'] = df_salary.description_clean.str.contains(r'\bsap\b|\babap\b', regex=True).astype(int)
df_salary['WLang'] = df_salary.description_clean.str.contains(r'windev|webdev', regex=True).astype(int)

In [9]:
df_salary.shape

(4884, 50)

### DATA ORGANIZATION

In [10]:
#import pandas as pd
#import numpy as np
#from yellowbrick.cluster import KElbowVisualizer
#from sklearn.cluster import KMeans
#from sklearn.cluster import DBSCAN
#from sklearn.preprocessing import StandardScaler
#from sklearn.model_selection import train_test_split
#from sklearn.decomposition import PCA
#from sklearn.model_selection import GridSearchCV
#import nltk
#from nltk import word_tokenize
#from nltk.corpus import stopwords
#import matplotlib.pyplot as plt

In [11]:
df_salary.shape

(4884, 50)

### Localisation

#### Split_city function creation

On sépare la ville et le cp. On remplace les differents arrondisements de Paris et Lyon par la ville. On remplis les valeurs manquantes par le départment de recherche

In [12]:
def split_city(row):
    location = row["Location"]
    if "Paris" in location or location == "France":
        city = "Paris"
        code_postal = "75"
    elif "Lyon" in location:
        city = "Lyon"
        code_postal = "69"
    elif "(" in location:
        split = location.split("(")
        city = split[0]
        code_postal = split[1]
    else:
        city = row["Department_Search"]
        code_postal="None"
    
    row["city"] = city
    row["cp"] = code_postal.replace(")","")
    return row

df_salary = df_salary.apply(split_city, axis=1)

### Salaires

#### Split_salary function creation (salary min, max, mean)

Création des colonnes salaire_max, salaire_min et type de salaire. On transforme en annuelle seulement les valeurs en mois qui sont supérieur à 1500 smic brut + création de la colonne type de salaire (annuelle, mensuelle, hebdomadaire et par heure)

In [13]:
def split_salary(row):
    salary = row["Salary"]
    if "-" in salary:
        split = salary.split("-")
        salary_min = split[0]
        salary_max = split[1]
    else:
        salary_min = salary
        salary_max = salary
    
    row["salary_min"] = salary_min.replace("€","")\
                                  .replace("par an","")\
                                  .replace("par mois","")\
                                  .replace("par semaine","")\
                                  .replace("par jour","")\
                                  .replace("par heure","")\
                                  .replace("\xa0","")
    row["salary_max"] = salary_max.replace("€","")\
                                  .replace("par an","")\
                                  .replace("par mois","")\
                                  .replace("par semaine","")\
                                  .replace("par jour","")\
                                  .replace("par heure","")\
                                  .replace("\xa0","")
    
    if "an" in row["Salary"]:
        row["salary_period"] = "year"

    if "mois" in row["Salary"]:
        if float(row["salary_min"]) < 1500:
            row["salary_min"] = float(row["salary_min"])
            row["salary_max"] = float(row["salary_max"])
            row["salary_period"] = "month"
        else:
            row["salary_min"] = float(row["salary_min"])*12
            row["salary_max"] = float(row["salary_max"])*12
            row["salary_period"] = "year"
    
    if "semaine" in row["Salary"]:
        row["salary_min"] = float(row["salary_min"])
        row["salary_max"] = float(row["salary_max"])
        row["salary_period"] = "week"

    if "jour" in row["Salary"]:
        row["salary_min"] = float(row["salary_min"])*5
        row["salary_max"] = float(row["salary_max"])*5
        row["salary_period"] = "week"

    if "heure" in row["Salary"]:
        row["salary_min"] = float(row["salary_min"].replace(",","."))*35
        row["salary_max"] = float(row["salary_max"].replace(",","."))*35
        row["salary_period"] = "week"
        
    return row

df_salary = df_salary[~df_salary.Salary.isna()]

df_salary = df_salary.apply(split_salary, axis=1)
df_salary["salary_min"]=pd.to_numeric(df_salary["salary_min"],'coerce')
df_salary["salary_max"]=pd.to_numeric(df_salary["salary_max"],'coerce')
df_salary["salary_mean"] = (df_salary["salary_min"]+df_salary["salary_max"])/2

In [14]:
print("Nombre de lignes avec salaire en années : {}".format(df_salary[df_salary["salary_period"]=="year"].shape[0]))
print("Nombre de lignes avec salaire en mois : {}".format(df_salary[df_salary["salary_period"]=="month"].shape[0]))
print("Nombre de lignes avec salaire en semaine : {}".format(df_salary[df_salary["salary_period"]=="week"].shape[0]))

Nombre de lignes avec salaire en années : 675
Nombre de lignes avec salaire en mois : 30
Nombre de lignes avec salaire en semaine : 39


In [15]:
df_salary[df_salary["salary_period"]=="month"] # visualisation lignes saliare mensuelle

Unnamed: 0,_id,Title,Company,Location,Salary,Description,Date,Job_Search,Department_Search,experience_min,experience_max,BTS,Deug,DUT,IUT,Licence,Master,Doctorat,Ecole_Ingénieure,cdi,stage,freelance,cdd,description_clean,Python,Java,Machine Learning,Deep Learning,Javascript,Swift,NoSql,SQL,Agile,J2ee,Jee,Ruby,HTML_CSS,Php,Spark,Big_data,Scala,Back_end,Git,R,Powerbi,Cloud,C,Docker,ABAP,WLang,city,cp,salary_min,salary_max,salary_period,salary_mean
339,p_11932114a525d757,STAGE Chef de projet - Customer Data Managemen...,Danone,Limonest (69),1 120 € par mois,Danone recherche un(e) STAGE Chef de projet - ...,06/10/2019,Data,Rhône,,,0,0,0,0,0,1,0,0,0,1,0,0,danone recherche stage chef projet customer da...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,Limonest,69,1120.0,1120.0,month,1120.0
358,p_13477b94e2936148,Stage software et Data Engineer,Outmind,Paris 8e (75),600 € - 1 200 € par mois,À propos Outmind est une jeune startup créant ...,07/09/2019,Data,75,2.0,6.0,0,0,0,0,0,1,0,0,0,1,0,0,propos outmind jeune startup creant moteur rec...,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,Paris,75,600.0,1200.0,month,900.0
599,p_21fa7fcf997add98,Stagiaire Stage Développeur/se web FRONT ou BACK,ITALIC,Paris 10e (75),700 € - 900 € par mois,"Récapitulatif du poste Le studio ITALIC , créé...",18/09/2019,Développeur,75,1.0,1.0,1,1,1,0,1,0,0,0,0,1,0,0,recapitulatif poste studio italic cree produit...,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,Paris,75,700.0,900.0,month,800.0
721,p_283edb6188ab0e56,Analyste développeur H/F,TMDS,Irigny (69),600 € - 1 000 € par mois,Stage QA Tester Pourquoi venir chez nous ? Tot...,04/10/2019,Développeur,Rhône,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,stage qa tester pourquoi venir chez total arke...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,Irigny,69,600.0,1000.0,month,800.0
855,p_2f2cf27029d2954b,STAGIAIRE DEVELOPPEUR FRONT-END (H/F),LEOO,Paris (75),560 € par mois,"SOCIETE LEOO, filiale du groupe ADLPerformance...",07/09/2019,Développeur,75,2.0,2.0,1,1,1,0,1,0,0,0,0,0,0,0,societe leoo filiale groupe adlperformance lea...,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,Paris,75,560.0,560.0,month,560.0
895,p_31d61797124bb736,Data Analyst - Stage,Pretto,Paris (75),800 € - 1 200 € par mois,Dans une startup qui croît et se structure à t...,05/10/2019,Data,75,,,0,0,0,0,0,0,0,0,0,1,0,0,startup croit structure toute vitesse role pol...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Paris,75,800.0,1200.0,month,1000.0
1305,p_4adf45bdd9431ed7,"Data engineer (Oracle, SQL) / Freelance",Thiveo,Paris (75),600 € par mois,Mettre en uvre des solutions techniques sécur...,07/09/2019,Data,75,,,0,0,0,0,0,0,0,0,0,0,1,0,mettre uvre solutions techniques securisees au...,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Paris,75,600.0,600.0,month,600.0
1377,p_4f36493edf4833b5,Stage - Développeur ASP.NET / C# (H/F),Aidimpact,Toulouse (31),580 € par mois,Développeur C# .NET Dans le cadre de notre cro...,27/09/2019,Développeur,Haute-Garonne,2.0,2.0,0,0,0,0,0,0,0,0,0,1,0,0,developpeur net cadre croissance augmentation ...,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,Toulouse,31,580.0,580.0,month,580.0
1402,p_50fd489da08f1f55,Développeur Web (stage) H/F,Blent.ai,Paris 3e (75),700 € par mois,Blent.ai est une startup spécialisée dans la...,07/10/2019,Développeur,75,3.0,3.0,0,0,0,0,0,0,0,0,0,1,0,0,blent startup spe cialise formation tiers data...,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,Paris,75,700.0,700.0,month,700.0
1518,p_57f4a3ee1f4b1fd5,stage ingénieur - développeur,Certilience,Limonest (69),600 € - 850 € par mois,Résumé Environnement Vos missions Votre profil...,06/02/2019,Développeur,Rhône,,,0,0,0,0,0,0,0,0,0,1,0,0,resume environnement missions profil condition...,1,1,0,0,1,0,0,1,1,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,Limonest,69,600.0,850.0,month,725.0


### DATA CLEANING

### Missing values

#### Drop lines without salary

In [16]:
a = df_salary[df_salary["salary_period"]=="year"] # visualisation lignes saliare mensuelle
a.shape

(675, 56)

In [17]:
a.experience_min.isna().sum()

275

In [18]:
275/675*100

40.74074074074074

In [19]:
b = df_salary[df_salary["salary_period"]=="month"] # visualisation lignes saliare mensuelle
print(b.shape)
print(b.experience_min.isna().sum())

(30, 56)
20


In [20]:
20/30 * 100

66.66666666666666

In [21]:
c = df_salary[df_salary["salary_period"]=="week"] # visualisation lignes saliare mensuelle
print(c.shape)
print(c.experience_min.isna().sum())

(39, 56)
24


In [22]:
24/59 * 100

40.67796610169492

In [None]:
df_salary=pd.read_csv("/Users/fabi/Documents/Data AI Ecouen/PROJECTS/salary_indeed.csv")

In [23]:
df_salary

Unnamed: 0,_id,Title,Company,Location,Salary,Description,Date,Job_Search,Department_Search,experience_min,experience_max,BTS,Deug,DUT,IUT,Licence,Master,Doctorat,Ecole_Ingénieure,cdi,stage,freelance,cdd,description_clean,Python,Java,Machine Learning,Deep Learning,Javascript,Swift,NoSql,SQL,Agile,J2ee,Jee,Ruby,HTML_CSS,Php,Spark,Big_data,Scala,Back_end,Git,R,Powerbi,Cloud,C,Docker,ABAP,WLang,city,cp,salary_min,salary_max,salary_period,salary_mean
8,p_008d77a008590232,INGÉNIEUR DÉVELOPPEUR C++ / JAVA F/H,ALTEN,Toulouse (31),30 000 € - 45 000 € par an,Partenaire technologique de référence des plus...,08/09/2019,Développeur,Haute-Garonne,1.0,1.0,0,0,0,0,0,1,0,0,1,0,0,0,partenaire technologique reference plus grande...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,Toulouse,31,30000.0,45000.0,year,37500.0
15,p_00f2ad8db3c30b8e,Devops F/H,Kent FR,Bordeaux (33),35 000 € - 40 000 € par an,"KENT, marque de GroupAgora dédiée aux recrutem...",09/03/2019,Développeur,Gironde,2.0,2.0,0,0,0,0,0,1,0,0,0,0,0,0,kent marque groupagora dediee recrutements pro...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,Bordeaux,33,35000.0,40000.0,year,37500.0
17,p_00fa03a21ad52a71,Data Scientist H/F,QYSY,Paris (75),40 000 € - 65 000 € par an,QYSY est un cabinet de conseil et de recruteme...,29/09/2019,Data,75,2.0,2.0,0,0,0,0,0,1,0,0,1,0,0,0,qysy cabinet conseil recrutementqui propose re...,1,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,Paris,75,40000.0,65000.0,year,52500.0
19,p_0130bef769eef3cc,BUSINESS DEVELOPER BTOB - DATA INTELLIGENCE (H/F),L.I.P,Lyon 3e (69),35 000 € - 60 000 € par an,69003 - Lyon-3e-Arrondissement CDI Bac+5 et pl...,07/10/2019,Data,Rhône,2.0,5.0,0,0,0,0,0,1,0,0,1,0,0,0,lyon 3e arrondissement cdi bac+5 plus ingenieu...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,Lyon,69,35000.0,60000.0,year,47500.0
31,p_019d01b7494dc632,Référent fonctionnel et données - Data manager...,GARCIN FINANCES ET PARTICIPATIONS,Corbas (69),40 000 € - 45 000 € par an,Vous souhaitez participer à la structuration d...,10/09/2019,Data,Rhône,1.0,1.0,0,0,0,0,0,1,0,0,1,0,0,0,souhaitez participer structuration donnees usa...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Corbas,69,40000.0,45000.0,year,42500.0
44,p_02074799b0dbc42e,Développeur PHP / Laravel,Sept Lieues,Paris (75),32 000 € - 37 000 € par an,PME de près de 20 ans d'existence dans la gest...,08/04/2019,Développeur,75,2.0,2.0,0,0,0,0,1,0,0,0,0,0,0,0,pme pres ans existence gestion situations cris...,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,Paris,75,32000.0,37000.0,year,34500.0
49,p_02270c340e21e7ab,Développeur WinDev H/F,Axemploi Recrutement,Nantes (44),30 000 € - 40 000 € par an,DEVELOPPEUR WINDEV H/F Vous interviendrez sur...,18/09/2019,Développeur,Loire-Atlantique,,,0,0,0,0,0,0,0,0,1,0,0,0,developpeur windev interviendrez developpement...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,Nantes,44,30000.0,40000.0,year,35000.0
66,p_033110c9c03332f9,Développeur Java H/F,WOM,Lyon (69),30 000 € - 40 000 € par an,"Notre client, spécialisé dans l’édition de log...",03/10/2019,Développeur,Rhône,1.0,1.0,0,0,0,0,0,1,0,0,1,0,0,0,client specialise edition logiciel rse recherc...,0,1,0,0,1,0,0,0,1,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,Lyon,69,30000.0,40000.0,year,35000.0
87,p_046189897fdb70ed,CONTRÔLEUR DE GESTION - DATA ANALYST,SH Conseils,Paris 8e (75),35 000 € - 38 000 € par an,Pour le compte d’un Opérateur Télécoms de prem...,18/09/2019,Data,75,2.0,3.0,0,0,0,0,0,1,0,0,1,0,0,0,compte operateur telecoms premier plan france ...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Paris,75,35000.0,38000.0,year,36500.0
105,p_05b4d08c39d1a13c,Développeur Mobile (Senior) iOS / Swift H/F,EXTERNATIC,Bordeaux (33),40 000 € - 50 000 € par an,"DESCRIPTION DE L'OFFRE Externatic, le hub d’op...",07/09/2019,Développeur,Gironde,3.0,5.0,0,0,0,0,0,1,0,0,0,0,0,0,description offre externatic hub opportunites ...,1,1,0,0,1,1,0,1,1,0,0,1,0,0,0,1,1,1,1,1,0,1,0,1,0,0,Bordeaux,33,40000.0,50000.0,year,45000.0


#### Manage missing values

Ivan

si salaire en ANNUEL mais pas de contrat renseigné --> arbitrage en mettant le contrat le plus fréquent cad CDI

si salaire en HEURE/JOUR/SEMAINE mais pas de contrat renseigné --> arbitrage en mettant le contrat = FREELANCE

### Features

#### For basic Model

x
-  experiences
-  contract
-  competences & keys words
-  qualifications
-  departement_search
-  job_search

y
- salaire min, 

#### With CountVectorizer Vizer application