# Inicialización

In [1]:
#Imports
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.utils import resample

In [2]:
# Declaración de variables inicial
df_original = pd.read_csv('fake_job_postings.csv', index_col=0)
df_mod = df_original.copy()

# Declaración de funciones inicial
def evalua_preprocesamiento_fake_job(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    clf = svm.SVC()
    clf.fit(X_train, y_train)
    y_hat = clf.predict(X_test)
    f1 = metrics.fbeta_score(y_test, y_hat, average='macro', beta=5)
    print('F1-Score: {:6.4f}'.format(f1))
    print(list(zip(y_test, y_hat)))
df_mod.head()

Unnamed: 0_level_0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


# Target

In [3]:
# Crea el dataframe target y lo elimina de los datos
df_fraudulent = df_mod[df_mod.fraudulent==1]
df_not_fraudulent = df_mod[df_mod.fraudulent==0]

df_not_fraudulent_downsampled = resample(df_not_fraudulent, replace=False, n_samples=12000, random_state=0)
df_balanced = pd.concat([df_not_fraudulent_downsampled, df_fraudulent]).sample(frac=1).reset_index(drop=True)

df_target = df_balanced[['fraudulent']]
df_mod = df_balanced.drop(['fraudulent'], axis=1)

In [4]:
df_balanced.tail()

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
12861,English Teacher Abroad,"US, OR, Eugene",,,We help teachers get safe &amp; secure jobs ab...,"Play with kids, get paid for it Love travel? J...",University degree required. TEFL / TESOL / CEL...,See job description,0,1,1,Contract,,Bachelor's Degree,Education Management,,0
12862,Freelance writer/analyst for Spain,"ES, MD, Madrid",,,The Advanced Interactive Media Group LLC - or ...,The AIM Group is the world’s leading consultan...,We're looking to strenghen our coverage of Spa...,Working within a dedicated and enthusiastic vi...,1,1,1,Contract,,,Internet,Writing/Editing,0
12863,CARETAKER COUPLE LIVE IN,"GB, NYK, ROBIN HOODS BAY",HOLIDAY COTTAGES,,,Job Vacancy: caretakersComes with a fully furn...,,comes with1 bed roomed detached stone cottage,0,0,0,,,,,,0
12864,Electrical Maintenance Technician,"US, IL, Chicago",,,We Provide Full Time Permanent Positions for m...,(We have more than 1500+ Job openings in our w...,,,0,0,0,Full-time,,,,,0
12865,Head of Marketing,"SG, 01, Singapore",Marketing,65000-120000,If working in a cubical seems like your idea o...,Startup marketer? Growth &amp; Demand generati...,Approx 3-5 years of progressive experience in ...,"Awesome work enviroment, Awesome team!Healthca...",0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Marketing,0


# Location

In [5]:
# Itera el dataframe
for i, row in df_mod.iterrows():
    # Comprueba valores NA de localización y divide el string asociandolo a nuevas columnas
    if pd.notna(row.location):
        location =  row.location.split(',')
        df_mod.loc[i,'country'] = location[0]
        if len(location) > 1:
            df_mod.loc[i,'state'] = location[1]
        if len(location) > 2:
            df_mod.loc[i,'city'] = location[2]

# One hot encoding de nuevas columnas
df_mod = pd.get_dummies(df_mod, columns=['country','state','city'])

# Elimina la columna original
df_mod = df_mod.drop(['location'], axis=1)

# Department and salary

In [6]:
# Sutituimos las celdas con información por 1 y las que no tienen con 0
df_mod['department'][df_mod.department.notnull()] = 1
df_mod['department'][df_mod.department.isnull()] = 0
df_mod['salary_range'][df_mod.salary_range.notnull()] = 1
df_mod['salary_range'][df_mod.salary_range.isnull()] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


# Industry, function, employment, experience and education

In [7]:
# Ponemos NAN a 'Missing' 
df_mod['industry'][df_mod.industry.isnull()] = 'Missing'
df_mod['function'][df_mod.function.isnull()] = 'Missing'
df_mod['employment_type'][df_mod.employment_type.isnull()] = 'Missing'
df_mod['required_experience'][df_mod.required_experience.isnull()] = 'Missing'
df_mod['required_education'][df_mod.required_education.isnull()] = 'Missing'

# One hot encoding de nuevas columnas
df_mod = pd.get_dummies(df_mod, columns=['industry','function','employment_type',
                                         'required_experience','required_education'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a sl

# Procesamiento de texto: title, company profile, description, requirements, benefits

In [8]:
# Ponemos NAN a 'Missing' 
df_mod['company_profile'][df_mod.company_profile.isnull()] = 'Missing'
df_mod['description'][df_mod.description.isnull()] = 'Missing'
df_mod['requirements'][df_mod.requirements.isnull()] = 'Missing'
df_mod['benefits'][df_mod.benefits.isnull()] = 'Missing'

# Definición de la función de procesado
def procesar_texto(df, col_name):
    #Creo el corpus
    corpus = df[col_name].values
    corpus = [re.sub('[^a-zA-Z\s]' ,' ', text).lower() for text in corpus]
    
    #Hago stemming
    wordnet_lemmatizer = WordNetLemmatizer()
    for i in range(len(corpus)):
        corpus[i] = ' '.join([wordnet_lemmatizer.lemmatize(palabra, pos='n') for palabra in word_tokenize(corpus[i])])

    #Hago tfidf
    tfidf_vectorizer = TfidfVectorizer(stop_words = stopwords.words('english'), min_df=0.0002)
    X = tfidf_vectorizer.fit_transform(corpus)
    
    #Creo el DataFrame
    words_names = tfidf_vectorizer.get_feature_names()
    for i, word in enumerate(words_names):
        words_names[i]+=col_name    
    df_texto_procesado = pd.DataFrame.sparse.from_spmatrix(X,columns=words_names)
    
    return df_texto_procesado

# Procesado de columnas
df_title = procesar_texto(df_mod, 'title')
df_company_profile = procesar_texto(df_mod, 'company_profile')
df_description = procesar_texto(df_mod, 'description')
df_requirements = procesar_texto(df_mod, 'requirements')
df_benefits = procesar_texto(df_mod, 'benefits')

# Eliminamos las columnas procesadas
df_mod = df_mod.drop(['title','company_profile','description','requirements','benefits'], axis=1)

# Concatenamos los nuevos dataframes
df_mod = pd.concat([df_mod,df_title, df_company_profile, df_description, df_requirements, df_benefits], axis=1, sort=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


# Visualización

In [9]:
df_mod.shape

(12866, 43143)

In [10]:
df_mod.head()

Unnamed: 0,department,salary_range,telecommuting,has_company_logo,has_questions,country_AE,country_AL,country_AM,country_AR,country_AT,...,zelfstandigebenefits,zenefitsbenefits,zerobenefits,zipcarbenefits,zodatbenefits,zollmanbenefits,zombiebenefits,zonebenefits,zoningbenefits,zultbenefits
0,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,1,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
evalua_preprocesamiento_fake_job(df_mod, df_target.values.ravel())

F1-Score: 0.8387
[(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (0, 0), (1, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (1, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)