# Model

In [7]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from prepare import prep_create_labels
from preprocess import prep_job_data, split_job_data, add_columns

from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Wrangle and Split Data

In [8]:
# Read the scraped postings and run them through the project helpers
# (prep_create_labels -> prep_job_data -> add_columns), resetting the index
# after each of the first two steps.
df = (
    pd.read_json('indeed-data-jobs-FINAL.json')
    .pipe(prep_create_labels)
    .reset_index(drop=True)
    .pipe(prep_job_data, 'job_description', extra_words=['job', 'description'])
    .reset_index(drop=True)
    .pipe(add_columns)
)

# Spot-check five random rows of the prepared frame.
df.sample(n=5)

Unnamed: 0,job_title,company,location,is_remote,salary,post_date,date_accessed,job_description,label,clean,stemmed,lemmatized,words,doc_length
651,Sr. Machine Learning Engineer - Discovery,Twitter,"Seattle, WA",0,,6 days ago,2021-03-05,Company Description Twitter is what’s happenin...,MLE,company twitter whats happening people talking...,compani descript twitter is what happen and wh...,company description twitter is whats happening...,"[company, twitter, whats, happening, people, t...",366
79,Data Scientist,EagleView,"Rochester, NY 14607",0,,2 days ago,2021-03-05,We are looking for a talented Data Scientist t...,DS,looking talented data scientist join machine l...,we are look for a talent data scientist to joi...,we are looking for a talented data scientist t...,"[looking, talented, data, scientist, join, mac...",144
92,Data Scientist/Engineer,Oracle,United States,0,,10 days ago,2021-03-05,Data Scientist/Engineer -210009CG Applicants ...,DS,data scientistengineer 210009cg applicant requ...,data scientistengin 210009cg applic are requir...,data scientistengineer 210009cg applicant are ...,"[data, scientistengineer, 210009cg, applicant,...",557
84,Data & Applied Scientist,Microsoft,"Bellevue, WA",0,,5 days ago,2021-03-05,What if your job description were: make tomorr...,DS,make tomorrow better essence role within micro...,what if your job descript were make tomorrow b...,what if your job description were make tomorro...,"[make, tomorrow, better, essence, role, within...",353
296,Data Analyst (Sales Operations),Businessolver,Remote,0,"$80,000 - $90,000 a year",7 days ago,2021-03-05,Businessolver is a technology company deliveri...,DA,businessolver technology company delivering ma...,businessolv is a technolog compani deliv marke...,businessolver is a technology company deliveri...,"[businessolver, technology, company, deliverin...",476


In [9]:
train, validate, test = split_job_data(df)

# Print each split's share of the full dataset, rounded to two decimals.
total_rows = len(df)
for split_name, split_df in (('train', train), ('validate', validate), ('test', test)):
    print(f'{split_name}: {round(split_df.shape[0]/total_rows,2)}')

train: 0.6
validate: 0.2
test: 0.2


In [10]:
# Spot-check five random rows of the training split.
train.sample(n=5)

Unnamed: 0,label,job_title,company,location,is_remote,clean,words,doc_length
472,DE,Data Integration Engineer / Architect,Rayn Solutions LLC,Remote,1,looking talented experienced consultant long t...,"[looking, talented, experienced, consultant, l...",295
75,DS,Data Scientist (all levels),Cascade Data Labs,"Portland, OR 97202",1,cascade data lab boutique consulting agency en...,"[cascade, data, lab, boutique, consulting, age...",236
464,DE,Data Analytics Engineer,Ford Motor Company,"Dearborn, MI",0,dearborn ford motor company product developmen...,"[dearborn, ford, motor, company, product, deve...",334
131,DS,Senior Data Scientist,Verusen,"Atlanta, GA 30308",0,company verusen leading technology company us ...,"[company, verusen, leading, technology, compan...",367
693,MLE,Machine Learning Intern,Knock,United States,1,knock mission empower people move freely knock...,"[knock, mission, empower, people, move, freely...",311


We will fit bag-of-words and TF-IDF models first, and try additional models if time permits.

In [11]:
# Bag-of-words model: binary (present/absent) features for unigrams and
# bigrams that appear in at least 30 training documents.
vectorizer = CountVectorizer(stop_words='english',
                             min_df=30,
                             ngram_range=(1,2),
                             binary=True)

# Learn the vocabulary from the training text and build the document-term
# matrix in one pass (same result as fit() followed by transform()).
bow = vectorizer.fit_transform(train.clean)

# Vocabulary terms in column order. Derived from vocabulary_ (term -> column
# index) because get_feature_names() was removed in scikit-learn 1.2 and
# get_feature_names_out() does not exist before 1.0; this works on both.
bow_feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Dense copy, only for eyeballing the 0/1 entries; the model is fit on the
# sparse matrix below.
bow_array = bow.toarray()

X_bow = bow

In [12]:
# TF-IDF features over the same unigram/bigram vocabulary rules as the
# bag-of-words model. With binary=True the term-frequency part is 0/1, so
# the weights come from the IDF term and normalisation.
tfidf = TfidfVectorizer(stop_words='english',
                        min_df=30,
                        ngram_range=(1,2),
                        binary=True)

# Learn the vocabulary and IDF weights from the training text and build the
# document-term matrix in one pass.
tfidf_sparse_matrix = tfidf.fit_transform(train.clean)

# The learned term -> column-index mapping is available as tfidf.vocabulary_.

# fit_transform() already produced the document-term matrix for train.clean,
# so reuse it instead of running transform() over the same text again.
# The name is kept in case other cells refer to it.
vector_spaces = tfidf_sparse_matrix

X_tfidf = tfidf_sparse_matrix

In [14]:
# Target vector: the label column of the training split.
y = train.label

In [16]:
# Logistic regression on the bag-of-words features; the in-sample
# predictions are stored on the training frame.
lm = LogisticRegression()
lm.fit(X_bow, y)
train['predict_bow'] = lm.predict(X_bow)

# Same estimator on the TF-IDF features.
lm_tfidf = LogisticRegression()
lm_tfidf.fit(X_tfidf, y)
train['pred_tfidf'] = lm_tfidf.predict(X_tfidf)

In [18]:
# Per-class precision/recall/F1 for the bag-of-words model, computed on the
# training split it was fit on.
bow_report = classification_report(train['label'], train['predict_bow'])
print('Bag of Words Model \n', bow_report)

# Confusion table: actual label (rows) vs. predicted label (columns).
pd.crosstab(train['label'], train['predict_bow'])

Bag of Words Model 
               precision    recall  f1-score   support

          DA       0.99      1.00      1.00       112
          DE       0.99      1.00      1.00       103
          DS       1.00      0.99      1.00       103
         MLE       1.00      0.99      1.00       108

    accuracy                           1.00       426
   macro avg       1.00      1.00      1.00       426
weighted avg       1.00      1.00      1.00       426



predict_bow,DA,DE,DS,MLE
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DA,112,0,0,0
DE,0,103,0,0
DS,1,0,102,0
MLE,0,1,0,107


In [20]:
# Per-class precision/recall/F1 for the TF-IDF model, computed on the
# training split it was fit on.
tfidf_report = classification_report(train['label'], train['pred_tfidf'])
print('TF-IDF Model \n', tfidf_report)

# Confusion table: actual label (rows) vs. predicted label (columns).
pd.crosstab(train['label'], train['pred_tfidf'])

TF-IDF Model 
               precision    recall  f1-score   support

          DA       0.98      0.97      0.98       112
          DE       0.98      0.98      0.98       103
          DS       0.96      0.97      0.97       103
         MLE       0.96      0.96      0.96       108

    accuracy                           0.97       426
   macro avg       0.97      0.97      0.97       426
weighted avg       0.97      0.97      0.97       426



pred_tfidf,DA,DE,DS,MLE
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DA,109,0,3,0
DE,1,101,0,1
DS,0,0,100,3
MLE,1,2,1,104
