In [680]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_validate

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer



import re

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (8, 8)
plt.rcParams['font.size'] = 17



import warnings
warnings.filterwarnings("ignore")
sns.set(style="ticks", color_codes=True)
%matplotlib inline

In [681]:
# Load the processed dataset; the path is relative to the notebook's working directory.
# (Plain string literal: there is nothing to interpolate, so no f-prefix is needed.)
data = pd.read_csv('../app/data/processed_data.csv')


In [682]:
data.text[0]

"job descriptiondata scientist, marketingsan diego, ca /analytics – data science /full-timeheadquartered in san diego, we serve as a leading provider of working capital ($5k - $1.5m) to the small and medium-sized businesses that fuel our country. since 2008, we have prided ourselves on our collaborative, innovative, and customer-focused approach. enjoying a period of unprecedented growth, driven by the combination of cutting-edge technology, human touch, and unwavering integrity, we are looking to add to our people-first culture, with highly motivated and results-oriented professionals, to push the limits of what's possible while creating value for all of our partners.we are seeking a mid-level to senior level statistician, quantitative modeling specialist, or data scientist to join our analytics team and build predictive models for marketing. if you have exceptional analytical, quantitative and problem-solving skills, demonstrated experience designing and implementing predictive model

In [683]:
def cleanPunc(sentence):
    """Strip punctuation and special characters from ``sentence``.

    The characters ``? | ! ' " #`` are deleted, the characters
    ``. , ) ( | /`` are each replaced by a single space, surrounding
    whitespace is trimmed, and any remaining newline becomes a space.

    Parameters
    ----------
    sentence : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Inside a character class `|` is a literal pipe, not alternation.
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    spaced_out = re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
    # Trim the ends first, then flatten interior line breaks.
    return spaced_out.strip().replace("\n", " ")


In [684]:
def clean_text(data):
    """Remove punctuation and digits from ``data``, one '.'-delimited fragment at a time.

    Each fragment has the characters ``? | ! ' " # / , ( ) $ :`` deleted, is
    trimmed of surrounding spaces, and then has digits deleted as well.
    Fragments with at most one non-whitespace-trimmed character left are
    dropped. Hyphens are not matched by either pattern and are kept.

    Parameters
    ----------
    data : str
        Raw text.

    Returns
    -------
    str
        The kept fragments joined by single spaces, stripped at both ends.
    """
    kept = []
    for fragment in data.split('.'):
        # The two adjacent literals are concatenated by Python into one pattern.
        stripped = re.sub(r'[?|!|\'|"|#|/|-|,(|)|$|-|' '|:]', r'', fragment)
        stripped = re.sub(r'[?|!|\'|"|#|/|-|,(|)|$|(0-9)]', r'', stripped.strip(' '))
        if len(stripped.strip()) > 1:
            kept.append(stripped)
    return ''.join(' ' + piece for piece in kept).strip()
    

data['cleaned'] = data.text.apply(clean_text)

In [685]:
data.cleaned[0]

'job descriptiondata scientist marketingsan diego ca analytics – data science full-timeheadquartered in san diego we serve as a leading provider of working capital k -  m to the small and medium-sized businesses that fuel our country since  we have prided ourselves on our collaborative innovative and customer-focused approach enjoying a period of unprecedented growth driven by the combination of cutting-edge technology human touch and unwavering integrity we are looking to add to our people-first culture with highly motivated and results-oriented professionals to push the limits of whats possible while creating value for all of our partners we are seeking a mid-level to senior level statistician quantitative modeling specialist or data scientist to join our analytics team and build predictive models for marketing if you have exceptional analytical quantitative and problem-solving skills demonstrated experience designing and implementing predictive models and analytics in marketing a pr

In [686]:
def clean_text(data):
    """Reduce raw text to space-separated alphabetic tokens.

    The text is split on '.', and within each fragment the characters
    ``? | ! ' " # / , ( ) $ :`` are deleted (so ``it's`` becomes ``its``).
    Runs of ASCII letters and ``&`` are then kept as tokens; everything else
    (digits, hyphens, dashes, whitespace) acts as a separator. Fragments that
    yield no tokens are skipped, so the result never contains runs of spaces.

    Parameters
    ----------
    data : str
        Raw text.

    Returns
    -------
    str
        All tokens joined by single spaces; empty string if there are none.
    """
    fragments = []
    for sentence in data.split('.'):
        # One pass is enough: deleting the same characters twice changes nothing.
        stripped = re.sub(r'[?|!\'"#/,()$:]', '', sentence)
        # '(', ')' and ',' are already gone, so only letters and '&' can match.
        tokens = re.findall(r'[a-zA-Z&]+', stripped)
        if tokens:
            fragments.append(' '.join(tokens))
    return ' '.join(fragments)

    

data['cleaned'] = data.text.apply(clean_text)



In [687]:
data.cleaned[3]

'li remote about eab at eab our mission is to make education smarter and our communities stronger we work with more than institutions to drive transformative change through data driven insights and best in class capabilities from kindergarten to college to career eab partners with leaders and practitioners to accelerate progress and drive results across five major areas enrollment student success institutional strategy data & analytics and diversity equity and inclusion de&i we work with each partner differently tailoring our portfolio of research technology and marketing and enrollment solutions to meet the unique needs of every leadership team as well as the students and employees they serve at eab we serve not only our partner institutions but each other thats why we are always working to make sure our employees love their jobs and are invested in their communities see how weve been recognized for this dedication to our employees by checking out our recent awards for more informatio

### 4.1 Target
Before I can begin splitting the data, I need to set the target for my methodology of training four separate logistic regression models. I'm doing this because I'd like my classifications to be as accurate as possible, and because it lets me build my NLP strategy around a particular label, i.e. finding common words for that label as opposed to generalizing across the entire corpus.

I'm going to one-hot-encode the target feature so I can select each of the next columns as my y - one for each model.

In [688]:
data = data[['company','job_title','state','city','rating','cleaned','target']]

In [689]:
# One-hot encode the target column into a dense integer array, one column per distinct target value.
# NOTE(review): `sparse=` is the older keyword name; newer scikit-learn releases call it
# `sparse_output` — confirm against the installed version.
ohe = OneHotEncoder(sparse=False, dtype='int')
targets = ohe.fit_transform(pd.DataFrame(data.target))


In [690]:
# Wrap the encoded array in a DataFrame with one named column per encoded category.
# NOTE(review): the labels are hard-coded and presumably follow the order of `ohe.categories_`
# (four target levels, then the missing/unknown bucket) — verify before relying on them.
targets = pd.DataFrame(targets,columns=['Q1','Q2','Q3','Q4','unk'])


In [691]:
data = data.join(targets)
data.head()

Unnamed: 0,company,job_title,state,city,rating,cleaned,target,Q1,Q2,Q3,Q4,unk
0,online technical services,data scientist - marketing,remote,remote,3.7,job descriptiondata scientist marketingsan die...,4.0,0,0,0,1,0
1,west cap,"data scientist, botguard",ny,remote in new york,3.5,human was founded in in a brooklyn sci fi book...,2.0,0,1,0,0,0
2,techtrueup,mcs data scientist,remote,remote,3.8,description data scientist fully remote develo...,3.0,0,0,1,0,0
3,eab,associate data scientist,dc,remote in washington,3.7,li remote about eab at eab our mission is to m...,1.0,1,0,0,0,0
4,redfin,senior data analyst - tour support (remote eli...,remote,remote,3.4,this position is a remote eligible position yo...,2.0,0,1,0,0,0


In [692]:
# Keep only the four per-quartile indicator columns; the raw target and the 'unk' flag go.
# NOTE(review): rows that were flagged 'unk' presumably end up with Q1..Q4 all zero — confirm
# that treating them as negatives is intended.
data = data.drop(columns=['target', 'unk'])

In [693]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
def stemming(sentence):
    """Return ``sentence`` with each whitespace-separated word replaced by its stem.

    Stems come from the module-level ``stemmer`` and are rejoined with single
    spaces; the result is stripped of surrounding whitespace.
    """
    return " ".join(stemmer.stem(word) for word in sentence.split()).strip()


data['comment_text'] = data['cleaned'].apply(stemming)

In [694]:
data.comment_text[0]

'job descriptiondata scientist marketingsan diego ca analyt data scienc full timeheadquart in san diego we serv as a lead provid of work capit k m to the small and medium size busi that fuel our countri sinc we have pride ourselv on our collabor innov and custom focus approach enjoy a period of unpreced growth driven by the combin of cut edg technolog human touch and unwav integr we are look to add to our peopl first cultur with high motiv and result orient profession to push the limit of what possibl while creat valu for all of our partner we are seek a mid level to senior level statistician quantit model specialist or data scientist to join our analyt team and build predict model for market if you have except analyt quantit and problem solv skill demonstr experi design and implement predict model and analyt in market a proven track record of bring thought leadership to problem and the desir to make a rapid impact on the success of the busi this is an opportun for you the ideal candid

In [695]:
from nltk.stem import WordNetLemmatizer



lemmatizer = WordNetLemmatizer()
def stemming(sentence):
    """Return ``sentence`` with each whitespace-separated word lemmatized.

    Despite the name, this uses the module-level ``lemmatizer`` rather than a
    stemmer. Lemmas are rejoined with single spaces and the result is stripped
    of surrounding whitespace.
    """
    # lemmatize() is called without a part-of-speech argument.
    return " ".join(lemmatizer.lemmatize(word) for word in sentence.split()).strip()


data['comment_text_lem'] = data['cleaned'].apply(stemming)
data.comment_text_lem[0]

'job descriptiondata scientist marketingsan diego ca analytics data science full timeheadquartered in san diego we serve a a leading provider of working capital k m to the small and medium sized business that fuel our country since we have prided ourselves on our collaborative innovative and customer focused approach enjoying a period of unprecedented growth driven by the combination of cutting edge technology human touch and unwavering integrity we are looking to add to our people first culture with highly motivated and result oriented professional to push the limit of whats possible while creating value for all of our partner we are seeking a mid level to senior level statistician quantitative modeling specialist or data scientist to join our analytics team and build predictive model for marketing if you have exceptional analytical quantitative and problem solving skill demonstrated experience designing and implementing predictive model and analytics in marketing a proven track recor

In [696]:
# Only the lemmatized text is used from here on; drop the intermediate text columns.
data = data.drop(columns=['cleaned', 'comment_text'])

In [697]:
data

Unnamed: 0,company,job_title,state,city,rating,Q1,Q2,Q3,Q4,comment_text_lem
0,online technical services,data scientist - marketing,remote,remote,3.7,0,0,0,1,job descriptiondata scientist marketingsan die...
1,west cap,"data scientist, botguard",ny,remote in new york,3.5,0,1,0,0,human wa founded in in a brooklyn sci fi books...
2,techtrueup,mcs data scientist,remote,remote,3.8,0,0,1,0,description data scientist fully remote develo...
3,eab,associate data scientist,dc,remote in washington,3.7,1,0,0,0,li remote about eab at eab our mission is to m...
4,redfin,senior data analyst - tour support (remote eli...,remote,remote,3.4,0,1,0,0,this position is a remote eligible position yo...
...,...,...,...,...,...,...,...,...,...,...
1025,etsy,senior applied scientist ii-knowledge base,wa,remote in seattle,4.3,0,0,0,0,company description etsy is the global marketp...
1026,nike,"senior data scientist (open to remote work, ex...",or,remote in beaverton,4.1,0,0,0,0,become a part of the nike inc teamnike inc doe...
1027,windstream communications,senior data scientist,remote,remote,3.2,0,0,0,0,we are looking for a data scientist to help u ...
1028,nike,"senior data scientist (open to remote work, ex...",or,remote in beaverton,4.1,0,0,0,0,become a part of the nike inc teamnike inc doe...


In [698]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [699]:
# One frame per quartile model: each keeps its own indicator column and drops the other three.
q1_data = data.drop(columns=['Q2', 'Q3', 'Q4'])
q2_data = data.drop(columns=['Q1', 'Q3', 'Q4'])
q3_data = data.drop(columns=['Q1', 'Q2', 'Q4'])
q4_data = data.drop(columns=['Q1', 'Q2', 'Q3'])

In [700]:
# Feature matrix and label for the Q1 model.
X = q1_data.drop(['Q1'], axis=1)
y = q1_data['Q1']


# TF-IDF over word uni- to tri-grams of the lemmatized text, capped at 1000 features.
# NOTE(review): the vectorizer is fitted on the whole corpus before the train/test split
# below, so test-set term statistics influence the training features — consider fitting
# it on the training split only.
q1_vectorizer = TfidfVectorizer(ngram_range=(1,3), analyzer = 'word',max_features=1000,stop_words='english')
corpus = X.comment_text_lem
# Fit exactly once and reuse the resulting matrix.
tfidf_matrix = q1_vectorizer.fit_transform(corpus).toarray()
# Newer scikit-learn exposes `get_feature_names_out`; fall back to the older accessor otherwise.
feature_names_getter = getattr(q1_vectorizer, 'get_feature_names_out', None) or q1_vectorizer.get_feature_names
# Reuse X's index so the join below lines up row-for-row.
vectorized = pd.DataFrame(tfidf_matrix, columns=feature_names_getter(), index=X.index)


# Attach the TF-IDF columns. `lsuffix='_'` renames metadata columns whose names collide
# with TF-IDF terms (e.g. `company` -> `company_`).
X = X.join(vectorized,lsuffix='_')
X = X.fillna(0)



x_train, x_test, y_train, y_test = train_test_split(X, y , test_size=.2, random_state=42)

In [701]:
# The raw text is already represented by the TF-IDF columns, so remove it from both splits.
x_train, x_test = (
    split.drop(columns='comment_text_lem') for split in (x_train, x_test)
)

In [702]:
le_cols = x_train.select_dtypes('object').columns
le_cols

Index(['company_', 'job_title', 'state_', 'city_'], dtype='object')

In [703]:
# Column groups consumed by the ColumnTransformer below.
nlp_cols = ['comment_text_lem']  # not referenced by any active transformer in this cell
le_cols = x_train.select_dtypes('object').columns  # object-dtype columns -> one-hot encoded
scal_cols = ['rating']  # standardized

# One-hot encoder; categories unseen during fit are encoded as all zeros instead of raising.
one_hot_encoder = Pipeline(steps=[
    ('ohe', OneHotEncoder(handle_unknown ='ignore'))
])

# TF-IDF step with the same settings as the standalone vectorizer; defined here but not
# wired into `preprocessor`.
vect = Pipeline(steps=[
    ('vect', TfidfVectorizer(ngram_range=(1,3), analyzer = 'word',max_features=1000,stop_words='english'))
])

# NOTE(review): LabelEncoder is designed for target labels rather than feature columns,
# so this step would presumably fail inside a ColumnTransformer; it is not used below.
label_encoder = Pipeline(steps=[
    ('label_enc', LabelEncoder())
])
scaler_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Alternative scaler; defined but not used by `preprocessor`.
minmax_scalar_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())])


# Active preprocessing: one-hot encode the object columns and standardize `rating`.
# `remainder='passthrough'` forwards every other column unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', one_hot_encoder, le_cols),
        ('scaler', scaler_transformer,scal_cols),
    ],remainder='passthrough'
)

# Single-step pipeline wrapping the column transformer.
transformer = Pipeline(steps=[('preprocessor', preprocessor)])


In [704]:
from sklearn.linear_model import LogisticRegression
# Full model: column preprocessing followed by a default-configured logistic regression.
pipe = Pipeline([
    ("preprocessor", transformer),
    ("logreg", LogisticRegression()),
])

pipe.fit(x_train, y_train)



In [705]:
# Mean fit/score times and train/validation accuracy over 10 cross-validation folds
# of the training split; fold errors are raised rather than scored.
pd.DataFrame(
    cross_validate(
        pipe,
        x_train,
        y_train,
        cv=10,
        scoring='accuracy',
        return_train_score=True,
        error_score="raise",
    )
).mean()

fit_time       0.434359
score_time     0.028966
test_score     0.970835
train_score    0.979504
dtype: float64

In [706]:
pipe.score(x_train, y_train)


0.9805825242718447

In [707]:
x_train

Unnamed: 0,company_,job_title,state_,city_,rating,ab,ability,ability work,able,accelerate,...,writing,written,written verbal,year,year experience,year professional,year professional experience,youll,youre,zone
995,spotify,"data scientist, advertising economics",ny,remote in new york,4.3,0.000000,0.000000,0.0,0.0,0.0,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
507,s&p global,data scientist,tn,remote in nashville,3.9,0.000000,0.029699,0.0,0.0,0.0,...,0.00000,0.043425,0.054596,0.115707,0.073827,0.000000,0.000000,0.0,0.0,0.0
334,cybercoders,remote senior data analyst,ca,remote in san francisco,3.7,0.053519,0.000000,0.0,0.0,0.0,...,0.00000,0.000000,0.000000,0.042137,0.000000,0.000000,0.000000,0.0,0.0,0.0
848,galaxe.solutions,data analyst,wi,remote in milwaukee,2.5,0.000000,0.000000,0.0,0.0,0.0,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
294,galaxe.solutions,data analyst,wi,remote in milwaukee,2.5,0.000000,0.000000,0.0,0.0,0.0,...,0.08536,0.000000,0.000000,0.082743,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,online technical services,data scientist - marketing,remote,remote,3.7,0.000000,0.000000,0.0,0.0,0.0,...,0.00000,0.000000,0.000000,0.028055,0.000000,0.000000,0.000000,0.0,0.0,0.0
330,cybercoders,principal data scientist,wa,remote in seattle,3.7,0.194667,0.024588,0.0,0.0,0.0,...,0.00000,0.000000,0.000000,0.019159,0.030561,0.000000,0.000000,0.0,0.0,0.0
466,sparkcognition,machine learning engineer,tx,remote in austin,4.4,0.000000,0.000000,0.0,0.0,0.0,...,0.00000,0.083019,0.000000,0.088482,0.070571,0.000000,0.000000,0.0,0.0,0.0
121,amadeus,principal data scientist - network planning fo...,remote,remote,3.9,0.000000,0.000000,0.0,0.0,0.0,...,0.00000,0.000000,0.000000,0.041024,0.000000,0.110709,0.115231,0.0,0.0,0.0


In [708]:
y_pred = pipe.predict(x_test)

In [709]:
x_train.shape

(824, 1005)

In [710]:
x_test.shape

(206, 1005)

In [711]:
x_train.columns

Index(['company_', 'job_title', 'state_', 'city_', 'rating', 'ab', 'ability',
       'ability work', 'able', 'accelerate',
       ...
       'writing', 'written', 'written verbal', 'year', 'year experience',
       'year professional', 'year professional experience', 'youll', 'youre',
       'zone'],
      dtype='object', length=1005)

In [712]:
x_test.columns

Index(['company_', 'job_title', 'state_', 'city_', 'rating', 'ab', 'ability',
       'ability work', 'able', 'accelerate',
       ...
       'writing', 'written', 'written verbal', 'year', 'year experience',
       'year professional', 'year professional experience', 'youll', 'youre',
       'zone'],
      dtype='object', length=1005)

In [713]:
scores_processed = cross_validate(pipe, x_train, y_train, return_train_score=True)
pd.DataFrame(scores_processed)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.258841,0.016938,0.975758,0.981791
1,0.435632,0.025419,0.981818,0.977238
2,0.543952,0.03312,0.957576,0.978756
3,0.677501,0.049773,0.957576,0.981791
4,0.629385,0.023546,0.987805,0.972727


In [714]:
from sklearn import set_config

set_config(display='diagram')
pipe

In [715]:
scores_processed = cross_validate(pipe, x_train, y_train, return_train_score=True)

In [716]:
pd.DataFrame(scores_processed)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.234286,0.02055,0.975758,0.981791
1,0.161977,0.020794,0.981818,0.977238
2,0.164715,0.016729,0.957576,0.978756
3,0.309675,0.016896,0.957576,0.981791
4,0.215975,0.012218,0.987805,0.972727


In [717]:
from sklearn.metrics import accuracy_score, hamming_loss,precision_score,recall_score,f1_score,classification_report
predictions = pipe.predict(x_test)

In [718]:
print("Accuracy :",accuracy_score(y_test, predictions))

Accuracy : 0.9757281553398058


In [719]:

print("Hamming loss ",hamming_loss(y_test,predictions))

Hamming loss  0.024271844660194174


In [720]:
# Micro-averaged precision / recall / F1 on the held-out split.
# For a single-label target, micro-averaging pools every prediction, so all three
# values coincide with plain accuracy.
precision, recall, f1 = (
    metric(y_test, predictions, average='micro')
    for metric in (precision_score, recall_score, f1_score)
)

In [721]:
print("\nMicro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))


Micro-average quality numbers
Precision: 0.9757, Recall: 0.9757, F1-measure: 0.9757


In [722]:
# Macro-averaged precision / recall / F1: the unweighted mean of the per-class scores,
# so the minority class counts as much as the majority class.
precision, recall, f1 = (
    metric(y_test, predictions, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

In [723]:
print("\nMacro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))


Macro-average quality numbers
Precision: 0.9430, Recall: 0.9737, F1-measure: 0.9575


In [724]:
print("\nClassification Report")
print (classification_report(y_test, predictions))


Classification Report
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       172
           1       0.89      0.97      0.93        34

    accuracy                           0.98       206
   macro avg       0.94      0.97      0.96       206
weighted avg       0.98      0.98      0.98       206



In [725]:
predictions

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0])

In [726]:
%reset