In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, LabelEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config
set_config(display='diagram')

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, log_loss, roc_auc_score, roc_curve, hamming_loss,precision_score,recall_score,f1_score

from confusion import make_confusion_matrix
import re

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (8, 8)
plt.rcParams['font.size'] = 17

import warnings
warnings.filterwarnings("ignore")
sns.set(style="ticks", color_codes=True)
%matplotlib inline

In [2]:
data = pd.read_csv(f'../app/data/processed_data.csv')


In [3]:
data.head()

Unnamed: 0,company,description,location,rating,requirements,summary,job_title,url,text,salary,dateposted,state,city,target
0,synergisticit,"at synergisticit, we aim to bring aboard it ...",alabama,4.2,,"collaborate with dynamic teams of engineers, d...",entry level data scientist,https://www.indeed.com/rc/clk?jk=57d47b0524890...,"at synergisticit, we aim to bring aboard it ...",88000.0,2022-06-20,alabama,alabama,1.0
1,synergisticit,about us: synergistic it is a full-service s...,"mountain brook, al 35223",4.2,,undertaking machine learning experiments and t...,machine learning developer,https://www.indeed.com/rc/clk?jk=e92afb112aad3...,about us: synergistic it is a full-service s...,107000.0,2022-06-20,al,mountain brook,2.0
2,"ierus technologies, inc.",ierus specializes in r&d and low-rate producti...,"huntsville, al 35805",4.7,,"our applications include: radar, eo/ir, rf sig...",machine learning/artificial intelligence softw...,https://www.indeed.com/company/IERUS-Technolog...,ierus specializes in r&d and low-rate producti...,120000.0,2022-06-20,al,huntsville,2.0
3,synergisticit,"at synergisticit, we aim to bring aboard it ...",arkansas,4.2,,"collaborate with dynamic teams of engineers, d...",entry level data scientist,https://www.indeed.com/rc/clk?jk=91d02dbbb1961...,"at synergisticit, we aim to bring aboard it ...",90000.0,2022-06-20,arkansas,arkansas,1.0
4,indeed,,"phoenix, az",4.3,,our data scientists build and implement machin...,data science manager - job seeker profiles,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,,201000.0,2022-06-21,az,phoenix,4.0


In [4]:
data.text[0]

"  at synergisticit, we aim to bring aboard it professionals to help them build a rewarding career in cutting-edge technologies. being in the industry for more than 10 years, we provide a splendid range of lucrative opportunities to sustain a position in our top tech clients like google, apple, cognizant, client, paypal, to name a few.   our seasoned team firmly believes that the new tech talent can scale any business if given the right opportunity. we value your integrity, hard work, and commitment to make a difference in the technical sphere. for this reason, we focus on providing end-to-end career assistance and enhancing your already existing it skills and knowledge.    currently, we are looking for qualified entry-level data scientists who can apply data science principles to design, test, implement, and develop data-based solutions, including reporting, auditing, and preparing large databases for statistical analysis.   minimum background and qualifications requirement bachelor's

In [5]:
def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned


In [12]:
def clean_text(data):
    sentences = data.split('.')
    clean_sentences = []
    for i in sentences:
        clean_sentence = re.sub(r'[?|!|\'|"|#|/|-|,(|)|$|-|' '|:]',r'',i)
        clean_sentence = re.sub(r'[?|!|\'|"|#|/|-|,(|)|$|(0-9)]',r'',clean_sentence.strip(' '))
        if len(clean_sentence.strip()) > 1:
            clean_sentences.append(clean_sentence)
    done_sent = ''
    for i in clean_sentences:
        done_sent += (' '+i)
    return done_sent.strip()
    

data['cleaned'] = data[data.text.notnull()].apply(clean_text)

AttributeError: 'Series' object has no attribute 'split'

In [None]:
z = data[data.text.isna()]
len(z)
z

In [None]:
for i in z.url:
    print(i,'\n')
z

In [None]:
data[data.description.isna()].location.value_counts()

In [None]:
data.info()

In [None]:
data.cleaned[0]

In [None]:
def clean_text(data):
    sentences = data.split('.')
    clean_sentences = []
    for i in sentences:
        clean_sentence = re.sub(r'[?|!|\'|"|#|/|-|,(|)|$|-|' '|:]',r'',i)
        clean_sentence = re.sub(r'[?|!|\'|"|#|/|-|,(|)|$|-|' '|:]',r'',clean_sentence)
        clean_sentence = " ".join(re.findall("[(a-zA-Z,&)]+", clean_sentence))
        clean_sentences.append(clean_sentence)
    clean_text = ''
    for i in clean_sentences:
        clean_text += (' '+i)
    return clean_text.strip(' ')

    

data['cleaned'] = data.text.apply(clean_text)



In [None]:
data.cleaned.iloc[1]

### 4.1 Target
Before I can begin splitting the data I need to set the target for my methodology of training four seperate logistic regression models. I'm doing this because I'd like my classifications to be as accurate as possible, and also, by building my NLP strategy around a particular label, i.e. finding common words for that label as opposed to being generalized through the entire corpus\

I'm going to one-hot-encode the target feature so I can select each of the next columns as my y - one for each model.

In [None]:
data = data[['company','job_title','state','city','rating','cleaned','target','salary', 'dateposted','summary','url']]
data

In [None]:
ohe = OneHotEncoder(sparse=False, dtype='int')
targets = ohe.fit_transform(pd.DataFrame(data.target))
targets

In [None]:
targets = pd.DataFrame(targets,columns=['Q1','Q2','Q3','Q4','unk'])


In [None]:
data = data.join(targets)
data.head()

In [None]:
data.drop(['unk'], axis=1,inplace=True)

In [None]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence


data['comment_text'] = data['cleaned'].apply(stemming)

In [None]:
data.comment_text[0]

In [None]:
from nltk.stem import WordNetLemmatizer



lemmatizer = WordNetLemmatizer()
def stemming(sentence):
    LemSentence = ""
    for word in sentence.split():
        stem = lemmatizer.lemmatize(word)
        LemSentence += stem
        LemSentence += " "
    LemSentence = LemSentence.strip()
    return LemSentence


data['comment_text_lem'] = data['cleaned'].apply(stemming)
data.comment_text_lem[0]

In [None]:
data.drop(['cleaned','comment_text'], axis=1,inplace=True)

In [None]:
data

I need to make one model for each target. Before doing so, I'll need to use gridsearch to find the best hyperperamters.
Looping through each target, I set y to that specific single-column binary '(for target in targets target == 0 or 1)'
Then do a test train split before setting up the pipeline. The grid being use in this case is the selection of hyperperamters I want to check.


In [None]:
def grid_search_pipe():
    targets = ['Q1','Q2','Q3','Q4']
    X = data.drop(targets, axis=1)

    le_cols = ['company', 'job_title', 'state', 'city']
    scal_cols = ['rating']

    evaluations = {}

    
    for i in targets:
    # test/train split
        y = data[i]
        x_train, x_test, y_train, y_test = train_test_split(X, y , test_size=.2, random_state=42)


        preprocessor = ColumnTransformer(
            transformers=[
                ('text', TfidfVectorizer(ngram_range=(1,3), analyzer = 'word',max_features=1000,stop_words='english',decode_error='ignore'), 'comment_text_lem'),
                ('category', OneHotEncoder(handle_unknown ='ignore'), le_cols),
                ('scaler', StandardScaler(), scal_cols)
                
                ],remainder='drop')
        #   populating parameter grid to search
        grid = [
            {
            'classifier' : [LogisticRegression()],
            #'classifier__penalty' : ['l1', 'l2'],
            #'classifier__C' : np.logspace(-4, 4, 20),
            'classifier__solver' : ['liblinear']}
            ]

        pipe = Pipeline(
            steps=[
            ('preprocessor', preprocessor),
            ('classifier', LogisticRegression(grid)),
            ],
            )


        
        grid_search = GridSearchCV(pipe, param_grid=grid, verbose=2, return_train_score=True)
        grid_search.fit(x_train,y_train)

        print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
        print(grid_search.best_params_)


        if i not in evaluations.keys():
            evaluations[i] = {
                'grid_search':grid_search,
                'x_train':x_train,
                'y_train':y_train,
                'x_test':x_test,
                'y_test':y_test
                
            }


    return evaluations


evaluations = grid_search_pipe()

In [None]:
def make_model(evaluations):


    targets = ['Q1','Q2','Q3','Q4']
    X = data.drop(targets, axis=1)

    test_results = {}

    for i in targets:
        evaluation = evaluations[i]['grid_search']
        params = evaluation.best_params_


        y = data[i]
        x_train, x_test, y_train, y_test = train_test_split(X, y , test_size=.2, random_state=42)



        le_cols = ['company', 'job_title', 'state', 'city']
        scal_cols = ['rating']

        preprocessor = ColumnTransformer(
            transformers=[
                ('text', TfidfVectorizer(ngram_range=(1,3), analyzer = 'word',max_features=1000,stop_words='english',decode_error='ignore'), 'comment_text_lem'),
                ('category', OneHotEncoder(handle_unknown ='ignore'), le_cols),
                ('scaler', MinMaxScaler(), scal_cols)
                
                ],remainder='drop')


        pipe = Pipeline(
            steps=[
                ('preprocessor', preprocessor),
                ('classifier', params['classifier']),
            ],
        )


        pipe.fit(x_train,y_train)

        predictions = pipe.predict(x_test)
        score = pipe.score(x_train, y_train)


        ### check
        check = pd.DataFrame(preprocessor.fit_transform(x_train,y_train).toarray())


        if i not in test_results.keys():
            test_results[i] = pipe, x_train, y_train, x_test, y_test, predictions, score
                
    return test_results, check
        
test_results, check = make_model(evaluations)


In [None]:
def combinator(test_results):
    targets = ['Q1','Q2','Q3','Q4']

    resultsDB = {}
    lst = []

    
    for label in targets:
        data_dict = {}
        pipe = test_results[label][0]
        x_test = test_results[label][3]
        y_test = test_results[label][4]
        y_prob = pipe.predict_proba(x_test)
        predict_y = pipe.predict(x_test)
        data_dict = {
            f'{label}_y_test':y_test, 
             f'{label}_pred y': predict_y.tolist(),
             f'{label}_probabilities': y_prob.tolist()
             }
        zulu = x_test.join(pd.DataFrame(data_dict))
        if label not in resultsDB.keys():
            resultsDB[label] = zulu
        else:
            continue


        out = pd.DataFrame(data_dict)
        lst.append(out)

    results_table = x_test
    for i, label in zip(lst, targets):
        results_table = results_table.join(i, lsuffix=label)
    #results_table = results_table.join(lst[1], lsuffix='__')
    #for label, results in zip(targets,lst):
    #    results_table = results_table.join(results, lsuffix=label)
    return resultsDB, results_table, lst
   

data_dict, results_table, lst = combinator(test_results)

In [None]:
len(test_results['Q1'])

In [None]:
pipe = test_results['Q1'][0]
pipe

In [None]:
x_train = test_results['Q1'][1]
x_train

In [None]:
y_train = test_results['Q1'][2]
y_train

In [None]:
x_test = test_results['Q1'][3]
x_test

In [None]:
y_test = test_results['Q1'][4]
y_test

In [None]:
predictions =  test_results['Q1'][5]
predictions

In [None]:
score =  test_results['Q1'][6]
score

In [None]:
score = pipe.score(x_train, y_train)
score

In [None]:

predictions = pipe.predict(x_test)
predictions

In [None]:
print("Accuracy :",accuracy_score(y_test, predictions))

In [None]:

print("Hamming loss ",hamming_loss(y_test,predictions))

In [None]:

precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')

In [None]:
print("\nMicro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

In [None]:
precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')

In [None]:
print("\nMacro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

In [None]:
print("\nClassification Report")
print (classification_report(y_test, predictions))

In [None]:

pipe.score(x_train, y_train)

In [None]:
cf1 = confusion_matrix(y_test, predictions)

In [None]:
Q = ['Q1']
te_confusions = [cf1]
y_t = [y_test]
te_probs = [pipe.predict_proba(x_test)]
y_pred = [predictions]
y_test_preds = predictions
y_prob = [pipe.predict_proba(x_test)]

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions, target_names=['0', '1']))

In [None]:
import sklearn.metrics as metrics# calculate the fpr and tpr for all thresholds of the classification
probs = pipe.predict_proba(x_test)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([-.01, 1.])
plt.ylim([-.01, 1.05])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
for title, cf, y_t, y_pred, y_prob in zip(Q, te_confusions, y_test, y_test_preds, te_probs):
    make_confusion_matrix(cf, title='\n'+title+' Confusion Matrix\n')


In [None]:
results_table

In [None]:
def Q(row):
    """Subtracts the distance (in days) provided by the postD function.
    Args:
        row (record in the data): record

    Returns:
        datetime object: extract date - relative age = actual post date
    """
    #np.argmax(row[['Q1_Prob', 'Q2_Prob','Q3_Prob','Q4_Prob']].values)
    Q1 = row['Q1_probabilities'][1]
    Q2 = row['Q2_probabilities'][1]
    Q3 = row['Q3_probabilities'][1]
    Q4 = row['Q4_probabilities'][1]
    lst = [Q1,Q2,Q3,Q4]
    # adding 1 so these correspond to the target names rather than 0 indexed.
    return int(lst.index(max(lst))+1)
    #next, check if that 


results_table['Q'] = results_table.apply( lambda row : Q(row), axis = 1)
results_table


In [None]:

ohe = OneHotEncoder(sparse=False)
#results_table = results_table.join(pd.DataFrame(ohe.fit_transform(results_table[['Q']]), columns=['Q1','Q2','Q3','Q4']))


In [None]:
ohe = ohe.fit_transform(results_table[['Q']]).T

results_table['Q1'] = ohe[0]
results_table['Q2'] = ohe[1]
results_table['Q3'] = ohe[2]
results_table['Q4'] = ohe[3]

results_table

In [None]:
results_table.columns

In [None]:
#TODO CLean out all these nans Way upstream

In [None]:
results_table.to_csv(f'../app/data/tableau_table.csv', index=False)


In [None]:
results_table=results_table[results_table.salary.notna()]
results_table[results_table.Q != results_table.target]

In [None]:
results_table.info()

In [None]:
q1 = np.quantile(results_table.salary, 0.25)
q2 = np.quantile(results_table.salary, 0.5)
q3 = np.quantile(results_table.salary, 0.75)

# calc iqr
iqr = (q3 - q1)
# expand iqr to discern outliers
iqr_x = iqr*1.5

# setting the lower and upper limits
iqr_lower = q1-iqr_x
iqr_upper = q3+iqr_x


sns.displot(results_table.salary)
plt.axvline(x=q1, label="Q1", c = 'g')
plt.axvline(x=q2, label="Q2", c = '#fd4d3f')
plt.axvline(x=q3, label="Q3", c = 'r')

plt.axvline(x=iqr_lower, label = 'IQR Lower', c = 'black')
plt.axvline(x=iqr_upper, label = 'IQR Upper', c = 'black')
plt.xticks(rotation=30)
plt.legend()
plt.show()

In [None]:
results_table.Q.value_counts()

In [None]:
g = sns.catplot(x="Q", y="salary", kind="violin", inner=None, data=results_table)
sns.swarmplot(x="Q", y="salary", color="k", size=2, data=results_table, ax=g.ax)

plt.xticks(rotation=0)

plt.show()

In [None]:
city------------
colmax == 'q3_PROB', probably taken from col name
Company------------
Date Posted----------------
Description
Focus
Job title
JobUrl
Location
Q
Requirements
Role
Schedule
State
Summary
Probability
Q1 = 0 or 1 or null
Q1_posts = 1.0 or 0.0
Q1 pred = binary
Q1 probs

etc

Salary


top terms:
Feature = values == term
Q = class == 'Q1', etc
Imortance == some float

top states:
the same but Feature = states, capitalized

In [None]:

X_tfidf = tfidf.fit_transform(_test).toarray()
vocab = q1_vectorizer.vocabulary_
reverse_vocab = {v:k for k,v in vocab.items()}

feature_names = tfidf.get_feature_names()
df_tfidf = pd.DataFrame(X_tfidf, columns = feature_names)

idx = X_tfidf.argsort(axis=1)

tfidf_max10 = idx[:,-10:]

df_tfidf['top10'] = [[reverse_vocab.get(item) for item in row] for row in tfidf_max10 ]

df_tfidf['top10']
