In [None]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, LabelEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config
set_config(display='diagram')


from sklearn.linear_model import LogisticRegression

import re

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (8, 8)
plt.rcParams['font.size'] = 17



import warnings
warnings.filterwarnings("ignore")
sns.set(style="ticks", color_codes=True)
%matplotlib inline

In [None]:
data = pd.read_csv(f'../app/data/processed_data.csv')


In [None]:
data.text[0]

In [None]:
def cleanPunc(sentence):
    """Remove or blank out punctuation and special characters in ``sentence``.

    The characters ``? | ! ' " #`` are deleted outright, while each of
    ``. | , ) ( /`` is turned into a single space. Surrounding whitespace
    is then trimmed and any remaining newline becomes a space.
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    spaced_out = re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
    return spaced_out.strip().replace("\n", " ")


In [None]:
def clean_text(data):
    """Strip punctuation and digits from ``data``, one '.'-delimited fragment at a time.

    Each fragment has a fixed set of punctuation characters deleted, is
    trimmed of surrounding spaces, and then has the same punctuation plus
    digits and parentheses deleted. Fragments that are at most one
    character long after trimming are dropped. The kept fragments are
    joined with a space and the result is stripped.
    """
    kept = []
    for fragment in data.split('.'):
        no_punct = re.sub(r'[?|!|\'|"|#|/|-|,(|)|$|-|' '|:]', r'', fragment)
        no_digits = re.sub(r'[?|!|\'|"|#|/|-|,(|)|$|(0-9)]', r'', no_punct.strip(' '))
        if len(no_digits.strip()) <= 1:
            continue
        kept.append(no_digits)
    return ' '.join(kept).strip()
    

# Apply the cleaner to every row's raw text and store it in a new column.
# NOTE(review): `clean_text` is defined a second time further down and `cleaned`
# is reassigned there, so this result is only what the next cell displays.
data['cleaned'] = data.text.apply(clean_text)

In [None]:
# Inspect the cleaned text of the row labelled 0.
data.cleaned[0]

In [None]:
def clean_text(data):
    """Reduce a raw text blob to space-separated alphabetic tokens.

    The text is split on ``.``. Within each fragment the characters
    ``? | ! ' " # / , ( ) $ :`` are deleted (joining whatever stood on
    either side of them), and then only runs of ASCII letters and ``&``
    are kept. Everything else (digits, hyphens, whitespace, ...) acts as
    a token separator.

    Parameters
    ----------
    data : str
        Raw text. Any non-string value (e.g. NaN for a missing cell)
        yields an empty string instead of raising.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    if not isinstance(data, str):
        return ''

    tokens = []
    for fragment in data.split('.'):
        # A single pass suffices: deleting these characters is idempotent.
        stripped = re.sub(r"[?|!'\"#/,()$:]", '', fragment)
        # Collect tokens directly so empty fragments cannot add stray spaces.
        tokens.extend(re.findall(r"[a-zA-Z&]+", stripped))
    return ' '.join(tokens)

    

# Recompute the `cleaned` column with the clean_text defined in this cell.
data['cleaned'] = data.text.apply(clean_text)



In [None]:
# Inspect the cleaned text of the row labelled 3.
data.cleaned[3]

### 4.1 Target
Before I can begin splitting the data, I need to set the target for my methodology of training four separate logistic regression models. I'm doing this because I'd like my classifications to be as accurate as possible, and because building my NLP strategy around a particular label lets me find the words most common for that label, as opposed to generalizing across the entire corpus.

I'm going to one-hot-encode the target feature so I can select each of the resulting columns as my y - one for each model.

In [None]:
data = data[['company','job_title','state','city','rating','cleaned','target']]

In [None]:
ohe = OneHotEncoder(sparse=False, dtype='int')
targets = ohe.fit_transform(pd.DataFrame(data.target))


In [None]:
targets = pd.DataFrame(targets,columns=['Q1','Q2','Q3','Q4','unk'])


In [None]:
data = data.join(targets)
data.head()

In [None]:
data.drop(['target','unk'], axis=1,inplace=True)

In [None]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence


data['comment_text'] = data['cleaned'].apply(stemming)

In [None]:
data.comment_text[0]

In [None]:
from nltk.stem import WordNetLemmatizer



lemmatizer = WordNetLemmatizer()
def lemmatize_sentence(sentence):
    """Return ``sentence`` with each whitespace-separated word replaced by its WordNet lemma.

    This used to be a second function called ``stemming``, which silently
    replaced the Snowball-based ``stemming`` defined earlier. Giving it its
    own name keeps both helpers available and makes the cells safe to re-run
    in any order.
    """
    return " ".join(lemmatizer.lemmatize(word) for word in sentence.split())


data['comment_text_lem'] = data['cleaned'].apply(lemmatize_sentence)
data.comment_text_lem[0]

In [None]:
data.drop(['cleaned','comment_text'], axis=1,inplace=True)

In [None]:
data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
q1_data = data.drop(['Q2','Q3','Q4'], axis=1)
q2_data = data.drop(['Q1','Q3','Q4'], axis=1)
q3_data = data.drop(['Q1','Q2','Q4'], axis=1)
q4_data = data.drop(['Q1','Q2','Q3'], axis=1)

In [None]:
data

In [None]:
# Column roles for the ColumnTransformer.
text_col = 'comment_text_lem'
# The free-text column is object-typed too. It is vectorised by TF-IDF below and
# must not also be one-hot encoded (that would add one indicator column per
# distinct document), so it is excluded from the categorical columns.
le_cols = [col for col in data.select_dtypes('object').columns if col != text_col]
scal_cols = ['rating']


preprocessor = ColumnTransformer(
    transformers=[
        # Up to 1000 uni-/bi-/tri-gram TF-IDF features from the lemmatized text.
        ('text', TfidfVectorizer(ngram_range=(1,3), analyzer = 'word',max_features=1000,stop_words='english',decode_error='ignore'), text_col),
        # Categories unseen during fit are encoded as all zeros.
        ('category', OneHotEncoder(handle_unknown ='ignore'), le_cols),
        ('scaler', StandardScaler(), scal_cols),
    ])


# Preprocessing followed by a default logistic regression.
pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ],
)

In [None]:
def multi_pipe():
    targets = ['Q1','Q2','Q3','Q4']
    X = data.drop(targets, axis=1)

    evaluations = {}
    for i in targets:
        y = data[i]
        x_train, x_test, y_train, y_test = train_test_split(X, y , test_size=.2, random_state=42)

        grid = [
        {
        'classifier' : [LogisticRegression()],
        'classifier__penalty' : ['l1', 'l2'],
        #'classifier__C' : np.logspace(-4, 4, 20),
        'classifier__solver' : ['liblinear']}
]

        
        grid_search = GridSearchCV(pipe, param_grid=grid, verbose=2, return_train_score=True,n_jobs=-1)
        grid_search.fit(x_train,y_train)


        if i not in evaluations.keys():
            evaluations[i] = {
                'grid_search':grid_search,
                'x_train':x_train,
                'y_train':y_train,
                'x_test':x_test,
                'y_train':y_train
                
            }


    return evaluations


evaluations = multi_pipe()

In [None]:
evaluations.keys()

In [None]:
evaluations['Q1'].keys()

In [None]:
pd.DataFrame(evaluations['Q3']['y_train'])

In [None]:
# Features and label for the Q1 model, split 80/20 with a fixed seed.
y = q1_data['Q1']
X = q1_data.drop(columns=['Q1'])

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)


In [None]:
le_cols = x_train.select_dtypes('object').columns
scal_cols = ['rating']



preprocessor = ColumnTransformer(
     transformers=[
         ('text', TfidfVectorizer(ngram_range=(1,3), analyzer = 'word',max_features=1000,stop_words='english',decode_error='ignore'), 'comment_text_lem'),
         ('category', OneHotEncoder(handle_unknown ='ignore'), le_cols),
         ('scaler', StandardScaler(), scal_cols)
         
         ])




pipe = Pipeline(
     steps=[
         ('preprocessor', preprocessor),
         ('classifier', LogisticRegression()),
     ],
 )

In [None]:
pipe.fit(x_train,y_train)

In [None]:
# Search space for the Q1 model: both penalties crossed with 20 log-spaced
# values of C between 1e-4 and 1e4, all using the liblinear solver.
grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['liblinear']}
]


In [None]:

# Cross-validated grid search over the pipeline, run on all cores; train scores
# are recorded too.
grid_search = GridSearchCV(pipe, param_grid=grid, verbose=2, return_train_score=True,n_jobs=-1)
grid_search.fit(x_train,y_train)

# NOTE(review): the figure below is unexplained - possibly a recorded run time; confirm or remove.
#1.26.6


In [None]:
# Best hyper-parameter combination found by the search.
grid_search.best_params_

In [None]:
# Mean cross-validated score of that combination.
grid_search.best_score_

In [None]:
# Score on the training split.
grid_search.score(x_train, y_train)

In [None]:
# Score on the held-out split.
grid_search.score(x_test, y_test)

In [None]:
grid_search.predict(x_test)

In [None]:
print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)

In [None]:

# Render the pipeline diagram.
pipe

In [None]:
from sklearn.metrics import accuracy_score, hamming_loss,precision_score,recall_score,f1_score,classification_report
# NOTE(review): these predictions come from `pipe` (default LogisticRegression),
# not from `grid_search` - confirm which model the metrics below are meant to describe.
predictions = pipe.predict(x_test)

In [None]:
print("Accuracy :",accuracy_score(y_test, predictions))

In [None]:

print("Hamming loss ",hamming_loss(y_test,predictions))

In [None]:
# Micro-averaged scores: computed from the counts pooled over both classes.
precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')

In [None]:
print("\nMicro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

In [None]:
# Macro-averaged scores: unweighted mean of the per-class scores.
# The names precision / recall / f1 are reused and now hold the macro values.
precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')

In [None]:
print("\nMacro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

In [None]:
print("\nClassification Report")
print (classification_report(y_test, predictions))

In [None]:
predictions

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, log_loss, roc_auc_score, roc_curve
from confusion import make_confusion_matrix

In [None]:
# Confusion matrix of the Q1 pipeline on the held-out split.
cf1 = confusion_matrix(y_test, predictions)

In [None]:
# Single-element lists: one entry per label (only Q1 here).
Q = ['Q1']
te_confusions = [cf1]
y_t = [y_test]
te_probs = [pipe.predict_proba(x_test)]
y_pred = [predictions]
y_test_preds = predictions
# Same call as te_probs above, evaluated a second time.
y_prob = [pipe.predict_proba(x_test)]

In [None]:
from sklearn.metrics import classification_report
# Same report as before, with explicit class names.
print(classification_report(y_test, predictions, target_names=['0', '1']))

In [None]:
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# Calculate the fpr and tpr for all thresholds of the classification,
# scoring each test row with column 1 of predict_proba.
probs = pipe.predict_proba(x_test)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve with the chance diagonal for reference.
fig, ax = plt.subplots()
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([-.01, 1.])
ax.set_ylim([-.01, 1.05])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

In [None]:
for title, cf, y_t, y_pred, y_prob in zip(Q, te_confusions, y_test, y_test_preds, te_probs):
    make_confusion_matrix(cf, title='\n'+title+' Confusion Matrix\n')


In [None]:
pipe.classes_

In [None]:
predict_y = pipe.predict(x_test)
data_dict = {"y":y_test, 
             "pred y": predict_y.tolist(),
             "probabilities": y_prob.tolist()}
pd.DataFrame(data_dict).tail(10)
pd.DataFrame(data_dict)

In [None]:
zulu = x_test.join(pd.DataFrame(data_dict))
zulu

In [None]:
x_test

In [None]:

# Top-weighted TF-IDF terms per test document.
# The original cell referenced names that are never defined in this notebook
# (`tfidf`, `_test`, `q1_vectorizer`). Reuse the vectorizer fitted inside `pipe`
# instead, so the weights match what the classifier saw.
# NOTE(review): assumes the test split's lemmatized text was the intended input - confirm.
tfidf = pipe.named_steps['preprocessor'].named_transformers_['text']
X_tfidf = tfidf.transform(x_test['comment_text_lem']).toarray()

vocab = tfidf.vocabulary_
reverse_vocab = {v:k for k,v in vocab.items()}

# Column i of X_tfidf belongs to the term whose vocabulary index is i. Deriving
# the names from the vocabulary avoids the version-dependent get_feature_names API.
feature_names = [reverse_vocab[i] for i in range(len(reverse_vocab))]
df_tfidf = pd.DataFrame(X_tfidf, columns = feature_names)

# argsort is ascending, so the last ten column indices per row are the ten
# highest-weighted terms (listed from lowest to highest weight).
idx = X_tfidf.argsort(axis=1)

tfidf_max10 = idx[:,-10:]

df_tfidf['top10'] = [[reverse_vocab.get(item) for item in row] for row in tfidf_max10 ]

df_tfidf['top10']
