In [4]:
# in this file we are goona work on the text data, extract features from text and build models
# Now our data cleaning has almost been done. It's time to extract more features :
# 1. n-grams
# 2. tf-idf
# 3. bag of words


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# let's do some moedelling
from sklearn.model_selection import train_test_split, cross_val_score

# packages for metric for evalaution of the models
from sklearn import metrics
from sklearn.metrics import classification_report , accuracy_score, confusion_matrix


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# As tf–idf is very often used for text features, there is also another class called TfidfVectorizer that combines 
# all the options of CountVectorizer and TfidfTransformer in a single model.
# so we can calculate tf-idf and the CountVectorizer in one go with below module
from sklearn.feature_extraction.text import TfidfVectorizer

# use multiple models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC

import seaborn as sns

# to save the models so that we don't need to train the models every time we need to do prediction
from sklearn.externals import joblib




In [5]:
df = pd.read_csv(r'C:\Users\Adarsh\uOttawa\ML Course Project-7Apr\datasets\4a-after-text-cleaning.csv')

In [6]:
df.head(5)

Unnamed: 0,feedback,job_status,feedback_word_count,avg_word_len,sentiment_score,sentiment
0,,0,88,4.534091,0.124329,1
1,,0,104,4.336538,0.448133,1
2,,0,151,4.516556,0.37381,1
3,,0,153,4.895425,0.199087,1
4,,0,27,6.37037,0.48,1


In [6]:
# calculating tf-idf using scikitlearn
# sublinear_df  - is set to True to use a logarithmic form for frequency.
# min_df - is the minimum numbers of documents a word must be present in to be kept.
# norm - is set to l2, to ensure all our feature vectors have a euclidian norm of 1.
# ngram_range -  is set to (1, 2) to indicate that we want to consider both unigrams and bigrams.
# stop_words - is set to "english" to remove all common pronouns ("a", "the", ...) to reduce the number of noisy features.


tfidf = TfidfVectorizer(sublinear_tf=True, min_df=50, norm='l2', encoding='latin-1', ngram_range=(1, 3))

features = tfidf.fit_transform(df.feedback.values.astype('str')).toarray()
labels = df.job_status

In [7]:
features.shape

(40000, 1)

In [8]:
features

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]])

In [9]:
# convert features ndarray into pandas dataframe so that it can be merged with 
#other features like avg_word_count and avg_word_len
features_df = pd.DataFrame(features)

In [10]:
features_df.tail(5)

Unnamed: 0,0
39995,1.0
39996,1.0
39997,1.0
39998,1.0
39999,1.0


In [11]:
df_word = df[['feedback_word_count' , 'avg_word_len']]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df_word, labels, random_state = 0)

In [13]:
# let's try how ensemling ( Random forest wit 10 decision trees ) performs 
random_forest = RandomForestClassifier(n_estimators=10)
random_forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
print("Accuracy of the random forest with 10 DTs, with only 2 features considered - word count and world length")
print("Accuracy = ")
print(random_forest.score(X_test, y_test))
print("\n")
y_pred_rf = random_forest.predict(X_test)
print(classification_report(y_test, y_pred_rf))

Accuracy of the random forest with 10 DTs, with only 2 features considered - word count and world length
Accuracy = 
0.596


              precision    recall  f1-score   support

           0       0.60      0.60      0.60      5014
           1       0.60      0.59      0.59      4986

   micro avg       0.60      0.60      0.60     10000
   macro avg       0.60      0.60      0.60     10000
weighted avg       0.60      0.60      0.60     10000



In [15]:
df_word.shape

(40000, 2)

In [16]:
type(df_word)

pandas.core.frame.DataFrame

In [17]:
df_word.head(5)

Unnamed: 0,feedback_word_count,avg_word_len
0,88,4.534091
1,104,4.336538
2,151,4.516556
3,153,4.895425
4,27,6.37037


In [18]:
final_features = pd.concat([features_df, df_word], axis=1)

In [19]:
final_features.shape

(40000, 3)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(final_features, labels, random_state = 0)

In [21]:
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# define cross validation
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, final_features, labels, scoring='accuracy', cv=CV)
    
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
cv_df.groupby('model_name').accuracy.mean()




model_name
LinearSVC                 0.497875
LogisticRegression        0.511650
MultinomialNB             0.511975
RandomForestClassifier    0.511250
Name: accuracy, dtype: float64

In [22]:
cv_df

Unnamed: 0,model_name,fold_idx,accuracy
0,RandomForestClassifier,0,0.50375
1,RandomForestClassifier,1,0.51575
2,RandomForestClassifier,2,0.513375
3,RandomForestClassifier,3,0.5185
4,RandomForestClassifier,4,0.504875
5,LinearSVC,0,0.496875
6,LinearSVC,1,0.5
7,LinearSVC,2,0.4925
8,LinearSVC,3,0.5
9,LinearSVC,4,0.5


In [23]:
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(final_features, labels, 
                                                                                 df.index, test_size=0.25, random_state=0)


In [24]:
# As LogisticRegression() works better than other let's look deeper
lr = LogisticRegression()
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [25]:
y_pred = lr.predict(X_test)

#conf_mat = confusion_matrix(y_test, y_pred)


print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.69      0.59      5014
           1       0.52      0.34      0.41      4986

   micro avg       0.51      0.51      0.51     10000
   macro avg       0.51      0.51      0.50     10000
weighted avg       0.51      0.51      0.50     10000



In [26]:
# save the logistic regression model
filepath = r'C:\Users\Adarsh\uOttawa\ML Course Project-7Apr\saved_models\text-data\LogisticRegression_model'
joblib.dump(lr ,filepath )


['C:\\Users\\Adarsh\\uOttawa\\ML Course Project-7Apr\\saved_models\\text-data\\LogisticRegression_model']