In [38]:
import pandas as pd
import numpy as np
import nltk
import string



from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from joblib import load, dump

# Load Data and Replace values with 1 and 0

In [2]:
# Read the labelled training examples from the CSV next to the notebook
train_df = pd.read_csv(filepath_or_buffer="training_data.csv")

In [3]:
# Encode the sentiment labels as integers: 'Positive' -> 1, 'Negative' -> 0.
# An explicit map + astype is used instead of replace() so the result does not
# depend on replace() silently downcasting an object column (deprecated in
# pandas 2.2), which could leave the labels with object dtype.
# The identity entries (1 -> 1, 0 -> 0) keep the cell safe to re-run on an
# already-encoded column; any other value becomes NaN and makes astype raise,
# so unexpected labels are reported here instead of surfacing during training.
train_df['Sentiment'] = (
    train_df['Sentiment']
    .map({'Positive': 1, 'Negative': 0, 1: 1, 0: 0})
    .astype('int64')
)

In [4]:
# Model inputs are the "Text" column, targets are the "Sentiment" column
x_train, y_train = train_df["Text"], train_df['Sentiment']

# `corpus` is just another name for the training texts (no copy is made)
corpus = x_train

# Fit Tf-Idf Vectorizer

In [5]:
# TF-IDF vectorizer with default settings, fitted on the training texts.
# NOTE(review): the grid search below presumably re-fits its own clones of this
# vectorizer, so this standalone fit may be redundant — confirm before removing.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(raw_documents=corpus)

# Perform Gridsearch

In [12]:
# Classifier used as the final pipeline step
log_model = LogisticRegression(max_iter=1000, warm_start=True, verbose=0)

# Pipeline: TF-IDF features first, logistic regression second
model = Pipeline(steps=[
    ('Tfidf', tfidf_vectorizer),
    ("log_reg", log_model),
])

# Search space; each key is "<pipeline step name>__<parameter name>".
# 4 * 3 * 3 * 3 * 4 = 432 combinations in total.
params = {
    'log_reg__max_iter': (200, 500, 1000, 2000),
    'Tfidf__max_df': (0.5, 0.75, 1.0),
    'Tfidf__min_df': (1, 2, 5),
    'Tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'Tfidf__max_features': (None, 5000, 10000, 20000),
}

# Exhaustive search with 5-fold cross-validation, run in a single process
grid_search = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=5,
    verbose=1,
    n_jobs=1,
)
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


In [None]:
'''
Cell for saving the model. Only run it if the grid search was actually performed.
Otherwise, just load the saved model as shown below.
'''

# best_model = grid_search.best_estimator_
# dump(best_model, 'LR_GridSearchCV.joblib')

# Load Model

In [18]:
# Restore the saved estimator from disk, e.g. after a kernel reset wiped it from memory.
# NOTE(review): joblib files are pickle-based — only load files from a trusted source.
# NOTE(review): presumably this file holds the `best_estimator_` pipeline written by the
# (commented-out) save cell, so `grid_search` is no longer a GridSearchCV here — confirm.
grid_search = load(filename="LR_GridSearchCV.joblib")

# Training Accuracy

In [7]:
# Predict on the same texts the model was trained on and measure the accuracy
y_grid_train = grid_search.predict(x_train)
train_grid_acc = accuracy_score(y_true=y_train, y_pred=y_grid_train)

# Last expression of the cell, so the value is displayed as its output
train_grid_acc

0.9735909644557634

# Testing Metrics

In [19]:
# Read the held-out test examples from the CSV next to the notebook
test_df = pd.read_csv(filepath_or_buffer="testing_data.csv")

In [20]:
# Switch the sentiment labels to integers: 'Positive' -> 1, 'Negative' -> 0.
# An explicit map + astype is used instead of replace() so the result does not
# depend on replace() silently downcasting an object column (deprecated in
# pandas 2.2), which could leave the labels with object dtype.
# The identity entries (1 -> 1, 0 -> 0) keep the cell safe to re-run on an
# already-encoded column; any other value becomes NaN and makes astype raise,
# so unexpected labels are reported here instead of skewing the metrics.
test_df['Sentiment'] = (
    test_df['Sentiment']
    .map({'Positive': 1, 'Negative': 0, 1: 1, 0: 0})
    .astype('int64')
)

In [21]:
# Split the test frame into texts and labels
x_test, y_test = test_df["Text"], test_df["Sentiment"]

# Predictions of `grid_search` for every test text
y_pred_grid = grid_search.predict(x_test)

In [32]:
# Testing metrics: compute every score from the test predictions first,
# then print the whole report in one go.

grid_test_acc = accuracy_score(y_true=y_test, y_pred=y_pred_grid)      # accuracy
grid_test_prec = precision_score(y_true=y_test, y_pred=y_pred_grid)    # precision
grid_test_recall = recall_score(y_true=y_test, y_pred=y_pred_grid)     # recall
grid_test_f1 = f1_score(y_true=y_test, y_pred=y_pred_grid)             # F1
grid_test_cm = confusion_matrix(y_true=y_test, y_pred=y_pred_grid)     # confusion matrix

# One line per metric, followed by the confusion matrix under its own heading
print(
    f"Testing accuracy: {grid_test_acc}",
    f"Precision: {grid_test_prec}",
    f"Recall: {grid_test_recall}",
    f"F1: {grid_test_f1}",
    "Confusion Matrix:",
    grid_test_cm,
    sep="\n",
)

Testing accuracy: 0.9758812615955473
Precision: 0.9888059701492538
Recall: 0.9636363636363636
F1: 0.9760589318600369
Confusion Matrix:
[[261   3]
 [ 10 265]]


In [29]:
sentence = "" # insert a sentence to test here, then run this cell and the one below

In [30]:
# predict() takes a collection of texts, so the single sentence is wrapped in a list
ex = grid_search.predict([sentence])

# Label 1 is reported as positive, anything else as negative
sentiment = "Positive :)" if ex[0] == 1 else "Negative :("

print(sentiment)
print(sentence)

In [None]:
# Print every parameter reported by the model's get_params(), one "name : value" per line
log_reg_params = grid_search.get_params()
for param, setting in log_reg_params.items():
    print(param, ":", setting)