In [1]:
# Import necessary libraries
import pickle
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [2]:
# Read the pickled training split from disk: the raw texts and their labels
# (presumably sentiment labels, judging by the "sent_" prefix -- confirm).
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point these paths at files you produced yourself.
with open("pickle_data/text_train.pickle", "rb") as text_handle:
    text_train = pickle.load(text_handle)

with open("pickle_data/sent_train.pickle", "rb") as label_handle:
    sent_train = pickle.load(label_handle)

In [3]:
# Preprocess the data: turn every raw training text into a lowercase string of
# space-separated word tokens. The test split must go through the exact same
# steps so that the fitted vectorizer sees comparable input.
corpus_train = []
for raw_review in text_train:
    # Drop the HTML double line breaks embedded in the texts
    cleaned = raw_review.replace("<br /><br />", " ")
    # Every non-word character (punctuation, whitespace, ...) becomes a space,
    # then the whole text is lower-cased
    cleaned = re.sub(r'\W', ' ', cleaned).lower()
    # Remove isolated single letters a-z surrounded by whitespace
    cleaned = re.sub(r'\s+[a-z]\s+', ' ', cleaned)
    # Remove a single letter at the very start of the text
    cleaned = re.sub(r'^[a-z]\s+', ' ', cleaned)
    # Collapse any run of whitespace into one space
    cleaned = re.sub(r'\s+', ' ', cleaned)
    corpus_train.append(cleaned)

In [4]:
# Sanity check: display the first cleaned training text to eyeball the result
# of the preprocessing steps (lower-casing, punctuation and single-letter removal)
corpus_train[0]

'zero day leads you to think even re think why two boys young men would do what they did commit mutual suicide via slaughtering their classmates it captures what must be beyond bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own mutual world via coupled destruction it is not perfect movie but given what money time the filmmaker and actors had it is remarkable product in terms of explaining the motives and actions of the two young suicide murderers it is better than elephant in terms of being film that gets under our rationalistic skin it is far far better film than almost anything you are likely to see flawed but honest with terrible honesty '

In [5]:
# Convert the cleaned strings to a TF-IDF feature matrix
english_stop_words = stopwords.words('english')  # needs the NLTK stopwords corpus to be available
vectorizer = TfidfVectorizer(
    stop_words=english_stop_words,
    max_features=5000,  # cap the vocabulary size at 5000 terms
    min_df=3,           # ignore terms appearing in fewer than 3 documents
    max_df=0.6,         # ignore terms appearing in more than 60% of documents
)

# Learn the vocabulary / IDF weights on the training corpus only, then densify
# the resulting matrix into a plain NumPy array.
train_tfidf_sparse = vectorizer.fit_transform(corpus_train)
corpus_train_tfidf = train_tfidf_sparse.toarray()

In [6]:
# Confirm the dimensions of the dense training TF-IDF matrix (rows = documents, columns = features)
corpus_train_tfidf.shape

(25000, 5000)

We will compare the performances of 5 baseline classification algorithms, which are:
* Logistic Regression
* Linear Discriminant Analysis
* Decision Tree
* Gaussian Naive Bayes
* Linear SVC

In [7]:
"""
Baseline algorithms
"""

# Parameters
n_folds = 10 # 10 folds cross validation
seed = 23 # Seed constant; not passed to KFold below because unshuffled folds are already deterministic

# Baseline algorithms, each with its default hyper-parameters
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', LinearSVC()),
]

# One splitter shared by every model so that all of them are scored on the
# same folds. random_state is deliberately NOT passed: it is only meaningful
# together with shuffle=True, and recent scikit-learn versions raise a
# ValueError for KFold(random_state=...) when shuffle is False. Without
# shuffling the folds are contiguous and identical on every run.
cv = KFold(n_splits=n_folds)

results = []
names = []
for name, model in models:
    # Array with one accuracy value per fold
    cv_results = cross_val_score(model, corpus_train_tfidf, sent_train, cv=cv, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    # Report mean accuracy and its standard deviation across the folds
    print(f"{name}: {cv_results.mean()} ({cv_results.std()})")

LR: 0.8842399999999999 (0.005797102724637549)
LDA: 0.85908 (0.006743411599479888)
CART: 0.71296 (0.006541437150963073)
NB: 0.8002800000000001 (0.009384327360018947)
SVM: 0.8746 (0.006470239562798274)


We can see that Logistic Regression and Linear SVC perform better than the other algorithms.

In [8]:
# Tuning the parameters of Logistic Regression
c_values = [0.01, 0.05, 0.25, 0.5, 1]  # candidate values for the regularisation parameter C
param_grid = dict(C=c_values)
model = LogisticRegression()
# random_state is intentionally omitted: KFold only accepts it together with
# shuffle=True, and recent scikit-learn versions raise a ValueError otherwise.
# Unshuffled folds are contiguous and deterministic, so nothing is lost.
cv = KFold(n_splits=n_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring="accuracy", cv=cv)
grid_result = grid.fit(corpus_train_tfidf, sent_train)
print(f"Best: {grid_result.best_score_} with {grid_result.best_params_}")

# Per-candidate summary: mean and standard deviation of the fold accuracies
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean} ({stdev}) with: {param}")

Best: 0.88424 with {'C': 1}
0.84288 (0.007392536777047502) with: {'C': 0.01}
0.85904 (0.006993025096480058) with: {'C': 0.05}
0.87636 (0.006353455752580638) with: {'C': 0.25}
0.8814 (0.005722237324683412) with: {'C': 0.5}
0.88424 (0.005797102724637549) with: {'C': 1}


In [9]:
# Tuning the parameter of Linear SVC
c_values = [0.01, 0.05, 0.25, 0.5, 1]  # candidate values for the regularisation parameter C
param_grid = dict(C=c_values)
model = LinearSVC()
# random_state is intentionally omitted: KFold only accepts it together with
# shuffle=True, and recent scikit-learn versions raise a ValueError otherwise.
# Unshuffled folds are contiguous and deterministic, so nothing is lost.
cv = KFold(n_splits=n_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring="accuracy", cv=cv)
grid_result = grid.fit(corpus_train_tfidf, sent_train)
print(f"Best: {grid_result.best_score_} with {grid_result.best_params_}")

# Per-candidate summary: mean and standard deviation of the fold accuracies
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean} ({stdev}) with: {param}")

Best: 0.88368 with {'C': 0.25}
0.86592 (0.006775957496915098) with: {'C': 0.01}
0.88252 (0.005297697613114576) with: {'C': 0.05}
0.88368 (0.003967064405829584) with: {'C': 0.25}
0.87888 (0.004094093306215664) with: {'C': 0.5}
0.8746 (0.006470239562798272) with: {'C': 1}


We can see that the best C value for Logistic Regression is 1 and the best C value for Linear SVC is 0.25.

In [10]:
# Final Logistic Regression model: refit on the complete training matrix with C=1.
# fit() returns the estimator itself, so the bound name refers to the fitted model.
lr_final = LogisticRegression(C=1).fit(corpus_train_tfidf, sent_train)
lr_final

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
# Final Linear SVC model: refit on the complete training matrix with C=0.25.
# fit() returns the estimator itself, so the bound name refers to the fitted model.
linear_svc_final = LinearSVC(C=0.25).fit(corpus_train_tfidf, sent_train)
linear_svc_final

LinearSVC(C=0.25, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

Next, we check the performance of the two models above on the test set.

In [12]:
# Read the pickled test split from disk: the raw texts and their labels
# (presumably sentiment labels, judging by the "sent_" prefix -- confirm).
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point these paths at files you produced yourself.
with open("pickle_data/text_test.pickle", "rb") as text_handle:
    text_test = pickle.load(text_handle)

with open("pickle_data/sent_test.pickle", "rb") as label_handle:
    sent_test = pickle.load(label_handle)

In [13]:
# Preprocess the data: apply to the test texts the same cleaning steps used
# for the training corpus, so the already-fitted vectorizer sees comparable input.
corpus_test = []
for raw_review in text_test:
    # Drop the HTML double line breaks embedded in the texts
    cleaned = raw_review.replace("<br /><br />", " ")
    # Every non-word character (punctuation, whitespace, ...) becomes a space,
    # then the whole text is lower-cased
    cleaned = re.sub(r'\W', ' ', cleaned).lower()
    # Remove isolated single letters a-z surrounded by whitespace
    cleaned = re.sub(r'\s+[a-z]\s+', ' ', cleaned)
    # Remove a single letter at the very start of the text
    cleaned = re.sub(r'^[a-z]\s+', ' ', cleaned)
    # Collapse any run of whitespace into one space
    cleaned = re.sub(r'\s+', ' ', cleaned)
    corpus_test.append(cleaned)

In [14]:
# Project the test corpus onto the already-fitted vectorizer (transform only,
# no re-fitting on test data) and densify the result to a NumPy array.
corpus_test_tfidf = vectorizer.transform(corpus_test).toarray()

In [15]:
# Confirm the dimensions of the dense test TF-IDF matrix (rows = documents, columns = features)
corpus_test_tfidf.shape

(25000, 5000)

In [16]:
# Score both final models on the held-out test matrix and report plain accuracy
lr_test_accuracy = accuracy_score(sent_test, lr_final.predict(corpus_test_tfidf))
print(f"Accuracy using Logistic Regression on test set: {lr_test_accuracy}")

svc_test_accuracy = accuracy_score(sent_test, linear_svc_final.predict(corpus_test_tfidf))
print(f"Accuracy using Linear SVC on test set: {svc_test_accuracy}")

Accuracy using Logistic Regression on test set: 0.88076
Accuracy using Linear SVC on test set: 0.8778


It seems that Logistic Regression performs slightly better.

In [17]:
# Persist the better-performing classifier (Logistic Regression) ...
with open("pickle_data/lr_classifier.pickle", "wb") as model_handle:
    pickle.dump(lr_final, model_handle)

# ... together with the fitted TF-IDF vectorizer, which is needed to turn new
# text into the feature space the classifier was trained on.
with open("pickle_data/tfidf_vectorizer.pickle", "wb") as vectorizer_handle:
    pickle.dump(vectorizer, vectorizer_handle)

In the future, we can also investigate whether ensemble methods give even better results. Two kinds of ensemble methods can be investigated:
* Boosting Methods: AdaBoost (AB) and Gradient Boosting (GBM)
* Bagging Methods: Random Forests (RF) and Extra Trees (ET)