# ENGR 891: Programming Assignment #3
## Part B: 
### Pre-processing

In [1]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer 

from sklearn.pipeline import Pipeline

from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups

[nltk_data] Downloading package wordnet to /Users/jing/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Get train and test data separately

In [2]:
# Restrict the 20 Newsgroups corpus to four topics and load the predefined
# train/test splits; shuffling is seeded so document order is reproducible.
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

train_data, test_data = (
    fetch_20newsgroups(subset=split, categories=categories, shuffle=True, random_state=42)
    for split in ('train', 'test')
)

# .data holds the raw documents, .target the integer class labels.
X_train, y_train = train_data.data, train_data.target
X_test, y_test = test_data.data, test_data.target

### Lemmatize train and test data separately

In [3]:
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Lower-case, tokenize and lemmatize one document.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The document's lemmatized tokens joined by single spaces.
    """
    # The text is lower-cased once up front, so the tokens need no second
    # .lower() call.
    tokens = nltk.word_tokenize(text.lower())
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


# Read from the raw datasets rather than from X_train / X_test themselves so
# that re-running this cell never re-processes already-lemmatized text.
X_train = [lemmatize_text(doc) for doc in train_data.data]
X_test = [lemmatize_text(doc) for doc in test_data.data]



### Check if words are lemmatized (data list)

In [4]:
# Sanity check: number of training documents, then the first lemmatized one.
print(len(X_train), X_train[0], sep='\n')

2257
from : sd345 @ city.ac.uk ( michael collier ) subject : converting image to hp laserjet iii ? nntp-posting-host : hampton organization : the city university line : 14 doe anyone know of a good way ( standard pc application/pd utility ) to convert tif/img/tga file into laserjet iii format . we would also like to do the same , converting to hpgl ( hp plotter ) file . please email any response . is this the correct group ? thanks in advance . michael . -- michael collier ( programmer ) the computer unit , email : m.p.collier @ uk.ac.city the city university , tel : 071 477-8000 x3769 london , fax : 071 477-8565 ec1v 0hb .


## Experiment 6: Multinomial NB

### Building a Pipeline for Hyperparameter Tuning & Feature Vectorization

In [5]:
# Unfitted template pipeline for the grid search:
# raw text -> token counts -> Multinomial Naive Bayes.
text_clf_multinomialNB = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

### Model selection: Hyperparameter tuning

In [6]:
%%time
param_grid = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__stop_words': ['english', None],
    'clf__alpha': [0.0001, 0.001, 0.1, 1.0, 1.5, 2.0],
}


clf_multinomial_cv = GridSearchCV(text_clf_multinomialNB, param_grid, scoring='f1_micro', cv=5, verbose=1, n_jobs=-1)

clf_multinomial_cv = clf_multinomial_cv.fit(X_train, y_train)


params_optimal_clf_multinomial = clf_multinomial_cv.best_params_

print("\nBest Score: %f" % clf_multinomial_cv.best_score_)
for param_name in sorted(param_grid.keys()):
    print("%s: %r" % (param_name, params_optimal_clf_multinomial[param_name]))
    
print("\n")


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   30.3s finished



Best Score: 0.981391
clf__alpha: 0.001
vect__ngram_range: (1, 2)
vect__stop_words: 'english'


CPU times: user 3.75 s, sys: 614 ms, total: 4.36 s
Wall time: 31.8 s


### Train the model

In [7]:
# Build the final model from the grid-search result instead of hand-copying
# the printed values, so it cannot drift out of sync if the search is re-run
# with different data or a different grid.
multinomialNB_clf = Pipeline([
        ('vect', CountVectorizer(
            stop_words=params_optimal_clf_multinomial['vect__stop_words'],
            ngram_range=params_optimal_clf_multinomial['vect__ngram_range'],
            binary=False,
        )),
        ('clf', MultinomialNB(alpha=params_optimal_clf_multinomial['clf__alpha'])),
    ])

multinomialNB_clf.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('clf', MultinomialNB(alpha=0.001))])

### Evaluate the model on test data

In [12]:
# Predict on the test set once and derive every test metric from these
# predictions (same approach as the SVM evaluation cells); calling
# .score(X_test, ...) as well would vectorize and classify the test set twice.
y_test_predicted = multinomialNB_clf.predict(X_test)

print("Training Accuracy: ", multinomialNB_clf.score(X_train, y_train))

print("\nTest Accuracy: ", np.mean(y_test_predicted == y_test))

print("\nTest Confusion Matrix:")
print(confusion_matrix(y_test, y_test_predicted))

print("\nClassification Report:")
print(classification_report(y_test, y_test_predicted))

Training Accuracy:  1.0

Test Accuracy:  0.9387483355525965

Test Confusion Matrix:
[[296   4   5  14]
 [  5 367  15   2]
 [  3  17 363  13]
 [  5   2   7 384]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.93      0.94       319
           1       0.94      0.94      0.94       389
           2       0.93      0.92      0.92       396
           3       0.93      0.96      0.95       398

    accuracy                           0.94      1502
   macro avg       0.94      0.94      0.94      1502
weighted avg       0.94      0.94      0.94      1502



## Experiment 7: Support Vector Machine (LinearSVC)
### Building a Pipeline for Hyperparameter Tuning & Feature Vectorization

In [13]:
# Unfitted template pipeline for the grid search:
# raw text -> token counts -> TF-IDF weights -> linear SVM with hinge loss.
svm_linearsvc = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(loss='hinge', random_state=42)),
])

### Hyperparameter Tuning for LinearSVC Model

In [14]:
%%time
param_grid = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__stop_words': ['english', None],
    'clf__C': [0.1, 1, 5, 10],
}

svm_linearsvc_cv = GridSearchCV(svm_linearsvc, param_grid, scoring='accuracy', cv=5)

svm_linearsvc_cv = svm_linearsvc_cv.fit(X_train, y_train)


print("\nBest Score: %f" % svm_linearsvc_cv.best_score_)

print("\nOptimal Hyperparameter Values: ")

for param_name in sorted(param_grid.keys()):
    print("%s: %r" % (param_name, svm_linearsvc_cv.best_params_[param_name]))


Best Score: 0.977403

Optimal Hyperparameter Values: 
clf__C: 5
vect__ngram_range: (1, 2)
vect__stop_words: 'english'
CPU times: user 2min 6s, sys: 2.81 s, total: 2min 9s
Wall time: 2min 10s


### Train the Optimal LinearSVC Model 

In [15]:
%%time
svm_linearsvc = Pipeline([
        ('vect', CountVectorizer(stop_words= 'english', ngram_range=(1, 2), binary=False)),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC(loss='hinge', C=5, random_state=42)),
    ])

svm_linearsvc.fit(X_train, y_train)

CPU times: user 2.64 s, sys: 54.7 ms, total: 2.69 s
Wall time: 2.7 s


Pipeline(steps=[('vect',
                 CountVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('tfidf', TfidfTransformer()),
                ('clf', LinearSVC(C=5, loss='hinge', random_state=42))])

### Evaluate LinearSVC Model on Test Data

In [16]:
%%time

y_test_predicted = svm_linearsvc.predict(X_test)

print("Training Accuracy: ", svm_linearsvc.score(X_train, y_train))

print("\nTest Accuracy: ", np.mean(y_test_predicted == y_test))

print("\nTest Confusion Matrix:")
print(confusion_matrix(y_test, y_test_predicted))

precision_test = precision_score(y_test, y_test_predicted, average='micro') 
print("\nTest Precision = %f" % precision_test)

recall_test = recall_score(y_test, y_test_predicted, average='micro')
print("\nTest Recall = %f" % recall_test)

f1_test = f1_score(y_test, y_test_predicted, average='micro')
print("\nTest F1 Score = %f" % f1_test)

print("\nClassification Report:")
print(classification_report(y_test, y_test_predicted))

Training Accuracy:  1.0

Test Accuracy:  0.9320905459387483

Test Confusion Matrix:
[[275   8   8  28]
 [  4 377   6   2]
 [  6  24 364   2]
 [  4   7   3 384]]

Test Precision = 0.932091

Test Recall = 0.932091

Test F1 Score = 0.932091

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.86      0.90       319
           1       0.91      0.97      0.94       389
           2       0.96      0.92      0.94       396
           3       0.92      0.96      0.94       398

    accuracy                           0.93      1502
   macro avg       0.93      0.93      0.93      1502
weighted avg       0.93      0.93      0.93      1502

CPU times: user 1.37 s, sys: 20.9 ms, total: 1.39 s
Wall time: 1.39 s


## Experiment 8: Support Vector Machine (SVC with RBF Kernel)
### Building a Pipeline for Hyperparameter Tuning & Feature Vectorization

In [17]:
# Unfitted template pipeline for the grid search:
# raw text -> token counts -> TF-IDF weights -> SVM with RBF kernel.
svm_svc_rbf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC(kernel='rbf')),
])

### Hyperparameter Tuning for SVC (RBF Kernel) Model

In [19]:
%%time
param_grid = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__stop_words': ['english', None],
    'clf__C': [0.1, 1, 10, 100],
    'clf__gamma': [0.1, 0.01, 0.001],
}

svm_svc_rbf_cv = GridSearchCV(svm_svc_rbf, param_grid, scoring='accuracy', cv=5, n_jobs=-1)

svm_svc_rbf_cv = svm_svc_rbf_cv.fit(X_train, y_train)

print("\nBest Score: %f" % svm_svc_rbf_cv.best_score_)

print("\nOptimal Hyperparameter Values: ")

for param_name in sorted(param_grid.keys()):
    print("%s: %r" % (param_name, svm_svc_rbf_cv.best_params_[param_name]))


Best Score: 0.977402

Optimal Hyperparameter Values: 
clf__C: 100
clf__gamma: 0.01
vect__ngram_range: (1, 2)
vect__stop_words: 'english'
CPU times: user 15.7 s, sys: 972 ms, total: 16.6 s
Wall time: 6min 41s


### Train the Optimal SVC (RBF Kernel) Model 

In [20]:
%%time
svm_svc_rbf = Pipeline([
        ('vect', CountVectorizer(stop_words='english', ngram_range=(1, 2), binary=False)),
        ('tfidf', TfidfTransformer()),
        ('clf', SVC(kernel='rbf', C=100, gamma=0.01, random_state=42)),
    ])

svm_svc_rbf.fit(X_train, y_train)

CPU times: user 10.7 s, sys: 71.2 ms, total: 10.7 s
Wall time: 10.7 s


Pipeline(steps=[('vect',
                 CountVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('tfidf', TfidfTransformer()),
                ('clf', SVC(C=100, gamma=0.01, random_state=42))])

### Evaluate SVC (RBF Kernel) Model on Test Data

In [21]:
%%time

y_test_predicted = svm_svc_rbf.predict(X_test)

print("Training Accuracy: ", svm_svc_rbf.score(X_train, y_train))
      
print("\nTest Accuracy: ", np.mean(y_test_predicted == y_test))

print("\nTest Confusion Matrix:")
print(confusion_matrix(y_test, y_test_predicted))

precision_test = precision_score(y_test, y_test_predicted, average='micro') 
print("\nTest Precision = %f" % precision_test)

recall_test = recall_score(y_test, y_test_predicted, average='micro')
print("\nTest Recall = %f" % recall_test)

f1_test = f1_score(y_test, y_test_predicted, average='micro')
print("\nTest F1 Score = %f" % f1_test)


print("\nClassification Report:")
print(classification_report(y_test, y_test_predicted))

Training Accuracy:  1.0

Test Accuracy:  0.9274300932090546

Test Confusion Matrix:
[[272   9   7  31]
 [  1 376   8   4]
 [  5  26 362   3]
 [  4   8   3 383]]

Test Precision = 0.927430

Test Recall = 0.927430

Test F1 Score = 0.927430

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.85      0.91       319
           1       0.90      0.97      0.93       389
           2       0.95      0.91      0.93       396
           3       0.91      0.96      0.94       398

    accuracy                           0.93      1502
   macro avg       0.93      0.92      0.93      1502
weighted avg       0.93      0.93      0.93      1502

CPU times: user 12 s, sys: 69.5 ms, total: 12.1 s
Wall time: 12.2 s


### Q-5) Between linear SVM and nonlinear SVM, which classifier performed the best on test data? Explain why.

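The linear SVM (LinearSVC) performed slightly better on the test data than the nonlinear SVM (SVC with RBF kernel): test accuracy 0.932 vs. 0.927. A likely reason is that high-dimensional, sparse TF-IDF text features are typically (close to) linearly separable; the linear model already reaches a training accuracy of 1.0, so the RBF kernel adds model complexity without adding useful separating power. In addition, LinearSVC is faster to train than the SVC with RBF kernel (about 2.7 s vs. 10.7 s here), because its training time scales roughly linearly with the number of training instances and the number of features, unlike the kernelized SVM, which solves the dual problem and whose complexity is at best quadratic in the number of instances.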