# DAT340 / DIT867 Applied Machine Learning
## Programming assignment 3c: Text classification (Part 3)


### Amr Mohamed
#### Exchange student from CY Tech - France to GU CSE Department

### Anh Thu DOAN
#### Exchange student from CY Tech - France to GU CSE Department

### Group PA3c 10

In [1]:
import pandas as pd
import numpy as np

import stanza
from scipy.stats import expon


# the actual classification algorithm
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

# for splitting the dataset into training and test sets 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# for evaluating the quality of the classifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

from sklearn.inspection import permutation_importance

import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'svg' 
plt.style.use('bmh')
plt.rcParams['image.cmap'] = 'Paired_r'

KeyboardInterrupt: 

In [None]:
# Both TSV files are read without a header row; the first column is named
# 'annotation' and the second 'review'.
TSV_OPTIONS = dict(sep='\t', names=['annotation', 'review'], header=None)
train = pd.read_csv('PA3_train.tsv', **TSV_OPTIONS)
test = pd.read_csv('PA3_test_clean.tsv', **TSV_OPTIONS)

# Training Data Cleaning

In [None]:
train.head()

In [None]:
train.shape

In [None]:
train.review[1]

In [None]:
# Split the combined annotation string on '/' and store the two parts in their
# own columns (annotation1 = text before the slash, annotation2 = text after it).
train[['annotation1','annotation2']]=train.annotation.str.split('/', expand=True)

In [None]:
train.head()

In [None]:
train.annotation.unique()

In [None]:
train.annotation1.unique()

In [None]:
train.annotation2.unique()

In [None]:
# Share of rows whose annotation is anything other than a clean agreement
# ('1/1' or '0/0'). The value is a proportion in [0, 1] rounded to 3 decimals;
# it is not multiplied by 100.
is_clean_agreement = train.annotation.isin(['1/1', '0/0'])
total_disagreement = round(int((~is_clean_agreement).sum()) / len(train), 3)
print(f"The percentage of disagreement between the annotators is: {total_disagreement}")

In [None]:
# Share of rows where the two valid labels contradict each other ('1/0' or '0/1').
# The value is a proportion in [0, 1] rounded to 3 decimals.
is_binary_conflict = train.annotation.isin(['1/0', '0/1'])
true_disagreement = round(int(is_binary_conflict.sum()) / len(train), 3)
print(f"The percentage of real disagreement between the annotators is (0/1 or 1/0): {true_disagreement}")

In [None]:
# Share of rows whose annotation is none of the four well-formed pairs
# ('0/0', '1/1', '1/0', '0/1'). The value is a proportion in [0, 1] rounded to
# 3 decimals.
is_valid_pair = train.annotation.isin(['0/0', '1/1', '1/0', '0/1'])
false_disagreement = round(int((~is_valid_pair).sum()) / len(train), 3)
print(f"The percentage of misannotated reviews led to\na difference in the annotations between the annotators is: {false_disagreement}")

## Sentiment analysis on rows with an annotation disagreement

In [None]:
# Stanza library for sentiment analysis
# Build an English pipeline with the tokenizer and the sentiment processor; it
# is applied further down to reviews whose annotations disagree.
nlp = stanza.Pipeline(lang='en', processors='tokenize, sentiment')

In [None]:
# Collect one sentiment score per training row, in row order.
#   * Rows whose annotation is in DISAGREEMENT_LABELS are scored with Stanza:
#     every sentence is scored, the scores are averaged, and the review gets
#     the sign of that average (-1.0 negative, 0.0 neutral/tie, 1.0 positive).
#   * All other rows get the placeholder AGREED_SENTINEL, which is replaced by
#     the annotators' label in a later cell.
DISAGREEMENT_LABELS = {'1/0', '-1/0', '-1/1', '0/1', '2/1', '2/0', '1/', '9/1'}
AGREED_SENTINEL = 3

sent_scores = []
for cpt, (_, row) in enumerate(train.iterrows()):
    if row.annotation not in DISAGREEMENT_LABELS:
        sent_scores.append(AGREED_SENTINEL)
        continue

    doc = nlp(row.review)
    # Shift each sentence's sentiment by -1 (Stanza documents a 0/1/2 scale,
    # which becomes -1/0/+1). The list is built once per review so that every
    # sentence contributes, not only the last one.
    sentence_scores = [int(sentence.sentiment) - 1 for sentence in doc.sentences]
    # A review with no sentences has no evidence either way: treat it as neutral.
    mean_score = sum(sentence_scores) / len(sentence_scores) if sentence_scores else 0
    # Keep only the sign so later cells, which test for exactly 0 and -1 and
    # then cast to int, still receive one of -1, 0 or 1.
    sent_scores.append(float(np.sign(mean_score)))
    # Progress indicator, overwritten in place.
    print(cpt, end='\r')

In [None]:
train['sent_ana']=sent_scores
train.head(10)

In [None]:
train.shape

In [None]:
# Drop rows whose sentiment score is exactly 0; rows with any other score,
# including the placeholder 3, are kept.
train=train[train.sent_ana!=0]

In [None]:
train.shape

In [None]:
# Recode the negative score -1 as 0 so that negative sentiment uses the label 0;
# every other value (1 and the placeholder 3) is left unchanged.
train.sent_ana = train.sent_ana.replace(-1,0)

In [None]:
train.head(10)

In [None]:
# Rows that were not sent through sentiment analysis still hold the placeholder 3:
# replace it with the value of annotation2 (the part after the '/') for those rows.
train.loc[train['sent_ana'] == 3, 'sent_ana'] = train['annotation2']

In [None]:
train.sent_ana = train.sent_ana.astype(int)
train.head(10)

In [None]:
# Keep only the review text and the resolved label, exposing the label under
# the column name 'annotation', then make sure that column is numeric.
train_df = train[['review', 'sent_ana']].rename(columns={"sent_ana": "annotation"})
train_df['annotation'] = pd.to_numeric(train_df['annotation'])

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
len(train_df[train_df.annotation == 1].annotation)

In [None]:
len(train_df[train_df.annotation == 0].annotation)

# Test data cleaning

In [None]:
test.head()

In [None]:
test.shape

In [None]:
test.annotation.unique()

In [None]:
test[test.annotation== 1 ].annotation.count()

In [None]:
test[test.annotation== 0 ].annotation.count()

In [None]:
# import re

In [None]:
# Text normalisation applied to every review before vectorisation.
def clean_text(text):
    """Return *text* converted to lowercase.

    Lowercasing is the only normalisation performed; punctuation, digits and
    non-ASCII characters are left as they are.
    """
    lowered = text.lower()
    return lowered

# Normalise the review column of both splits with the same function so that
# training and test text are preprocessed identically.
train_df['review'] = train_df.review.apply(clean_text)
test['review'] = test.review.apply(clean_text)

In [None]:
# Features are the review strings, targets are the annotation columns.
X_train, Y_train = train_df.review, train_df.annotation
X_test, Y_test = test.review, test.annotation

# ML models

## Dummy Classifier

In [None]:
# Baseline: always predict the most frequent training label. Its accuracy on
# the test set is the floor the real models have to beat.
dummy = DummyClassifier(strategy='most_frequent', random_state=0)
pipeline = make_pipeline(TfidfVectorizer(), dummy)
pipeline.fit(X_train, Y_train)
dummy_pred = pipeline.predict(X_test)
accuracy_score(Y_test, dummy_pred)

## Linear SVC

In [None]:
# TF-IDF features fed into a linear support vector classifier; the cell's
# output is the accuracy on the test set.
lsvc_steps = [
    ('tfidf', TfidfVectorizer()),
    ('linearsvc', LinearSVC(random_state=0)),
]
LSVCpipeline = Pipeline(lsvc_steps)
LSVCpipeline.fit(X_train, Y_train)
Yguess = LSVCpipeline.predict(X_test)
accuracy_score(Y_test, Yguess)

## SVC

In [None]:
# TF-IDF features fed into an SVC left at its default settings (apart from the
# seed); the cell's output is the accuracy on the test set.
svc_steps = [
    ('tfidf', TfidfVectorizer()),
    ('svc', SVC(random_state=0)),
]
SVCpipeline = Pipeline(svc_steps)
SVCpipeline.fit(X_train, Y_train)
svc_pred = SVCpipeline.predict(X_test)
accuracy_score(Y_test, svc_pred)

## LogisticRegression

In [None]:
# TF-IDF features fed into a logistic-regression classifier.
logRpipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logr', LogisticRegression(random_state=0)),
])
# Fit and score logRpipeline itself. The generic `pipeline` variable must not be
# used here: it is bound by a different cell to a different model, so scoring it
# would report that model's accuracy under the LogisticRegression heading.
logRpipeline.fit(X_train, Y_train)
accuracy_score(Y_test, logRpipeline.predict(X_test))

## GradientBoostingClassifier

In [None]:
# TF-IDF features fed into a gradient-boosting classifier; the cell's output is
# the accuracy on the test set.
gb_clf = GradientBoostingClassifier(random_state=0)
pipeline = make_pipeline(TfidfVectorizer(), gb_clf)
pipeline.fit(X_train, Y_train)
gb_pred = pipeline.predict(X_test)
accuracy_score(Y_test, gb_pred)

## RandomForestClassifier

In [None]:
# TF-IDF features fed into a random forest; the cell's output is the accuracy
# on the test set.
rf_clf = RandomForestClassifier(random_state=0)
pipeline = make_pipeline(TfidfVectorizer(), rf_clf)
pipeline.fit(X_train, Y_train)
rf_pred = pipeline.predict(X_test)
accuracy_score(Y_test, rf_pred)

## MultinomialNB

In [None]:
# TF-IDF features fed into a multinomial naive Bayes classifier; the cell's
# output is the accuracy on the test set.
nb_clf = MultinomialNB()
pipeline = make_pipeline(TfidfVectorizer(), nb_clf)
pipeline.fit(X_train, Y_train)
nb_pred = pipeline.predict(X_test)
accuracy_score(Y_test, nb_pred)

## MLPClassifier

In [None]:
# Multi-layer perceptron with early stopping (n_iter_no_change=10).
# NOTE: no random_state is passed, unlike the other models in this notebook, so
# the reported accuracy may vary from run to run.
NN = MLPClassifier(early_stopping=True, n_iter_no_change=10)
pipeline = make_pipeline(TfidfVectorizer(), NN)
pipeline.fit(X_train, Y_train)
mlp_pred = pipeline.predict(X_test)
accuracy_score(Y_test, mlp_pred)

In [None]:
# Second MLP variant: alpha=0.1, up to 400 iterations, and early stopping with
# n_iter_no_change=3.
# NOTE: no random_state is passed, so the reported accuracy may vary between runs.
mlp_options = dict(alpha=0.1, max_iter=400, early_stopping=True, n_iter_no_change=3)
NN = MLPClassifier(**mlp_options)
pipeline = make_pipeline(TfidfVectorizer(), NN)
pipeline.fit(X_train, Y_train)
mlp_pred = pipeline.predict(X_test)
accuracy_score(Y_test, mlp_pred)

# Tuning and evaluation of the highest-scoring models (SVC and Linear SVC)

## Hyperparameter Tuning Linear SVC

In [None]:
# Hyperparameter tuning for the TF-IDF + LinearSVC pipeline.
LSVCpipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('linearsvc', LinearSVC(random_state=0)),
])

# max_df candidates are document-frequency *proportions*. The upper bound has to
# be the float 1.0: TfidfVectorizer reads an integer max_df as an absolute
# document count, so an int 1 would discard every term occurring in more than
# one document instead of keeping all terms.
max_df_candidates = [0.1, 0.275, 0.3545454545454545, 0.3, 0.5, 0.75, 1.0]
C_candidates = [0.01, 0.1, 1.0]

# LinearSVC does not support penalty='l1' together with loss='hinge', and
# supports penalty='l1' with loss='squared_hinge' only when dual=False. The grid
# is therefore split in two so that every candidate can actually be fitted
# instead of silently producing failed (NaN-scored) fits.
param_grid = [
    {'tfidf__max_df': max_df_candidates,
     'linearsvc__penalty': ['l2'],
     'linearsvc__loss': ['hinge', 'squared_hinge'],
     'linearsvc__C': C_candidates,
     'linearsvc__random_state': [0]},
    {'tfidf__max_df': max_df_candidates,
     'linearsvc__penalty': ['l1'],
     'linearsvc__loss': ['squared_hinge'],
     'linearsvc__dual': [False],
     'linearsvc__C': C_candidates,
     'linearsvc__random_state': [0]},
]

LSVCgrid = GridSearchCV(LSVCpipeline, param_grid, n_jobs=-1)

# fitting the model for grid search
LSVCgrid.fit(X_train, Y_train)

In [None]:
LSVCgrid.best_params_

In [None]:
LSVCgrid.best_score_

In [None]:
accuracy_score(Y_test, LSVCgrid.predict(X_test))

In [None]:
# Sweep TF-IDF max_df over 100 evenly spaced values in [0.1, 1] and plot the
# resulting Linear SVC accuracy.
# NOTE: accuracy is measured on the test set, so choosing max_df from this curve
# tunes the model on test data.
max_df_values = list(np.linspace(0.1, 1, 100))

max_dfList = []
for max_df in max_df_values:
    LSVCpipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_df=max_df)),
        ('svc', LinearSVC(random_state=0)),
    ])
    LSVCpipeline.fit(X_train, Y_train)
    sweep_pred = LSVCpipeline.predict(X_test)
    max_dfList.append(accuracy_score(Y_test, sweep_pred))

plt.plot(max_df_values, max_dfList)
plt.xlabel('TF-IDF max document frequency')
plt.ylabel('LSVC accuracy score');

In [None]:
# Map each accuracy from the sweep to the max_df that produced it. Accuracies
# are the dict keys, so equal accuracies collide and only the last (largest)
# max_df for each distinct accuracy is kept.
dict(zip(max_dfList,list(np.linspace(0.1,1,100))))

In [None]:
# Final Linear SVC: same pipeline as before but with the max_df value picked
# from the sweep. LSVC_Yguess is reused by the evaluation cells.
final_lsvc_steps = [
    ('tfidf', TfidfVectorizer(max_df=0.3545454545454545)),
    ('linearsvc', LinearSVC(random_state=0)),
]
LSVCpipeline = Pipeline(final_lsvc_steps)
LSVCpipeline.fit(X_train, Y_train)
LSVC_Yguess = LSVCpipeline.predict(X_test)
accuracy_score(Y_test, LSVC_Yguess)

## Hyperparameter Tuning SVC

In [None]:
# Hyperparameter tuning for the TF-IDF + SVC pipeline: grid over the TF-IDF
# max_df proportion, the SVC kernel and the regularisation strength C.
# Note that this rebinds the name param_grid.

param_grid = {'tfidf__max_df': [0.2,0.225,0.275,0.3],
              'svc__kernel': ['poly', 'rbf', 'sigmoid'],
              'svc__C': [0.3,0.5,0.7,1]}
# The search clones whatever SVCpipeline is bound to when this cell runs.
SVCgs = GridSearchCV(SVCpipeline, param_grid, n_jobs=-1)
# Trailing ';' suppresses the notebook's echo of the fitted estimator.
SVCgs.fit(X_train, Y_train);

In [None]:
SVCgs.best_params_

In [None]:
SVCgs.best_score_

In [None]:
accuracy_score(Y_test, SVCgs.predict(X_test))

In [None]:
# Sweep TF-IDF max_df over 100 evenly spaced values in [0.1, 1] and record the
# SVC accuracy for each one (plotted in the next cell).
# NOTE: accuracy is measured on the test set, so choosing max_df from these
# scores tunes the model on test data.
max_dfList = []
for max_df in np.linspace(0.1, 1, 100):
    SVCpipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_df=max_df)),
        ('svc', SVC(random_state=0)),
    ])
    SVCpipeline.fit(X_train, Y_train)
    sweep_pred = SVCpipeline.predict(X_test)
    max_dfList.append(accuracy_score(Y_test, sweep_pred))

In [None]:
# Accuracy of the SVC as a function of the TF-IDF max_df value.
sweep_grid = list(np.linspace(0.1, 1, 100))
plt.plot(sweep_grid, max_dfList)
plt.xlabel('TF-IDF max document frequency')
plt.ylabel('SVC accuracy score');

In [None]:
# Map each accuracy from the SVC sweep to the max_df that produced it. The grid
# has to be the same np.linspace(0.1, 1, 100) the sweep iterated over: with a
# shorter or differently spaced grid, zip() pairs the scores with the wrong
# max_df values and silently drops the remaining scores.
# Accuracies are the dict keys, so equal accuracies collide and only the last
# max_df for each distinct accuracy is kept.
dict(zip(max_dfList, list(np.linspace(0.1, 1, 100))))

In [None]:
# Final SVC: TF-IDF with a fixed max_df followed by an SVC with default
# settings; the cell's output is the accuracy on the test set.
final_svc_steps = [
    ('tfidf', TfidfVectorizer(max_df=0.2641025641025641)),
    ('svc', SVC()),
]
SVCpipeline = Pipeline(final_svc_steps)
SVCpipeline.fit(X_train, Y_train)
final_svc_pred = SVCpipeline.predict(X_test)
accuracy_score(Y_test, final_svc_pred)

## Linear SVC evaluation

In [None]:
# Confusion matrix of the Linear SVC predictions (LSVC_Yguess) against the
# test annotations.
cf_matrix =confusion_matrix(Y_test, LSVC_Yguess)

In [None]:
print(classification_report(Y_test, LSVC_Yguess))

In [None]:
cf_matrix

In [None]:
# Annotate every confusion-matrix cell with its role, its raw count and its
# share of all test samples, then draw the matrix as a heatmap.
group_names = ['True Neg','False Pos','False Neg','True Pos']

flat_counts = cf_matrix.flatten()
group_counts = [f"{count:0.0f}" for count in flat_counts]
group_percentages = [f"{share:.2%}" for share in flat_counts / np.sum(cf_matrix)]

labels = np.asarray([
    f"{name}\n{count}\n{pct}"
    for name, count, pct in zip(group_names, group_counts, group_percentages)
]).reshape(2, 2)

ax = sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')

lsvc_accuracy = accuracy_score(Y_test, LSVC_Yguess)
ax.set_title('Linear SVC Confusion Matrix, Accuracy Score: %.3f' % lsvc_accuracy);
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');

# Tick labels - list must be in alphabetical order
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels(['False','True'])

# Display the visualization of the confusion matrix.
plt.show()

In [None]:
print(precision_score(Y_test, LSVC_Yguess, pos_label=1),
      recall_score(Y_test, LSVC_Yguess, pos_label=1))

##  SVC evaluation

In [None]:
# Confusion matrix of the SVC pipeline's predictions against the test
# annotations (rebinds cf_matrix).
cf_matrix =confusion_matrix(Y_test, SVCpipeline.predict(X_test))

In [None]:
print(classification_report(Y_test, SVCpipeline.predict(X_test)))

In [None]:
cf_matrix

In [None]:
# Annotate every confusion-matrix cell with its role, its raw count and its
# share of all test samples, then draw the matrix as a heatmap.
group_names = ['True Neg','False Pos','False Neg','True Pos']

group_counts = ["{0:0.0f}".format(value) for value in
                cf_matrix.flatten()]

group_percentages = ["{0:.2%}".format(value) for value in
                     cf_matrix.flatten()/np.sum(cf_matrix)]

labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
          zip(group_names,group_counts,group_percentages)]

labels = np.asarray(labels).reshape(2,2)

ax = sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')

# Name the model in the title, in the same format as the Linear SVC heatmap.
# Accuracy is the share of samples on the matrix diagonal, so it is derived
# from the matrix being plotted and always agrees with the counts shown.
svc_accuracy = np.trace(cf_matrix) / np.sum(cf_matrix)
ax.set_title('SVC Confusion Matrix, Accuracy Score: %.3f' % svc_accuracy);
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');

# Tick labels - list must be in alphabetical order
ax.xaxis.set_ticklabels(['False','True'])
ax.yaxis.set_ticklabels(['False','True'])

# Display the visualization of the confusion matrix.
plt.show()

In [None]:
print("Precision score of SVC:",precision_score(Y_test, SVCpipeline.predict(X_test), pos_label=1),
      "\nRecall score of SVC:",recall_score(Y_test, SVCpipeline.predict(X_test), pos_label=1))

In [None]:
print(classification_report(Y_test, SVCpipeline.predict(X_test)))

# Precision/Recall curves comparison for Linear SVC and SVC

In [None]:
# Precision/recall curve of the Linear SVC, using its decision_function output
# as the ranking score and label 1 as the positive class.
scores = LSVCpipeline.decision_function(X_test)
precs, recs, _ = precision_recall_curve(Y_test, scores, pos_label=1)

plt.figure(figsize=(5,5))
plt.plot(recs, precs)
for set_label, text in ((plt.xlabel, 'recall'), (plt.ylabel, 'precision')):
    set_label(text)
# Show the full [0, 1] x [0, 1] range on both axes.
plt.axis([0, 1, 0, 1]);

In [None]:
# Overlay the precision/recall curves of both tuned models on one figure.
pipelines = [LSVCpipeline, SVCpipeline]

APs = []
plt.figure(figsize=(5,5))
for pipeline in pipelines:
    scores = pipeline.decision_function(X_test)
    precisions, recalls, _ = precision_recall_curve(Y_test, scores, pos_label=1)
    plt.plot(recalls, precisions)

# Axis labels are shared by both curves, so they are set once.
plt.xlabel('Recall')
plt.ylabel('Precision')

# Legend entries follow the order of `pipelines`.
plt.legend(['Linear SVC','SVC']);
plt.title('Precision/Recall curve of Linear SVC and SVC\n' )
plt.axis([0, 1, 0, 1]);

In [None]:
# Same two precision/recall curves, zoomed in on the top-right corner
# (recall and precision both between 0.8 and 1).
for pipeline in pipelines:
    scores = pipeline.decision_function(X_test)
    precisions, recalls, _ = precision_recall_curve(Y_test, scores, pos_label=1)
    plt.plot(recalls, precisions)

# Axis labels are shared by both curves, so they are set once.
plt.xlabel('Recall')
plt.ylabel('Precision')

# Legend entries follow the order of `pipelines`.
plt.legend(['Linear SVC','SVC']);
plt.title('A zoom-in the Precision/Recall curve of Linear SVC and SVC\n' )
plt.axis([0.8, 1, 0.8, 1]);

# LSVC Misclassified instances sample

In [None]:
# Work on a copy so the prediction columns are not written into `test` itself
# (a plain `tempdf = test` only creates a second name for the same DataFrame).
tempdf = test.copy()
tempdf['LSVC_predicted'] = LSVC_Yguess
# Review text of the first rows where the Linear SVC prediction differs from
# the annotation.
tempdf[tempdf.LSVC_predicted != tempdf.annotation].head().review.values

# SVC Misclassified instances sample

In [None]:
# Attach the SVC predictions. The pipeline variable is `SVCpipeline`; the
# misspelling `SVCpipline` is not defined anywhere and raises NameError.
tempdf['SVC_predicted'] = SVCpipeline.predict(X_test)
# Review text of the first rows where the SVC prediction differs from the
# annotation.
tempdf[tempdf.SVC_predicted != tempdf.annotation].head().review.values

# Feature importance

# LSVC

In [None]:
# Vocabulary terms of the fitted TF-IDF step, in feature (column) order.
# get_feature_names() was deprecated and later removed from scikit-learn's
# vectorizers in favour of get_feature_names_out(); prefer the new method and
# fall back to the old one only on versions that do not have it yet.
tfidf_step = LSVCpipeline.named_steps["tfidf"]
if hasattr(tfidf_step, "get_feature_names_out"):
    feature_names = tfidf_step.get_feature_names_out()
else:
    feature_names = tfidf_step.get_feature_names()

In [None]:
# Learned coefficients of the linear SVC step, flattened to a 1-D array so they
# can be paired with the feature names.
coefs = LSVCpipeline.named_steps["linearsvc"].coef_.flatten()

In [None]:
len(coefs)

In [None]:
# One row per vocabulary term: its coefficient, the coefficient's magnitude and
# a bar colour (green for positive weights, red otherwise). Rows are ordered by
# decreasing magnitude so the most influential terms come first.
zipped = zip(feature_names, coefs)
df = pd.DataFrame(zipped, columns=["feature", "value"])
df["abs_value"] = df["value"].map(abs)
df["colors"] = df["value"].map(lambda weight: "red" if not weight > 0 else "green")
df = df.sort_values(by="abs_value", ascending=False)

In [None]:
# Horizontal bar chart of the 30 largest-magnitude Linear SVC coefficients,
# coloured by the precomputed per-row colour.
sns.set_style({"grid.color": ".8", "grid.linestyle": ":"})

top_features = df.head(30)

fig, ax = plt.subplots(1, 1, figsize=(5, 6))
sns.barplot(data=top_features,
            x="value",
            y="feature",
            palette=top_features["colors"])
ax.set_title("Top 30 words (features) contributing to \nthe Linear SVC classification performance", fontsize=12)
ax.set_ylabel("Feature Name", fontsize=12)
ax.set_xlabel("Coefficient", fontsize=12)
plt.grid()