# Pre-processing
Notebook to explore the pre-processing of the data.

In [32]:
import pandas as pd
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, auc, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, LogisticRegression
import nltk # To download
import re
from nltk.corpus import stopwords
nltk.download('stopwords') # For stop words
nltk.download('wordnet') # For POS 
nltk.download('averaged_perceptron_tagger') # For POS tagging
stop_words = set(stopwords.words('english'))
stop_words.add('')
stop_words.remove('no')
stop_words.remove('not')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adelliinaa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adelliinaa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/adelliinaa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [33]:
data = pd.read_csv('../data/stanfordSentimentTreebank/dictionary.txt', sep='|', index_col=1)
train_test = pd.read_csv('../data/stanfordSentimentTreebank/sentiment_labels.txt', sep='|', index_col=0)

#data.rename(columns={0: 'phrase_ids', 1: 'phrase'})
data.columns = ['phrase_tokens']
data.index.names = ['phrase_id']
train_test.columns = ['sentiment_value']
train_test.index.names = ['phrase_id']


In [34]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer() 

def get_pos_from_tag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

    
def pre_processing(collection):
    # Case-fold
    collection = collection.lower()
    
    # Tokenising, converts all non-chars, new-lines and tabs to ' '
    collection = list(filter(None, re.sub(r'[\W]', ' ',collection).split(' ')))
    collection = pos_tag(collection)
    
    # Stemming and Stopping 
    return [lemmatizer.lemmatize(term[0], pos = get_pos_from_tag(term[1])) for term in collection if term[0] not in stop_words] 

#pre_processing("This is a test sentence I love this film it's good. better than the last one, greater than the second one it was playing plays played")


In [35]:
# Apply pre_processing to data

data['phrase_tokens'] = data['phrase_tokens'].apply(pre_processing)

In [36]:
# Remove empty phrases 

for index, row in data.iterrows():
    if len(row[0]) == 0:
        data = data.drop([index], axis=0)

In [37]:
# Inner Join on phrase_id

labelled_phrases = pd.merge(data, train_test, left_index=True, right_index=True)
labelled_phrases = labelled_phrases.sort_index(ascending=True)

In [51]:

#UNCOMMENT FOR BINARY CLASIFICATION
conditions = [
    (labelled_phrases['sentiment_value'] >= 0) & (labelled_phrases['sentiment_value'] <= 0.55),
    (labelled_phrases['sentiment_value'] > 0.55) & (labelled_phrases['sentiment_value'] <= 1)]
values = ['negative', 'positive']
labelled_phrases['sentiment'] = np.select(conditions, values)


In [38]:
# create a list of our conditions
conditions = [
    (labelled_phrases['sentiment_value'] >= 0) & (labelled_phrases['sentiment_value'] <= 0.2),
    (labelled_phrases['sentiment_value'] > 0.2) & (labelled_phrases['sentiment_value'] <= 0.4),
    (labelled_phrases['sentiment_value'] > 0.4) & (labelled_phrases['sentiment_value'] <= 0.6),
    (labelled_phrases['sentiment_value'] > 0.6) & (labelled_phrases['sentiment_value'] <= 0.8),
    (labelled_phrases['sentiment_value'] > 0.8) & (labelled_phrases['sentiment_value'] <= 1)
    ]

# create a list of the values we want to assign for each condition
values = ['very_negative', 'negative', 'neutral', 'positive', 'very_positive']

# create a new column and use np.select to assign values to it using our lists as arguments
labelled_phrases['sentiment'] = np.select(conditions, values)


In [52]:
labelled_phrases.to_csv('labelled_phrases.csv', index=True)
labelled_phrases.head()

Unnamed: 0_level_0,phrase_tokens,sentiment_value,sentiment
phrase_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,[cockettes],0.5,negative
4,[cockettes],0.42708,negative
5,"[cockettes, provide, window, subculture, hell,...",0.375,negative
6,"[cockettes, provide, window, subculture, hell,...",0.41667,negative
7,"[cockettes, provide, window, subculture, hell,...",0.54167,negative


In [53]:
data = pd.read_csv('labelled_phrases.csv')
data = data.drop(columns=['phrase_id', 'sentiment_value'])

In [54]:
reviews = np.array(data['phrase_tokens'])
sentiments = np.array(data['sentiment'])

# build train and test datasets
train_reviews, test_reviews, train_sentiments, test_sentiments = train_test_split(reviews, sentiments , test_size=0.20,  random_state=0)


In [55]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(train_reviews)

# build TFIDF features on train reviews
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1,2), sublinear_tf=True)
tv_train_features = tv.fit_transform(train_reviews)


# transform test reviews into features
cv_test_features = cv.transform(test_reviews)
tv_test_features = tv.transform(test_reviews)

print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

BOW model:> Train features shape: (189955, 105660)  Test features shape: (47489, 105660)
TFIDF model:> Train features shape: (189955, 105660)  Test features shape: (47489, 105660)


In [56]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs')
svm = SGDClassifier(loss='hinge', l1_ratio=0.15, max_iter=1000, n_jobs=4, random_state=101)

In [43]:
def display_classification_report(true_labels, predicted_labels, target_names):

    report = metrics.classification_report(y_true=true_labels, y_pred=predicted_labels, target_names=target_names) 
    print(report)
    
def display_model_performance_metrics(true_labels, predicted_labels, target_names):
    print('Model Performance metrics:')
    print('-'*30)
    get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
    print('\nModel Classification report:')
    print('-'*30)
    display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels, target_names=target_names)
    print('\nPrediction Confusion Matrix:')
    print('-'*30)

def get_metrics(true_labels, predicted_labels):
    
    print('Accuracy:  {:2.2%} '.format(metrics.accuracy_score(true_labels, predicted_labels)))
    print('Precision: {:2.2%} '.format(metrics.precision_score(true_labels, predicted_labels, average='weighted')))
    print('Recall:    {:2.2%} '.format(metrics.recall_score(true_labels, predicted_labels, average='weighted')))
    print('F1 Score:  {:2.2%} '.format(metrics.f1_score(true_labels, predicted_labels, average='weighted')))
         
        
def display_model_performance_metrics(true_labels, predicted_labels, target_names):
    print('Model Performance metrics:')
    print('-'*30)
    get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
    print('\nModel Classification report:')
    print('-'*30)
    display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels, target_names=target_names)


def train_predict_model(classifier,  train_features, train_labels,  test_features, test_labels):
    # build model    
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features) 
    return predictions 


In [218]:
# Logistic Regression model on BOW features
lr_bow_predictions = train_predict_model(classifier=lr, 
                                         train_features=cv_train_features, train_labels=train_sentiments,
                                         test_features=cv_test_features, test_labels=test_sentiments)
display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=lr_bow_predictions,
                                  target_names=['very_negative', 'negative', 'neutral', 'positive', 'very_positive'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Performance metrics:
------------------------------
Accuracy:  65.78% 
Precision: 64.47% 
Recall:    65.78% 
F1 Score:  64.45% 

Model Classification report:
------------------------------
               precision    recall  f1-score   support

very_negative       0.58      0.46      0.52      8592
     negative       0.71      0.85      0.77     23569
      neutral       0.60      0.51      0.55     10023
     positive       0.52      0.35      0.42      2252
very_positive       0.59      0.42      0.49      3053

     accuracy                           0.66     47489
    macro avg       0.60      0.52      0.55     47489
 weighted avg       0.64      0.66      0.64     47489



In [205]:
# RESULTS FOR BINARY CLASSIFICATION
# Logistic Regression model on BOW features
lr_bow_predictions = train_predict_model(classifier=lr, 
                                         train_features=cv_train_features, train_labels=train_sentiments,
                                         test_features=cv_test_features, test_labels=test_sentiments)
display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=lr_bow_predictions,
                                  target_names=['negative', 'positive'])

Model Performance metrics:
------------------------------
Accuracy:  83.62% 
Precision: 83.58% 
Recall:    83.62% 
F1 Score:  83.26% 

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    negative       0.84      0.92      0.88     30021
    positive       0.83      0.70      0.76     17468

    accuracy                           0.84     47489
   macro avg       0.83      0.81      0.82     47489
weighted avg       0.84      0.84      0.83     47489



In [50]:
svm_bow_predictions = train_predict_model(classifier=svm, 
                                          train_features=cv_train_features, train_labels=train_sentiments,
                                          test_features=cv_test_features, test_labels=test_sentiments)
print('SVM results with Bow:')
display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=svm_bow_predictions,
                                 target_names=['very_negative', 'negative', 'neutral', 'positive', 'very_positive'])


svm_tfidf_predictions = train_predict_model(classifier=svm, 
                                            train_features=tv_train_features, train_labels=train_sentiments,
                                            test_features=tv_test_features, test_labels=test_sentiments)
print('-'*60)
print('\nSVM results with TF-IDF:')
display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=svm_tfidf_predictions,
                                  target_names=['very_negative', 'negative', 'neutral', 'positive', 'very_positive'])

SVM results with Bow:
Model Performance metrics:
------------------------------
Accuracy:  61.73% 
Precision: 60.51% 
Recall:    61.73% 
F1 Score:  58.11% 

Model Classification report:
------------------------------
               precision    recall  f1-score   support

very_negative       0.57      0.29      0.39      8592
     negative       0.63      0.91      0.75     23569
      neutral       0.59      0.36      0.44     10023
     positive       0.52      0.28      0.36      2252
very_positive       0.59      0.37      0.46      3053

     accuracy                           0.62     47489
    macro avg       0.58      0.44      0.48     47489
 weighted avg       0.61      0.62      0.58     47489

------------------------------------------------------------

SVM results with TF-IDF:
Model Performance metrics:
------------------------------
Accuracy:  52.73% 
Precision: 52.71% 
Recall:    52.73% 
F1 Score:  41.94% 

Model Classification report:
------------------------------
   

In [57]:
svm_bow_predictions = train_predict_model(classifier=svm, 
                                          train_features=cv_train_features, train_labels=train_sentiments,
                                          test_features=cv_test_features, test_labels=test_sentiments)
print('SVM results with Bow:')
display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=svm_bow_predictions,
                                 target_names=['negative', 'positive'])


svm_tfidf_predictions = train_predict_model(classifier=svm, 
                                            train_features=tv_train_features, train_labels=train_sentiments,
                                            test_features=tv_test_features, test_labels=test_sentiments)
print('-'*60)
print('\nSVM results with TF-IDF:')
display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=svm_tfidf_predictions,
                                  target_names=['negative', 'positive'])

SVM results with Bow:
Model Performance metrics:
------------------------------
Accuracy:  80.56% 
Precision: 81.36% 
Recall:    80.56% 
F1 Score:  79.47% 

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    negative       0.79      0.94      0.86     30021
    positive       0.85      0.57      0.68     17468

    accuracy                           0.81     47489
   macro avg       0.82      0.76      0.77     47489
weighted avg       0.81      0.81      0.79     47489

------------------------------------------------------------

SVM results with TF-IDF:
Model Performance metrics:
------------------------------
Accuracy:  73.44% 
Precision: 77.15% 
Recall:    73.44% 
F1 Score:  69.43% 

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    negative       0.71      0.97      0.82     30021
    positive       0.87      0.33      0.47     17468

    accura