# Pre-processing
Notebook to explore the pre-processing of the data.

In [6]:
import pandas as pd
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, auc, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, LogisticRegression

# import nltk # To download
# nltk.download('stopwords') # For stop words
# nltk.download('wordnet') # For POS 
# nltk.download('averaged_perceptron_tagger') # For POS tagging
import re
stop_words = set(stopwords.words('english'))
stop_words.add('')
stop_words.remove('no')
stop_words.remove('not')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adelliinaa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [136]:
data = pd.read_csv('../data/stanfordSentimentTreebank/dictionary.txt', sep='|', index_col=1)
train_test_split = pd.read_csv('../data/stanfordSentimentTreebank/sentiment_labels.txt', sep='|', index_col=0)

#data.rename(columns={0: 'phrase_ids', 1: 'phrase'})
data.columns = ['phrase_tokens']
data.index.names = ['phrase_id']
train_test_split.columns = ['sentiment_value']
train_test_split.index.names = ['phrase_id']

           phrase_tokens
phrase_id               
22935                ! '
18235               ! ''
179257            ! Alas
22936        ! Brilliant
40532      ! Brilliant !
           sentiment_value
phrase_id                 
0                  0.50000
1                  0.50000
2                  0.44444
3                  0.50000
4                  0.42708


In [187]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer() 

def pre_processing(collection):
    # Case-fold
    collection = collection.lower()
    
    # Tokenising, converts all non-chars, new-lines and tabs to ' '
    collection = list(filter(None, re.sub(r'[\W]', ' ',collection).split(' ')))
    collection = pos_tag(collection)
    
    # Stemming and Stopping 
    return [lemmatizer.lemmatize(term[0], pos = get_pos_from_tag(term[1])) for term in collection if term[0] not in stop_words] 

#pre_processing("This is a test sentence I love this film it's good. better than the last one, greater than the second one it was playing plays played")


In [116]:
# Apply pre_processing to data

data['phrase_tokens'] = data['phrase_tokens'].apply(pre_processing)

In [117]:
# Remove empty phrases 

for index, row in data.iterrows():
    if len(row[0]) == 0:
        data = data.drop([index], axis=0)

In [118]:
# Inner Join on phrase_id

labelled_phrases = pd.merge(data, train_test_split, left_index=True, right_index=True)
labelled_phrases = labelled_phrases.sort_index(ascending=True)

In [206]:
"""
# UNCOMMENT FOR BINARY CLASIFICATION
conditions = [
    (labelled_phrases['sentiment_value'] >= 0) & (labelled_phrases['sentiment_value'] <= 0.55),
    (labelled_phrases['sentiment_value'] > 0.55) & (labelled_phrases['sentiment_value'] <= 1)]
values = ['negative', 'positive']
labelled_phrases['sentiment'] = np.select(conditions, values)
"""

"\n# UNCOMMENT FOR BINARY CLASIFICATION\nconditions = [\n    (labelled_phrases['sentiment_value'] >= 0) & (labelled_phrases['sentiment_value'] <= 0.55),\n    (labelled_phrases['sentiment_value'] > 0.55) & (labelled_phrases['sentiment_value'] <= 1)]\nvalues = ['negative', 'positive']\nlabelled_phrases['sentiment'] = np.select(conditions, values)\n"

In [207]:
# create a list of our conditions
conditions = [
    (labelled_phrases['sentiment_value'] >= 0) & (labelled_phrases['sentiment_value'] <= 0.2),
    (labelled_phrases['sentiment_value'] > 0.2) & (labelled_phrases['sentiment_value'] <= 0.4),
    (labelled_phrases['sentiment_value'] > 0.4) & (labelled_phrases['sentiment_value'] <= 0.6),
    (labelled_phrases['sentiment_value'] > 0.6) & (labelled_phrases['sentiment_value'] <= 0.8),
    (labelled_phrases['sentiment_value'] > 0.8) & (labelled_phrases['sentiment_value'] <= 1)
    ]

# create a list of the values we want to assign for each condition
values = ['very_negative', 'negative', 'neutral', 'positive', 'very_positive']

# create a new column and use np.select to assign values to it using our lists as arguments
labelled_phrases['sentiment'] = np.select(conditions, values)


In [208]:
labelled_phrases.to_csv('labelled_phrases.csv', index=True)
labelled_phrases.head()

Unnamed: 0_level_0,phrase_tokens,sentiment_value,sentiment
phrase_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,[cockettes],0.5,neutral
4,[cockettes],0.42708,neutral
5,"[cockettes, provide, window, subculture, hell,...",0.375,negative
6,"[cockettes, provide, window, subculture, hell,...",0.41667,neutral
7,"[cockettes, provide, window, subculture, hell,...",0.54167,neutral


In [209]:
data = pd.read_csv('labelled_phrases.csv')
data = data.drop(columns=['phrase_id', 'sentiment_value'])

In [210]:
reviews = np.array(data['phrase_tokens'])
sentiments = np.array(data['sentiment'])

# build train and test datasets
train_reviews, test_reviews, train_sentiments, test_sentiments =\
    train_test_split(reviews, sentiments , test_size=0.20,  random_state=0)


In [211]:
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(train_reviews)

# build TFIDF features on train reviews
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1,2), sublinear_tf=True)
tv_train_features = tv.fit_transform(train_reviews)


# transform test reviews into features
cv_test_features = cv.transform(test_reviews)
tv_test_features = tv.transform(test_reviews)

print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

BOW model:> Train features shape: (189955, 105660)  Test features shape: (47489, 105660)
TFIDF model:> Train features shape: (189955, 105660)  Test features shape: (47489, 105660)


In [216]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs')
svm = SGDClassifier(loss='hinge', l1_ratio=0.15, max_iter=1000, n_jobs=4, random_state=101)

In [217]:
def display_classification_report(true_labels, predicted_labels, target_names):

    report = metrics.classification_report(y_true=true_labels, y_pred=predicted_labels, target_names=target_names) 
    print(report)
    
def display_model_performance_metrics(true_labels, predicted_labels, target_names):
    print('Model Performance metrics:')
    print('-'*30)
    get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
    print('\nModel Classification report:')
    print('-'*30)
    display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels, target_names=target_names)
    print('\nPrediction Confusion Matrix:')
    print('-'*30)

def get_metrics(true_labels, predicted_labels):
    
    print('Accuracy:  {:2.2%} '.format(metrics.accuracy_score(true_labels, predicted_labels)))
    print('Precision: {:2.2%} '.format(metrics.precision_score(true_labels, predicted_labels, average='weighted')))
    print('Recall:    {:2.2%} '.format(metrics.recall_score(true_labels, predicted_labels, average='weighted')))
    print('F1 Score:  {:2.2%} '.format(metrics.f1_score(true_labels, predicted_labels, average='weighted')))
         
        
def display_model_performance_metrics(true_labels, predicted_labels, target_names):
    print('Model Performance metrics:')
    print('-'*30)
    get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
    print('\nModel Classification report:')
    print('-'*30)
    display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels, target_names=target_names)


def train_predict_model(classifier,  train_features, train_labels,  test_features, test_labels):
    # build model    
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features) 
    return predictions 


In [218]:
# Logistic Regression model on BOW features
lr_bow_predictions = train_predict_model(classifier=lr, 
                                         train_features=cv_train_features, train_labels=train_sentiments,
                                         test_features=cv_test_features, test_labels=test_sentiments)
display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=lr_bow_predictions,
                                  target_names=['very_negative', 'negative', 'neutral', 'positive', 'very_positive'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Performance metrics:
------------------------------
Accuracy:  65.78% 
Precision: 64.47% 
Recall:    65.78% 
F1 Score:  64.45% 

Model Classification report:
------------------------------
               precision    recall  f1-score   support

very_negative       0.58      0.46      0.52      8592
     negative       0.71      0.85      0.77     23569
      neutral       0.60      0.51      0.55     10023
     positive       0.52      0.35      0.42      2252
very_positive       0.59      0.42      0.49      3053

     accuracy                           0.66     47489
    macro avg       0.60      0.52      0.55     47489
 weighted avg       0.64      0.66      0.64     47489



In [205]:
# RESULTS FOR BINARY CLASSIFICATION
# Logistic Regression model on BOW features
lr_bow_predictions = train_predict_model(classifier=lr, 
                                         train_features=cv_train_features, train_labels=train_sentiments,
                                         test_features=cv_test_features, test_labels=test_sentiments)
display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=lr_bow_predictions,
                                  target_names=['negative', 'positive'])

Model Performance metrics:
------------------------------
Accuracy:  83.62% 
Precision: 83.58% 
Recall:    83.62% 
F1 Score:  83.26% 

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    negative       0.84      0.92      0.88     30021
    positive       0.83      0.70      0.76     17468

    accuracy                           0.84     47489
   macro avg       0.83      0.81      0.82     47489
weighted avg       0.84      0.84      0.83     47489

