# NLP Sentiment Analysis

### Make imports and load data

In [1]:
import numpy as np
import pandas as pd
import random
import spacy

from imblearn.over_sampling import RandomOverSampler
from joblib import dump
from nltk.corpus import stopwords
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from spacy.util import minibatch

In [2]:
# Load the semicolon-separated, Latin-1 encoded sentiment data set.
raw_data = pd.read_csv(
    'data/all-data-v2.csv',
    encoding='latin-1',
    sep=';',
)

In [3]:
# Draw a fixed-size random subsample per sentiment class and stack the
# subsamples in this order with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
samples_per_sentiment = [('positive', 175), ('negative', 35), ('neutral', 70)]
data = pd.concat(
    [raw_data[raw_data['sentiment'] == sentiment].sample(n_samples)
     for sentiment, n_samples in samples_per_sentiment],
    ignore_index=True,
)

def remove_quot_mark(string):
    """Strip one leading space and the outer double quotes from *string*.

    The steps are applied in order and independently of each other:
    one leading space, then one leading '"', then one trailing '"'.
    Empty strings (including strings that become empty after a step) are
    handled without raising.

    Parameters
    ----------
    string : str
        The raw sentence.

    Returns
    -------
    str
        The cleaned sentence.
    """
    # startswith/endswith are safe on empty strings, unlike string[0].
    if string.startswith(' '):
        string = string[1:]
    if string.startswith('"'):
        string = string[1:]
    if string.endswith('"'):
        string = string[:-1]
    return string
        
# Clean every sentence, then renumber the rows 0..n-1.
data['sentence'] = data['sentence'].map(remove_quot_mark)
data.index = np.arange(data.shape[0])

In [4]:
# Report how many sentences each sentiment class contributes.
val_counts = data['sentiment'].value_counts()

summary_lines = [
    'Number of positive sentences: ' + str(val_counts['positive']),
    'Number of negative sentences: ' + str(val_counts['negative']),
    'Number of neutral sentences: ' + str(val_counts['neutral']),
]
print('\n'.join(summary_lines))

Number of positive sentences: 175
Number of negative sentences: 35
Number of neutral sentences: 70


**The data is imbalanced! Give more weight to the negative and neutral sentences (e.g. through resampling).**

## Create column with labels (1: positive, 0: neutral, -1: negative)

In [5]:
# Encode the sentiment as an integer: +1 for positive, -1 for negative and
# 0 for everything else.
is_positive = (data['sentiment'] == 'positive').astype('int64')
is_negative = (data['sentiment'] == 'negative').astype('int64')
data['label'] = is_positive - is_negative
data.head()

Unnamed: 0,sentiment,sentence,label
0,positive,The long-standing partnership and commitment e...,1
1,positive,During the past decade it has gradually divest...,1
2,positive,The company also estimates the already carried...,1
3,positive,"In the second quarter of 2010, the group 's pr...",1
4,positive,"The OMX Nordic 40 ( OMXN40 ) index, comprising...",1


## Split data into features and labels

In [6]:
# Features are the raw sentences; targets are the integer sentiment labels.
X_train, y_train = data['sentence'], data['label']

## Define (auxiliary) functions for models based on "Bag of Words"

**label_to_dict(label):**
* The function *label_to_dict* takes as input labels in the set $\{-1, 0, 1\}$ and returns them in a nested dictionary with outer key 'cats' (categories).

* Example:
    * Input: 1
    * Output: {'cats': {'1': True, '-1': False, '0': False}}
    
**transform_bow(X_train, y_train):**
* The function *transform_bow* oversamples the underrepresented classes (negative and neutral) and returns a list of tuples (sentence, label dictionary), where each label dictionary is produced by *label_to_dict*.

**train_bow(X_train, y_train, architecture):**
* The function *train_bow* trains a model based on the "Bag of Words" representation of the training data (X_train, y_train) and returns the trained model.
* The parameter architecture can be chosen from the set $\{$'bow', 'simple_cnn', 'ensemble'$\}$.

**validate_bow(X_train, X_test, y_train, y_test, architecture):** 
* The function *validate_bow* trains the model train_bow(X_train, y_train, architecture) and returns a confusion matrix based on the test data (X_test, y_test).

In [7]:
def label_to_dict(label):
    """Wrap an integer label in the nested ``{'cats': {...}}`` dictionary.

    The inner dictionary has the keys '1', '-1' and '0' (in that order);
    each value tells whether *label* equals the corresponding integer.
    """
    categories = (1, -1, 0)
    return {'cats': {str(category): label == category
                     for category in categories}}

def transform_bow(X_train, y_train):
    """Oversample the training data and pair sentences with label dicts.

    Parameters
    ----------
    X_train : pandas.Series
        The training sentences.
    y_train : pandas.Series
        The integer labels belonging to *X_train*.

    Returns
    -------
    list of tuple
        ``(sentence, label_to_dict(label))`` pairs for the resampled data.
    """
    # The sampler is fed a two-column frame: the sentences plus a constant
    # dummy column of zeros.
    dummy_column = pd.Series(np.zeros(len(X_train)), index=X_train.index)
    X_train_2d = pd.concat([X_train, dummy_column], axis=1)

    ros = RandomOverSampler()
    X_resampled, y_resampled = ros.fit_resample(X_train_2d, y_train)

    # Pick the sentence column by position rather than by the hard-coded
    # name 'sentence', so a Series with any (or no) name works.
    sentences = X_resampled.iloc[:, 0]

    y_train_dict = y_resampled.apply(label_to_dict)
    return list(zip(sentences, y_train_dict))

def train_bow(X_train, y_train, architecture):
    """Train a spaCy text categorizer on the oversampled training data.

    A blank English pipeline receives a "textcat" component with the
    exclusive labels '1', '-1' and '0' and the requested *architecture*.
    It is then updated for 10 epochs in mini-batches of 8 examples,
    reshuffling the examples before every epoch.

    NOTE(review): create_pipe / begin_training look like the spaCy v2
    training API -- confirm against the pinned spaCy version.

    Returns the trained pipeline.
    """
    nlp = spacy.blank('en')

    examples = transform_bow(X_train, y_train)

    pipe_config = {"exclusive_classes": True, "architecture": architecture}
    textcat = nlp.create_pipe("textcat", config=pipe_config)
    nlp.add_pipe(textcat)
    for category in ('1', '-1', '0'):
        textcat.add_label(category)

    optimizer = nlp.begin_training()
    losses = {}

    n_epochs = 10
    for _ in range(n_epochs):
        random.shuffle(examples)
        for batch in minibatch(examples, size=8):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, losses=losses)

    return nlp
    
def validate_bow(X_train, X_test, y_train, y_test, architecture):
    """Train a bag-of-words model and score it on the test data.

    The pipeline from ``train_bow`` tokenizes every test sentence, the
    text categorizer scores the documents, and the highest-scoring label
    is taken as the prediction.

    Returns the confusion matrix of the stringified true labels against
    the predictions, with classes ordered '1', '0', '-1'.
    """
    nlp = train_bow(X_train, y_train, architecture)
    textcat = nlp.get_pipe('textcat')

    tokenized = [nlp.tokenizer(sentence) for sentence in X_test]
    scores, _ = textcat.predict(tokenized)

    # Map the index of the best score back to its label string.
    best_columns = scores.argmax(axis=1)
    predicted_labels = [textcat.labels[column] for column in best_columns]

    true_labels = y_test.apply(str)
    return confusion_matrix(true_labels, predicted_labels, labels=['1', '0', '-1'])

## Define (auxiliary) functions for models based on TF-IDF

**transform_tfidf_train(X_train):** 
* The function *transform_tfidf_train* generates the tf-idf representation of the training data X_train and returns this representation and the corresponding TfidfVectorizer.

**transform_tfidf_test(X_test, vectorizer):** 
* The function *transform_tfidf_test* returns the tf-idf representation of the test data X_test based on the vectorizer.

**train_tfidf(model_type, X_train, y_train, param):** 
* The function *train_tfidf* trains a model based on the tf-idf representation of the training data (X_train, y_train) and returns the trained model.
* The parameter *model_type* can be chosen from the set $\{$'RandFor-tfidf', 'Boost-tfidf', 'LogReg'$\}$. 
* The parameter *param* represents the number of estimators ('RandFor-tfidf' and 'Boost-tfidf') or the inverse regularization strength C ('LogReg'), i.e. smaller values mean stronger regularization.

**validate_tfidf(model_type, X_train, X_test, y_train, y_test, param):** 
* The function *validate_tfidf* trains the model train_tfidf(model_type, X_train, y_train, param) and returns a confusion matrix based on the test data (X_test, y_test).

In [8]:
def transform_tfidf_train(X_train):
    """Fit a TF-IDF vectorizer on *X_train* and transform the data.

    NLTK's English stop-word list is passed to the vectorizer.

    Returns
    -------
    tuple
        ``(dense tf-idf array of X_train, fitted TfidfVectorizer)``.
    """
    english_stop_words = stopwords.words('english')
    vectorizer = TfidfVectorizer(stop_words=english_stop_words)
    tfidf_matrix = vectorizer.fit_transform(X_train)
    return tfidf_matrix.toarray(), vectorizer

def transform_tfidf_test(X_test, vectorizer):
    """Return ``vectorizer.transform(X_test)``.

    The vectorizer is only applied here, never refitted, so the test data
    is represented with whatever the vectorizer was fitted on.
    """
    return vectorizer.transform(X_test)

def train_tfidf(model_type, X_train, y_train, param):
    """Fit a classifier on TF-IDF features.

    Parameters
    ----------
    model_type : str
        One of 'RandFor-tfidf', 'LogReg' or 'Boost-tfidf'.
    X_train : array-like
        The tf-idf representation of the training sentences.
    y_train : array-like
        The training labels.
    param : int or float
        Number of estimators ('RandFor-tfidf', 'Boost-tfidf') or the value
        passed as ``C`` to LogisticRegression ('LogReg').

    Returns
    -------
    The fitted scikit-learn estimator.

    Raises
    ------
    ValueError
        If *model_type* is not one of the supported names.
    """
    if model_type == 'RandFor-tfidf':
        model = RandomForestClassifier(n_estimators=param, class_weight='balanced')
    elif model_type == 'LogReg':
        model = LogisticRegression(C=param, class_weight='balanced')
    elif model_type == 'Boost-tfidf':
        base_est = DecisionTreeClassifier(max_depth=3)
        # The base estimator is passed positionally because its keyword was
        # renamed from `base_estimator` to `estimator` in scikit-learn 1.2;
        # the positional form works with both spellings.
        model = AdaBoostClassifier(base_est, n_estimators=param)
    else:
        # Previously an unknown type fell through to model.fit with `model`
        # unbound, producing an unhelpful UnboundLocalError.
        raise ValueError(
            "Unknown model_type {!r}; expected 'RandFor-tfidf', 'LogReg' "
            "or 'Boost-tfidf'.".format(model_type))

    model.fit(X_train, y_train)
    return model

def validate_tfidf(model_type, X_train, X_test, y_train, y_test, param):
    """Train a TF-IDF based model and score it on the test data.

    The vectorizer is fitted on the training sentences only and then
    applied to the test sentences.

    Returns the confusion matrix of *y_test* against the predictions,
    with classes ordered 1, 0, -1.
    """
    train_matrix, vectorizer = transform_tfidf_train(X_train)
    test_matrix = transform_tfidf_test(X_test, vectorizer)

    classifier = train_tfidf(model_type, train_matrix, y_train, param)

    return confusion_matrix(y_test, classifier.predict(test_matrix), labels=[1, 0, -1])

## Define (auxiliary) functions for models based on Word2Vec

**transform_w2v_train(X_train, vectorizer):** 
* The function *transform_w2v_train* generates the Word2Vec representation of the training data X_train based on the vectorizer and returns the representation and the corresponding mean vector.

**transform_w2v_test(X_test, vectorizer, mean_vec):** 
* The function *transform_w2v_test* returns the Word2Vec representation of the test data based on the vectorizer, centered with the mean vector of the training data.

**train_w2v(model_type, X_train, y_train, param, vectorizer):** 
* The function *train_w2v* trains a model based on the Word2Vec representation of the training data, i.e. transform_w2v_train(X_train, vectorizer), and returns both the trained model and the mean vector.
* The parameter *model_type* can be chosen from the set $\{$'SVC', 'RidReg', 'Boost-w2v', 'RandFor-w2v'$\}$.
* The parameter *param* represents the number of estimators ('RandFor-w2v' and 'Boost-w2v') or the regularization parameters C and alpha for 'SVC' and 'RidReg', respectively.

**validate_w2v(model_type, X_train, X_test, y_train, y_test, param):** 
* The function *validate_w2v* trains the model train_w2v(model_type, X_train, y_train, param, vectorizer) and returns a confusion matrix based on the test data (X_test, y_test).

In [9]:
def transform_w2v_train(X_train, vectorizer):
    """Embed the training sentences and center them on their mean.

    Every sentence is passed through *vectorizer* and its ``.vector`` is
    collected; the column-wise mean of these vectors is then subtracted.

    Returns
    -------
    tuple
        ``(centered vectors, mean vector)``.
    """
    with vectorizer.disable_pipes():
        doc_vectors = [vectorizer(sentence).vector for sentence in X_train]

    embedded = np.array(doc_vectors)
    centroid = embedded.mean(axis=0)
    return embedded - centroid, centroid

def transform_w2v_test(X_test, vectorizer, mean_vec):
    """Embed the test sentences and shift them by *mean_vec*.

    Every sentence is passed through *vectorizer* and its ``.vector`` is
    collected; *mean_vec* is subtracted from every resulting row.
    """
    with vectorizer.disable_pipes():
        doc_vectors = [vectorizer(sentence).vector for sentence in X_test]

    return np.array(doc_vectors) - mean_vec

def train_w2v(model_type, X_train, y_train, param, vectorizer):
    """Fit a classifier on centered sentence vectors.

    Parameters
    ----------
    model_type : str
        One of 'SVC', 'RidReg', 'Boost-w2v' or 'RandFor-w2v'.
    X_train : iterable of str
        The raw training sentences.
    y_train : array-like
        The training labels.
    param : int or float
        ``C`` for 'SVC', ``alpha`` for 'RidReg', or the number of
        estimators for 'Boost-w2v' and 'RandFor-w2v'.
    vectorizer
        The pipeline handed to ``transform_w2v_train``.

    Returns
    -------
    tuple
        ``(fitted estimator, mean vector of the training embeddings)``.

    Raises
    ------
    ValueError
        If *model_type* is not one of the supported names.
    """
    # Build the estimator first so that an unknown model type fails before
    # any sentence is embedded.
    if model_type == 'SVC':
        model = SVC(C=param, class_weight='balanced')
    elif model_type == 'RidReg':
        model = RidgeClassifier(alpha=param, class_weight='balanced')
    elif model_type == 'Boost-w2v':
        base_est = DecisionTreeClassifier(max_depth=3)
        # The base estimator is passed positionally because its keyword was
        # renamed from `base_estimator` to `estimator` in scikit-learn 1.2;
        # the positional form works with both spellings.
        model = AdaBoostClassifier(base_est, n_estimators=param)
    elif model_type == 'RandFor-w2v':
        model = RandomForestClassifier(n_estimators=param, class_weight='balanced')
    else:
        # Previously an unknown type fell through to model.fit with `model`
        # unbound, producing an unhelpful UnboundLocalError.
        raise ValueError(
            "Unknown model_type {!r}; expected 'SVC', 'RidReg', 'Boost-w2v' "
            "or 'RandFor-w2v'.".format(model_type))

    X_train, mean_vec = transform_w2v_train(X_train, vectorizer)

    model.fit(X_train, y_train)
    return model, mean_vec

# Loaded spaCy pipelines keyed by name, so each one is loaded only once.
_W2V_VECTORIZER_CACHE = {}


def _load_w2v_vectorizer(name='en_core_web_lg'):
    """Return the spaCy pipeline *name*, loading it only on first use."""
    if name not in _W2V_VECTORIZER_CACHE:
        _W2V_VECTORIZER_CACHE[name] = spacy.load(name)
    return _W2V_VECTORIZER_CACHE[name]


def validate_w2v(model_type, X_train, X_test, y_train, y_test, param):
    """Train a Word2Vec based model and score it on the test data.

    The test sentences are embedded with the same pipeline as the
    training sentences and centered with the training mean vector.

    Returns the confusion matrix of *y_test* against the predictions,
    with classes ordered 1, 0, -1.
    """
    # Cross-validation calls this function once per fold and candidate;
    # reusing the already loaded pipeline avoids reloading 'en_core_web_lg'
    # every time. The pipeline is only used to embed text here.
    vectorizer = _load_w2v_vectorizer()

    model, mean_vec = train_w2v(model_type, X_train, y_train, param, vectorizer)
    X_test = transform_w2v_test(X_test, vectorizer, mean_vec)
    predicted_labels = model.predict(X_test)
    return confusion_matrix(y_test, predicted_labels, labels=[1, 0, -1])

## Define the cross-validation procedure

**cross_validation(representation, X_train, y_train, model, kf):** 
* The function *cross_validation* estimates the out-of-sample accuracy (the fraction of correctly classified validation sentences, averaged over the folds) of the *model* based on the *representation*.

In [10]:
def cross_validation(representation, X_train, y_train, model, kf):
    """Estimate the out-of-sample accuracy of *model* by cross-validation.

    Parameters
    ----------
    representation : str
        One of 'bow', 'tfidf' or 'word2vector'.
    X_train : pandas.Series
        The training sentences.
    y_train : pandas.Series
        The training labels, aligned with *X_train*.
    model : list or str
        For 'bow' either the architecture name or a list whose first item
        is the architecture name. For 'tfidf' and 'word2vector' a sequence
        ``[model_type, param]``.
    kf
        A splitter whose ``split(X)`` yields (train, validation) positions.

    Returns
    -------
    float
        The mean over all folds of trace(confusion matrix) / sum(confusion
        matrix), i.e. the share of correctly classified sentences.

    Raises
    ------
    ValueError
        If *representation* is not one of the supported names.
    """
    if representation not in ('bow', 'tfidf', 'word2vector'):
        raise ValueError(
            "Unknown representation {!r}; expected 'bow', 'tfidf' or "
            "'word2vector'.".format(representation))

    fold_accuracies = []
    # Split the data that was actually passed in instead of relying on a
    # global index defined elsewhere in the notebook.
    for train_idx, val_idx in kf.split(X_train):

        X_train_cv = X_train.iloc[train_idx]
        X_val = X_train.iloc[val_idx]
        y_train_cv = y_train.iloc[train_idx]
        y_val = y_train.iloc[val_idx]

        if representation == 'bow':
            # The candidates are stored as one-element lists; the text
            # categorizer needs the architecture name itself, not the list.
            architecture = model if isinstance(model, str) else model[0]
            confusion_mat = validate_bow(X_train_cv, X_val, y_train_cv, y_val, architecture)

        elif representation == 'tfidf':
            confusion_mat = validate_tfidf(model[0], X_train_cv, X_val, y_train_cv, y_val, model[1])

        else:
            confusion_mat = validate_w2v(model[0], X_train_cv, X_val, y_train_cv, y_val, model[1])

        # Diagonal entries are the correctly classified sentences.
        fold_accuracies.append(np.trace(confusion_mat) / np.sum(confusion_mat))

    return np.mean(fold_accuracies)

## Choose the model with the lowest estimated out-of-sample error based on the CV

In the following, the model with the lowest (estimated) out-of-sample error, based on a 10-fold cross-validation is determined.

In [12]:
n_folds = 10
# Positional row indices of the training data (kept as a module-level name
# because other cells may read it).
index_test = range(len(y_train))
# A fixed random_state makes every kf.split() call produce the same folds,
# so all candidate models are scored on identical train/validation splits.
# Without it the folds are reshuffled for every candidate.
kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)

# Candidates are lists so that the score can be appended when printing.
bow_models = [[architecture] for architecture in ['bow', 'simple_cnn', 'ensemble']]

tfidf_models = [['LogReg', x/10] for x in range(1, 11)] + \
    [['Boost-tfidf', x] for x in [50, 100, 150, 200]] + \
    [['RandFor-tfidf', x] for x in [50, 100, 150, 200]]

w2v_models = [['SVC', x] for x in range(1, 11)] + \
    [['RidReg', x/10] for x in range(1, 11)] + \
    [['Boost-w2v', x] for x in [50, 100, 150, 200]] + \
    [['RandFor-w2v', x] for x in [50, 100, 150, 200]]

candidates_by_representation = (
    ('bow', bow_models),
    ('tfidf', tfidf_models),
    ('word2vector', w2v_models),
)

# Keep the candidate with the highest cross-validated score.
best_model = None
best_precision = 0
for representation, model_list in candidates_by_representation:
    for model in model_list:
        model_precision = cross_validation(representation, X_train, y_train, model, kf)
        print(model + [model_precision])
        if best_model is None or model_precision > best_precision:
            best_model = model
            best_precision = model_precision

print(best_model)

['bow', 0.6321428571428572]
['simple_cnn', 0.5892857142857142]
['ensemble', 0.5964285714285714]
['LogReg', 0.1, 0.6]
['LogReg', 0.2, 0.5857142857142856]
['LogReg', 0.3, 0.5821428571428571]
['LogReg', 0.4, 0.6071428571428572]
['LogReg', 0.5, 0.6249999999999999]
['LogReg', 0.6, 0.6607142857142857]
['LogReg', 0.7, 0.6392857142857143]
['LogReg', 0.8, 0.65]
['LogReg', 0.9, 0.6500000000000001]
['LogReg', 1.0, 0.6464285714285714]
['Boost-tfidf', 50, 0.5464285714285715]
['Boost-tfidf', 100, 0.5535714285714286]
['Boost-tfidf', 150, 0.5678571428571428]
['Boost-tfidf', 200, 0.5678571428571428]
['RandFor-tfidf', 50, 0.6142857142857143]
['RandFor-tfidf', 100, 0.6249999999999999]
['RandFor-tfidf', 150, 0.6428571428571429]
['RandFor-tfidf', 200, 0.6285714285714287]
['SVC', 1, 0.642857142857143]
['SVC', 2, 0.642857142857143]
['SVC', 3, 0.6785714285714286]
['SVC', 4, 0.6964285714285714]
['SVC', 5, 0.6785714285714286]
['SVC', 6, 0.6821428571428572]
['SVC', 7, 0.6892857142857143]
['SVC', 8, 0.69642857142

## Train the model with the lowest (estimated) out-of-sample error

In the following, the model with the lowest (estimated) out-of-sample error is trained based on the whole data set. The expected out-of-sample error is (at least theoretically) less than or equal to the out-of-sample error of the model based on less data.

In [13]:
# Retrain the selected candidate on the full training set and persist the
# artefacts of the matching representation.
if best_model in bow_models:
    final_model = train_bow(X_train, y_train, best_model[0])
    dump(final_model, 'final_model.joblib')
elif best_model in tfidf_models:
    # Use a separate name for the tf-idf matrix so that X_train keeps
    # holding the raw sentences for any later cell.
    X_train_tfidf, vectorizer = transform_tfidf_train(X_train)
    final_model = train_tfidf(best_model[0], X_train_tfidf, y_train, best_model[1])
    dump(final_model, 'final_model.joblib')
    dump(vectorizer, 'vectorizer.joblib')
elif best_model in w2v_models:
    vectorizer = spacy.load('en_core_web_lg')
    final_model, mean_vec = train_w2v(best_model[0], X_train, y_train, best_model[1], vectorizer)
    dump(final_model, 'final_model.joblib')
    dump(mean_vec, 'mean_vec.joblib')
    # NOTE(review): the spaCy pipeline itself is not dumped here;
    # presumably it is reloaded with spacy.load at inference time -- confirm.

**"In sample" confusion matrix**

In [12]:
# This cell uses `vectorizer` and `mean_vec` together with
# transform_w2v_test, i.e. it expects a Word2Vec based final model.
X_train_w2v = transform_w2v_test(X_train, vectorizer, mean_vec)
y_pred = final_model.predict(X_train_w2v)
in_sample_confusion = confusion_matrix(y_train, y_pred, labels=[1, 0, -1])
print(in_sample_confusion)

[[175   0   0]
 [  0  70   0]
 [  0   0  35]]
