In [None]:
from gensim.models import KeyedVectors

###Read the word2vec vectors from the GoogleNews file, which is stored in binary format
embeddingWords = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
###Import all classes needed
import re
import pandas as pd
import seaborn as sn
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import word_tokenize
from gensim import models
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

###Read the training set and discard every row that contains a missing value
df_train = pd.read_csv('TrainingTwitterFinal20K.csv').dropna(axis=0)

###Contraction / slang expansions used by clean(), applied in this order.
###Specific forms come before the generic "'s" rule so that rule cannot shadow them.
_CLEAN_SUBSTITUTIONS = [
    (re.compile(r"\bwhats\b", re.IGNORECASE), "what is"),
    (re.compile(r"can't", re.IGNORECASE), "cannot"),
    (re.compile(r"don't", re.IGNORECASE), "do not"),
    (re.compile(r"aren't", re.IGNORECASE), "are not"),
    (re.compile(r"isn't", re.IGNORECASE), "is not"),
    (re.compile(r"what's", re.IGNORECASE), "what is"),
    (re.compile(r"it's", re.IGNORECASE), "it is"),
    (re.compile(r"how's", re.IGNORECASE), "how is"),
    (re.compile(r"dont", re.IGNORECASE), "do not"),
    (re.compile(r"'ve", re.IGNORECASE), " have"),
    (re.compile(r"'re", re.IGNORECASE), " are"),
    (re.compile(r"'m", re.IGNORECASE), " am"),
    (re.compile(r"'s", re.IGNORECASE), " is"),
    (re.compile(r"jrk", re.IGNORECASE), "jerk"),
    (re.compile(r"shoulda", re.IGNORECASE), "should have"),
]

###A web link: everything from the scheme (or "www.") up to the next whitespace
_URL_PATTERN = re.compile(r"(?:https?://|www\.)\S+", re.IGNORECASE)

###A comma sitting between two digits, i.e. the thousands separator in 15,000
_NUMBER_COMMA_PATTERN = re.compile(r"(?<=[0-9]),(?=[0-9])")

###Lazily filled cache so the stop-word file and NLTK resources are loaded once,
###not once per cleaned sentence
_CLEAN_RESOURCES = {}


def _get_clean_resources(stopword_path="list.txt"):
    """Return ``(stop_words, lemmatizer)`` for clean(), loading them on first use.

    The custom stop words are read from the first line of ``stopword_path``:
    double quotes and whitespace are stripped and the rest is split on commas.
    They are merged with NLTK's English stop-word list into one set.
    """
    if not _CLEAN_RESOURCES:
        ###The with-block guarantees the file handle is closed again
        with open(stopword_path, "r") as stopword_file:
            first_line = stopword_file.readline()

        custom_words = set(re.sub(r'["\s]', '', first_line).split(','))
        custom_words.discard('')

        _CLEAN_RESOURCES['stop_words'] = set(stopwords.words('english')) | custom_words
        _CLEAN_RESOURCES['lemmatizer'] = WordNetLemmatizer()

    return _CLEAN_RESOURCES['stop_words'], _CLEAN_RESOURCES['lemmatizer']


###Method to clean each sentence
def clean(text):
    """Normalise one sentence and return its distinct, lemmatized tokens.

    Steps, in order: web links are collapsed to the token ``URL``, common
    contractions and a few slang spellings are expanded, commas inside
    numbers are removed (15,000 -> 15000), every remaining run of
    non-word characters becomes a single space, the text is tokenized,
    stop words are dropped and the surviving tokens are lemmatized.

    Tokens are compared against the stop words as-is (no lower-casing).

    Parameters
    ----------
    text : object
        The raw sentence. Anything that is not a non-empty ``str`` is
        treated as "no text".

    Returns
    -------
    list of str or str
        Alphabetically sorted list of unique lemmatized tokens, or the
        empty string ``''`` when ``text`` is empty or not a string.
    """
    if not isinstance(text, str) or text == '':
        return ''

    ###Collapse links first, while their punctuation is still intact
    text = _URL_PATTERN.sub(' URL ', text)

    for pattern, replacement in _CLEAN_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)

    ###remove comma between numbers, i.e. 15,000 -> 15000
    ###(must run before punctuation is stripped, otherwise the comma is already gone)
    text = _NUMBER_COMMA_PATTERN.sub('', text)

    ###Everything that is still punctuation or a symbol becomes a word separator
    text = re.sub(r'\W+', ' ', text)

    stop_words, lemmatizer = _get_clean_resources()

    ###remove stopwords, lemmatize what is left and de-duplicate
    kept = {
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    }

    ###Return a list of words
    return sorted(kept)

In [None]:
###Run clean() over every training comment and store the result back in the same column
cleaned_comments = df_train['comment'].apply(clean)
df_train['comment'] = cleaned_comments

In [None]:
###sum the vectors given a certain size
def sumvector(vec1, vec2 ,size):
    """Add two vectors element by element and return the sums as a list.

    A vector passed as ``None`` is replaced by ``size`` zeros. The result
    has one entry per element of the first vector; the second vector is
    indexed at the same positions.
    """
    left = [0] * size if vec1 is None else vec1
    right = [0] * size if vec2 is None else vec2

    return [left[idx] + right[idx] for idx in range(len(left))]

In [None]:
###Join every cleaned comment back into one space-separated string
allSentences = [' '.join(tokens) for tokens in df_train['comment']]

###Fit the vectorizer on those strings and build the count matrix
vectorizer = CountVectorizer()
sentenceVectors = vectorizer.fit_transform(allSentences)
counter = 0

In [None]:
###Re-weight the count matrix with tf-idf; the fitted transformer is kept in `tfidf`
tfidf = TfidfTransformer()
train = tfidf.fit_transform(sentenceVectors)
train.shape  # only rendered by the notebook when it is the cell's last expression

###One label per training row, taken from the 'positivity' column
labels = df_train['positivity'].values.tolist()

###Fit a Multinomial Naive Bayes classifier on the tf-idf matrix and its labels
clf = MultinomialNB()
clf.fit(train, labels)

In [None]:
###Retrieving the test data
df_test = pd.read_csv('2004RCnew.csv')

###Apply clean
df_test['comment'] = df_test['comment'].apply(clean)

###Compute labels for the test data
###(note: this rebinds `labels`, which held the training labels until now)
labels = df_test['positivity'].values

###Join every cleaned comment back into one space-separated string
allSentences2 = [' '.join(sentence) for sentence in df_test['comment']]

###Count the words using the vocabulary that was fitted on the training data
sentenceVectors2 = vectorizer.transform(allSentences2)

###The classifier was trained on tf-idf weighted features, so the test counts
###must go through the same fitted transformer before prediction
testVectors = tfidf.transform(sentenceVectors2)

###calculate accuracy of the classifier
predicted = clf.predict(testVectors)

print(np.mean(predicted == labels))

In [None]:
###Method to print a complete confusion matrix
def plot_confusion_matrix(y_true, y_pred,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues,
                          classes=None):
    """
    Compute the confusion matrix of ``y_pred`` against ``y_true`` and plot it.

    Parameters
    ----------
    y_true, y_pred :
        True and predicted labels, passed unchanged to ``confusion_matrix``.
    normalize : bool
        When true, every row is divided by its row total, so each cell is
        the fraction of that true class. Rows whose total is zero are left
        at 0 instead of producing NaN.
    title : str or None
        Plot title. A default depending on ``normalize`` is used when empty.
    cmap :
        Colormap handed to ``imshow``.
    classes : sequence of str or None
        Tick labels for both axes, in the row/column order of the matrix.
        Defaults to ``('Negative', 'Positive')``.

    Returns
    -------
    The matplotlib axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    if classes is None:
        classes = ('Negative', 'Positive')

    ###Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        ###Divide each row by its total; `where` skips rows that sum to zero
        ###so they stay at the 0.0 the output buffer was initialised with
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)

    ###We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    ###Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    ###Loop over data dimensions and create text annotations.
    ###Cells above half of the maximum get white text so it stays readable
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
###Draw the raw-count confusion matrix of the test predictions and display it
plot_confusion_matrix(
    y_true=labels,
    y_pred=predicted,
    title='Confusion matrix, without normalization',
)

plt.show()