### Imports ###

In [13]:
import pandas as pd
import json
from nltk.stem import PorterStemmer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import nltk
import string
from nltk.corpus import stopwords
from itertools import compress

import gzip
import gensim
import logging
from sklearn.metrics import accuracy_score

import re
import numpy as np
from functools import reduce

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

### Transforming Sample Data ###

In [14]:
reviews_sample = pd.read_csv('./Data/reviews_sample.csv')

In [None]:
# feature additions to detect polarity

reviews_sample['num_exclamation'] = reviews_sample['text'].map(lambda x: x.count('!'))
reviews_sample['percent_UPPER'] = reviews_sample['text'].map(lambda x: len(re.findall(r'[A-Z]', x)) / len(x))
reviews_sample['frowny'] = reviews_sample['text'].map(lambda x: len(re.findall(':\(', x)) + len(re.findall(':-\(', x)))

In [24]:
# feature additions to detect positive vs negative

#reviews_sample['len_review'] = reviews_sample['text'].map(lambda x: len(x.split(' ')))
reviews_sample['smiley'] = reviews_sample['text'].map(lambda x: len(re.findall(':\)', x)) + len(re.findall(':-\)', x)))

In [25]:
# categorizing rating as positive or negative

def pos_or_neg(stars):
    if(stars < 4):
        return 'neg'
    else:
        return 'pos'
    
reviews_sample['pos_or_neg'] = reviews_sample['stars'].map(lambda x: pos_or_neg(x)) 

In [26]:
# categorizing rating as strong opinion (1 or 5 stars) or weak

def is_polarized(stars):
    if((stars == 1) | (stars == 5)):
        return 1
    else:
        return 0
   
reviews_sample['polarized'] = reviews_sample['stars'].map(lambda x: is_polarized(x))

In [27]:
ps = PorterStemmer()

processed_text = reviews_sample['text'].map(lambda x: x.translate(str.maketrans('', '', string.punctuation))) # remove punctuation
processed_text = processed_text.str.lower() # make all lower case

reviews_sample['tokens'] = processed_text.map(lambda x: nltk.word_tokenize(x)) # tokenize words

In [28]:
reviews_sample['tokens'] = reviews_sample['tokens'].map(lambda x: [ps.stem(y) for y in x]) # stemming words

In [29]:
word2vec_model = gensim.models.Word2Vec(
    reviews_sample['tokens'],
    size = 150,
    window = 10,
    min_count = 10,
    workers = 10,
    iter = 10)

2020-09-02 13:09:26,903 : INFO : collecting all words and their counts
2020-09-02 13:09:26,904 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-09-02 13:09:27,312 : INFO : collected 22954 word types from a corpus of 1056036 raw words and 10000 sentences
2020-09-02 13:09:27,316 : INFO : Loading a fresh vocabulary
2020-09-02 13:09:27,348 : INFO : effective_min_count=10 retains 4183 unique words (18% of original 22954, drops 18771)
2020-09-02 13:09:27,350 : INFO : effective_min_count=10 leaves 1017549 word corpus (96% of original 1056036, drops 38487)
2020-09-02 13:09:27,385 : INFO : deleting the raw counts dictionary of 22954 items
2020-09-02 13:09:27,390 : INFO : sample=0.001 downsamples 58 most-common words
2020-09-02 13:09:27,396 : INFO : downsampling leaves estimated 741350 word corpus (72.9% of prior 1017549)
2020-09-02 13:09:27,428 : INFO : estimated required memory for 4183 words and 150 dimensions: 7111100 bytes
2020-09-02 13:09:27,431 : INFO : rese

2020-09-02 13:09:33,478 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-09-02 13:09:33,480 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-09-02 13:09:33,481 : INFO : EPOCH - 7 : training on 1056036 raw words (741609 effective words) took 0.8s, 903822 effective words/s
2020-09-02 13:09:34,321 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-09-02 13:09:34,338 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-09-02 13:09:34,342 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-09-02 13:09:34,345 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-09-02 13:09:34,355 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-09-02 13:09:34,362 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-09-02 13:09:34,363 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-09-02 13:09:34,370 : INFO : worker threa

In [30]:
def sentence_embeddings(sentence):
    sum = 0

    for i in sentence:
        if i in word2vec_model.wv:
            sum += word2vec_model[i]
        else:
            pass
    return (sum / len(sentence))

reviews_sample['embeddings'] = reviews_sample['tokens'].map(lambda x: sentence_embeddings(x)) # creating word embeddings

  


In [31]:
reviews_sample_slim = reviews_sample.drop(['text', 'tokens'], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(reviews_sample_slim.drop(['stars', 'pos_or_neg'], axis = 1), reviews_sample['pos_or_neg'], test_size=0.33, random_state=42)

def reduce_observation(observation):
    return reduce(np.append, observation)

X_train = X_train.apply(lambda x: reduce_observation(x), axis = 1)
X_test = X_test.apply(lambda x: reduce_observation(x), axis = 1)

In [35]:
model = LinearSVC(random_state=0)
model.fit(X_train.tolist(), y_train)

y_pred = model.predict(X_test.tolist())
accuracy_score(y_test, y_pred)

0.8624242424242424

In [36]:
confusion_matrix(y_test, y_pred)

array([[ 907,  242],
       [ 212, 1939]], dtype=int64)

In [38]:
classified_samples = pd.concat([X_test[y_pred == y_test], y_test[y_pred == y_test]], axis=1)
classified_samples['pred_stars'] = list(compress(y_pred, y_pred==y_test))

classified_samples = classified_samples.rename(columns={'stars': 'actual_stars'})
classified_samples.head()

Unnamed: 0,0,pos_or_neg,pred_stars
6252,"[0.41028234362602234, -0.18730618059635162, 0....",pos,pos
4684,"[-0.074729785323143, -0.24098823964595795, -0....",pos,pos
1731,"[0.09801438450813293, -0.889679491519928, 0.49...",neg,neg
4742,"[0.18766193091869354, -0.517037034034729, 0.17...",neg,neg
4521,"[0.23474937677383423, -0.06934019178152084, -0...",neg,neg


In [40]:
misclassified_samples = pd.concat([X_test[y_pred != y_test], y_test[y_pred != y_test]], axis=1)
misclassified_samples['pred_stars'] = list(compress(y_pred, y_pred!=y_test))

misclassified_samples = misclassified_samples.rename(columns={'stars': 'actual_stars'})
misclassified_samples.head()

Unnamed: 0,0,pos_or_neg,pred_stars
439,"[0.3753794729709625, -0.5895034670829773, -0.0...",pos,neg
582,"[0.06692862510681152, -0.5167813897132874, 0.6...",pos,neg
2249,"[0.4236672818660736, -0.15592585504055023, -0....",neg,pos
9485,"[-0.12759684026241302, 0.05899722874164581, -0...",neg,pos
4947,"[-0.20715832710266113, -0.24915523827075958, -...",pos,neg
