In [1]:
#LIBRARIES
import random
import pandas as pd
import json

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score

import pickle
import warnings
warnings.filterwarnings("ignore")

#DEFINING CLASSES
class Sentiment:
  """String labels used as the two classification targets."""
  POSITIVE = 'POSITIVE'
  NEGATIVE = 'NEGATIVE'

class Review:
  """One review: its text, its numeric score and the label derived from the score."""

  def __init__(self, text, score):
    self.text = text
    self.score = score
    # The label is computed once, at construction time, from the score.
    self.sentiment = self.get_sentiment()

  def get_sentiment(self):
    """Return Sentiment.NEGATIVE when the score equals 0, Sentiment.POSITIVE otherwise."""
    return Sentiment.NEGATIVE if self.score == 0 else Sentiment.POSITIVE

class ReviewContainer:
  """Wraps a list of review objects and exposes their texts and labels as parallel lists."""

  def __init__(self, reviews):
    self.reviews = reviews

  def get_text(self):
    """Return the text of every review, in the container's current order."""
    return [x.text for x in self.reviews]

  def get_sentiment(self):
    """Return the sentiment label of every review, in the container's current order."""
    return [x.sentiment for x in self.reviews]

  def evenly_distribute(self, max_per_class=2500):
    """Rebalance the container so both classes hold the same number of reviews.

    Keeps the first ``per_class`` NEGATIVE and the first ``per_class`` POSITIVE
    reviews, where ``per_class`` is the smaller class size capped at
    ``max_per_class``, then shuffles the result. ``self.reviews`` is replaced
    by a new list; the list originally passed in is not modified.

    Args:
      max_per_class: upper bound on the number of reviews kept per class.
    """
    negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
    positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]
    # Cut both classes to the same length; slicing each to a fixed size would
    # leave the set unbalanced whenever one class is smaller than the cap.
    per_class = max(0, min(len(negative), len(positive), max_per_class))
    self.reviews = negative[:per_class] + positive[:per_class]
    random.shuffle(self.reviews)
    
def TestingReviews(amount=5):
    """Print `amount` randomly chosen test reviews, each followed by the model's prediction.

    Reads the notebook-level globals `test_data` (its `user_review` column),
    `vectorizer` and `clf_svc`, so those must be defined before calling.
    Rows are drawn independently, so the same review can be picked more than once.

    Args:
        amount: number of reviews to sample and classify.
    """
    limit = len(test_data)
    for _ in range(amount):
        rand_row = random.randrange(limit)
        # Positional lookup: rand_row is a row position, not an index label.
        row_to_test = [test_data.user_review.iloc[rand_row]]
        row_test = vectorizer.transform(row_to_test)
        print(row_to_test)
        print(clf_svc.predict(row_test))

### DEFINING AND PREPARING THE DATA

We will load our training reviews (a CSV file) and keep only the two columns we need to train our model: `recommendation` (the label) and `review` (the review text). Each row is then wrapped in our `Review` class.

We will also rename those columns to make them easier to work with. If the classes turn out to be imbalanced, we will evenly distribute them to avoid bias in our model.

In [2]:
# Labelled reviews; the training and evaluation sets are both derived from this frame.
df = pd.read_csv('./data/train_reviews.csv')

#TEST DATA FOR FURTHER TESTING
# Separate file; in this notebook it is read by TestingReviews for spot checks.
test_data = pd.read_csv('./data/test_reviews.csv')

In [3]:
# Rename the two columns of interest, keep only them, then encode the label as 0/1.
df_train = (
    df.rename(columns={"recommendation": "score", "review": "text"})
      .filter(['text','score'])
)
df_train['score'] = df_train['score'].replace({'Not Recommended': 0, 'Recommended': 1})

In [4]:
#CLEANING SPECIAL CHARACTERS, WHICH MAY CONFUSE THE MODEL
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–"]

# Remove every listed character in one pass with a translation table.
# Each character is matched literally, so the result does not depend on the
# pandas version's default for `regex` in str.replace (many of these
# characters are regex metacharacters), and the column is scanned once
# instead of once per character. Missing values stay missing.
strip_table = str.maketrans('', '', ''.join(spec_chars))
df_train['text'] = df_train['text'].str.translate(strip_table)

In [5]:
# Discard rows with a missing text or score, then display the remaining null count per column.
df_train = df_train.dropna()
df_train.isna().sum()

text     0
score    0
dtype: int64

In [6]:
# One Review per row. Zipping the two columns yields the same (text, score)
# pairs as DataFrame.iterrows() but skips building a Series for every row,
# which is very slow on a frame of this size.
reviews = [
  Review(text, score)
  for text, score in zip(df_train['text'], df_train['score'])
]

In [7]:
# Number of rows per label value, to check how balanced the two classes are.
df_train.groupby('score').count() #MIGHT NEED TO BALANCE 0 AND 1 TO TRAIN THE MODEL BETTER

Unnamed: 0_level_0,text
score,Unnamed: 1_level_1
0,130624
1,302751


### DEFINING THE MODEL AND TRAINING

From previous experience, a support-vector machine model provides the best results for user reviews. However, we will test Decision Trees too to evaluate the output with this dataset.

In [8]:
# Hold out 20% of the reviews for evaluation; the fixed random_state keeps the split repeatable.
training, test = train_test_split(reviews, test_size=0.2, random_state=42)

# Wrap each split so texts and labels can be extracted as parallel lists.
train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

In [9]:
# evenly_distribute() replaces each container's review list with a shuffled,
# subsampled one, so re-running this cell works on the already reduced data.
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment() 

# The held-out set is subsampled the same way, so scores below are measured on that subset.
test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

# Sanity check: texts and labels must have the same length.
print(len(train_x))
print(len(train_y))

5000
5000


#### VECTORISING THE DATA

We will use a Tfidf Vectorizer here; a Count Vectorizer can be swapped in to evaluate which delivers the best prediction.

In [10]:
vectorizer = TfidfVectorizer()

# Fit on the training texts only; the test texts are transformed with that
# same fitted vectorizer.
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

#### SVM MODEL

In [11]:
%%time
# Baseline linear-kernel SVM trained on the TF-IDF training vectors.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

Wall time: 3.84 s


SVC(kernel='linear')

#### DECISION TREE MODEL

In [12]:
%%time
# Decision tree with default settings; no random_state is set here.
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

Wall time: 971 ms


DecisionTreeClassifier()

#### NAIVE BAYES

In [13]:
# clf_nb = GaussianNB()
# clf_nb.fit(train_x_vectors.todense(), train_y)

#### MEAN ACCURACY OF EACH MODEL & F1 SCORE

In [13]:
# Score of each fitted classifier on the held-out vectors (SVM first, then decision tree).
for fitted_model in (clf_svm, clf_dec):
    print(fitted_model.score(test_x_vectors, test_y))
#print(clf_nb.score(test_x_vectors.todense(), test_y))

0.8278
0.714


In [14]:
# Per-class F1 for each fitted classifier; values are reported in the order POSITIVE, NEGATIVE.
f1_label_order = [Sentiment.POSITIVE, Sentiment.NEGATIVE]
for fitted_model in (clf_svm, clf_dec):
    print(f1_score(test_y, fitted_model.predict(test_x_vectors), average=None, labels=f1_label_order))
#print(f1_score(test_y, clf_nb.predict(test_x_vectors.todense()), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE]))

[0.82797203 0.82762763]
[0.7080441  0.71971776]


### GRID SEARCH
Optimising the model further.

In [15]:
%%time
# Search grid: 2 kernels x 5 values of C = 10 candidate settings.
parameters = {'kernel': ('linear','rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
# Each candidate is evaluated with 5-fold cross-validation on the training vectors.
clf_svc = GridSearchCV(svc, parameters, cv=5)

clf_svc.fit(train_x_vectors, train_y)

Wall time: 3min 29s


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [17]:
# Score of the grid-searched SVC on the held-out test vectors.
print(clf_svc.score(test_x_vectors, test_y))

0.827


In [18]:
%%time
# Search grid: 2 split criteria x 18 depth limits = 36 candidate settings.
tree_para = {'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}

# Each candidate is evaluated with 5-fold cross-validation on the training vectors.
clf_gridtree = GridSearchCV(DecisionTreeClassifier(), tree_para, cv=5)
clf_gridtree.fit(train_x_vectors, train_y)

Wall time: 3min 29s


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30,
                                       40, 50, 70, 90, 120, 150]})

In [19]:
# Score of the grid-searched decision tree on the held-out test vectors.
print(clf_gridtree.score(test_x_vectors, test_y))

0.7225


### FURTHER TESTING

You have two options to test the model: input your own text (a review), or run the TestingReviews function, which will randomly pick X independent reviews (not part of the training data) to categorise.

In [16]:
#SINGLE TESTING A SHORT REVIEW
# The input must be a list of strings, even for a single review.
test_set = ['I was a bit confused about the gameplay so I didnt enjoyed the game this time, even though the sequel was great']

# Reuse the already fitted vectorizer (transform only, no refit).
new_test = vectorizer.transform(test_set)

# Predict with the grid-searched SVC; the cell displays the predicted label array.
clf_svc.predict(new_test)

array(['POSITIVE'], dtype='<U8')

In [30]:
#RUN THIS FUNCTION (5 REVIEWS ARE TESTED BY DEFAULT) TO TRY THE MODEL WITH A DIFFERENT REVIEWS DATASET
TestingReviews(3)

['Early Access ReviewOpen the game crash, open it again works well.Jump into a game try to land, see a car, land on the car, fall into the ground.After you rage you come back to the game, land, see an AR try to take it immediately get two tapped by R380.New case, buy them all, get sh*t, repeat when another come out.11/10 IGN will suicide again']
['POSITIVE']
['So i found this game on my recommendation tab.I thought "Oh another Weaboo game, let\'s try it"First 10 minutes playing : Too slow, need some hacks-----Hacks Initiated------Stats : MaxedJobs : MaxedMultiplier : 1.00xParents : 10000% Ashamed-------Auto Click Online-----Passion : MaxedMoney : High but not maxedGifts: Can\'t afford a shoe being an astronautHands: Free as can beResult : I got dishonred, kicked out and expelled from my building. All that in less than 3 hours.10/10 Would do everything again.']
['POSITIVE']
["Thank god i didn't waste money with this game.Too many technical issues that makes this game unplayable, literal

### SAVING THE MODEL

We will be saving the SVM model as it is roughly 10 percentage points more accurate than the Decision Tree model (0.827 vs 0.7225 after grid search). Saving it means we will not have to go through the training again in the future if we want to categorise more reviews.

In [18]:
filename = './data/finalized_model.sav'

#SAVING THE MODEL AND THE VECTORIZER
# Context managers make sure each file is flushed and closed once the dump
# finishes, instead of leaving the handle open for the life of the kernel.
with open(filename, 'wb') as model_file:
    pickle.dump(clf_svc, model_file)

# NOTE: despite its name, 'count_vect' holds the fitted TfidfVectorizer, and it
# is written to the working directory rather than ./data. The path is kept
# unchanged in case other code loads the file by this name.
with open('count_vect', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)