In [87]:
#LIBRARIES
import random
import pandas as pd
import json
import numpy as np

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score

import pickle
import warnings
warnings.filterwarnings("ignore")

#DEFINING CLASSES
class Sentiment:
  NEGATIVE = 'NEGATIVE 🔴'
  POSITIVE = 'POSITIVE 🟢'

class Review:
  def __init__(self, text:str, score):
    self.text = text
    self.score = score
    self.sentiment = self.get_sentiment()

  def get_sentiment(self):
    if self.score == 0:
      return Sentiment.NEGATIVE
    else:
      return Sentiment.POSITIVE

class ReviewContainer:
  def __init__(self, reviews):
    self.reviews = reviews

  def get_text(self):
    return [x.text for x in self.reviews]

  def get_sentiment(self):
    return [x.sentiment for x in self.reviews]

  def evenly_distribute(self) -> None:
    negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
    positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
    positive_shrunk = positive[:2500]
    negative_shrunk = negative[:2500]
    self.reviews = negative_shrunk + positive_shrunk
    random.shuffle(self.reviews)

def TestingReviews(test_data, model, vectorizer, amount:int=5) -> list:
  '''
  This function will ingest a list of reviews and run the model to predict the reviewer's sentiment.

  It will take 5 reviews by default, but you can pass any number you want.
  '''
  list_rand = []
  limit = len(test_data)
  rand_row = random.randrange(amount, limit, 1)
  for n in range(amount):
      rand_row = random.randrange(0, limit, 1)
      row_to_test = [test_data.user_review[rand_row]]
      row_test = vectorizer.transform(row_to_test)
      row_prediction = model.predict(row_test)[0]
      row_proba = model.predict_proba(row_test)
      row_max_proba = round(np.max(row_proba) * 100, 2)
      print(f'Review: {row_to_test}')
      print(f'Sentiment: {row_prediction} - Confidence: {row_max_proba}%')

### DEFINING AND PREPARING THE DATA

We will load our train reviews (CSV file) and apply a filter to just use the two columns we need to train our model - user_suggestion and user_review. We will append the values using our Review class.

We will also rename those columns to make it easier to work with them. If needed, we will need to evenly distribute values to avoid bias in our model.

In [17]:
#DATA FOR TRAINING
train_data: pd.DataFrame = pd.read_csv('./data/train_reviews.csv')

#DATA FOR TESTING
test_data: pd.DataFrame = pd.read_csv('./data/test_reviews.csv')

In [24]:
def data_preparing(training_data: pd.DataFrame) -> pd.DataFrame:
    '''
    This function will clean the data and prepare it for training the model.
    '''
    df_training: pd.DataFrame = training_data.rename(columns={"user_suggestion": "score", "user_review": "text"})
    df_training = df_training.filter(['text','score'])
    df_training['score']: int = df_training['score'].replace('Not Recommended', 0).replace('Recommended', 1)

    #CLEANING SPECIAL CHARACTERS, WHICH MAY CONFUSE THE MODEL
    spec_chars = ["!",'"',"#","%","&","'","(",")",
                "*","+",",","-",".","/",":",";","<",
                "=",">","?","@","[","\\","]","^","_",
                "`","{","|","}","~","–"]

    for char in spec_chars:
        df_training['text'] = df_training['text'].str.replace(char, '')

    #DROPPING NULL
    df_training.dropna(inplace=True)
    df_training.isnull().sum()
    return df_training

if __name__ == '__main__':
    df_train: pd.DataFrame = data_preparing(train_data)

#CHECKING IF WE NEED TO BALANCE 0 AND 1 TO TRAIN THE MODEL BETTER
df_train.groupby('score').count() 

In [32]:
# reviews = []

# for index, row in df_train.iterrows():
#   reviews.append(Review(row['text'], row['score']))

reviews = [Review(text, score) for text, score in zip(df_train['text'], df_train['score'])]

### DEFINING THE MODEL AND TRAINING

From previous experience, a support-vector machine model provides the best results for user reviews. However, we will test Decision Trees too to evaluate the output with this dataset.

In [80]:
def distribute_data(reviews:list) -> list:
    training, test = train_test_split(reviews, test_size=0.2, random_state=42)

    train_container = ReviewContainer(training)
    test_container = ReviewContainer(test)

    train_container.evenly_distribute()
    train_x = train_container.get_text()
    train_y = train_container.get_sentiment() 

    test_container.evenly_distribute()
    test_x = test_container.get_text()
    test_y = test_container.get_sentiment()

    print(f'X Training Data Length: {len(train_x)}')
    print(f'Y Training Data Length: {len(train_y)}')

    return train_x, train_y, test_x, test_y

train_x, train_y, test_x, test_y = distribute_data(reviews)

X Training Data Length: 5000
Y Training Data Length: 5000


#### VECTORISING THE DATA

We will use a Count Vectorizer and a Tfidf Vectorizer and evaluate which delivers the best prediction

In [36]:
vectorizer = TfidfVectorizer()

train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

#### SVM MODEL

In [64]:
%%time
clf_svm = svm.SVC(kernel='linear', probability=True)
clf_svm.fit(train_x_vectors, train_y)

CPU times: user 1min 4s, sys: 799 ms, total: 1min 5s
Wall time: 1min 6s


#### DECISION TREE MODEL

In [38]:
%%time
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

CPU times: user 2.01 s, sys: 25.6 ms, total: 2.03 s
Wall time: 2.1 s


#### MEAN ACCURACY OF EACH MODEL & F1 SCORE

In [61]:
print(f'Mean Accuracy - SVM Model: {clf_svm.score(test_x_vectors, test_y)}')
print(f'Mean Accuracy - Decision Tree Model: {clf_dec.score(test_x_vectors, test_y)}')

print(f'F1 Score - SVM Model: {f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])}')
print(f'F1 Score - Decision Tree Model: {f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])}')

Mean Accuracy - SVM Model: 0.8453843955415833
Mean Accuracy - Decision Tree Model: 0.6484709917119177
F1 Score - SVM Model: [0.86293387 0.82268109]
F1 Score - Decision Tree Model: [0.68266254 0.60602178]


### GRID SEARCH
Optimising the model further.

In [53]:
%%time
parameters = {'kernel': ('linear','rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC(probability=True)
clf_svc = GridSearchCV(svc, parameters, cv=5)

clf_svc.fit(train_x_vectors, train_y)

CPU times: user 45min 44s, sys: 25.2 s, total: 46min 9s
Wall time: 47min 7s


In [54]:
%%time
tree_para = {'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}

clf_gridtree = GridSearchCV(DecisionTreeClassifier(), tree_para, cv=5)
clf_gridtree.fit(train_x_vectors, train_y)

CPU times: user 3min 13s, sys: 2.71 s, total: 3min 15s
Wall time: 3min 23s


In [55]:
print(f'Mean Accuracy - SVC Model: {clf_svc.score(test_x_vectors, test_y)}')
print(f'Mean Accuracy - Grid Search Model: {clf_gridtree.score(test_x_vectors, test_y)}')

Mean Accuracy - SVC Model: 0.850242926550443
Mean Accuracy - Grid Search Model: 0.6250357244927122


### FURTHER TESTING

You have to options to test the model: input your own text (a review) or runt the TestingReviews function and it will randomly pick X amount of indenpent reviews (not part of the training model) to categorise.

In [71]:
#SINGLE TESTING A SHORT REVIEW
test_set = ['I was a bit confused about the gameplay so I did not like this one, even though the sequel was great']

new_test = vectorizer.transform(test_set)

prediction = str(clf_svc.predict(new_test)[0])
proba = clf_svc.predict_proba(new_test)
max_proba = round(np.max(proba) * 100, 2)

print(f'Sentiment: {prediction.title()} - Confidence: {max_proba}%')

Sentiment: Positive 🟢 - Confidence: 71.6%


In [90]:
#RUN THIS FUCTION (REVIEWS TO BE TESTED ARE 5 BY DEFAULT) TO TRY THE MODEL WITH A DIFFERENT REVIEWS DATASET
chosen_model = clf_svc
chosen_vectorizer = vectorizer

if __name__ == '__main__':
    TestingReviews(test_data, chosen_model, chosen_vectorizer, 10)

Review: ['Fun game with many classes and customisations, a lot of grinding but the battle system is kinda funny so it keeps you motivated.The game has a lot of problems but for my pov it`s not that bad. I am a solo player so I don`t care about the trading restrictions and I am playing on the EU server so no laggs either^^Don`t listen to all the haters, I get their points but all in all it`s a funny game and I would totally recommend it.']
Sentiment: NEGATIVE 🔴 - Confidence: 56.66%
Review: ['Very Fun MMO. For any Lord of the Rings fan, I recommend this title. While it says it is free to play, if you are looking to really get into the game, expect to fork over the money.']
Sentiment: POSITIVE 🟢 - Confidence: 74.38%
Review: ["This game may be slightly pay to win, but it does have lots of potential. the game seems a little grindy, however it isn't too bad, and i found that it doesn't take too much time to get some of the more expensive units.This game may have quite a few buggs and glithce

### SAVING THE MODEL

We will be saving the SVM model as it is 13% more accurate than the Decision Tree model. This process will avoid us going through the calculations again in the future if we want to categorise more reviews.

In [83]:
filename = './data/finalized_model.sav'

#SAVING THE MODEL AND THE VECTORIZER
def exporting_model(model, vectorizer) -> None:
    '''
    You can export the Model you prefer with this function, so you don't have to build the Model again, which can take some time.
    '''
    pickle.dump(model, open(filename, 'wb'))
    pickle.dump(vectorizer, open('count_vect', 'wb'))

if __name__ == '__main__':
    exporting_model(clf_svc, vectorizer)