In [1]:
import bz2
import pandas as pd
import numpy as np
import spacy 
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize, sent_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter 
from langdetect import detect
from spacy_langdetect import LanguageDetector
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.pipeline import Pipeline
import string
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem.porter import *
import scipy as sp

In [2]:
train_file = bz2.BZ2File("train.ft.txt.bz2")

# limited size to 10,000,000 due to performance issues
line_list = train_file.readlines(size=10000000)
lines = [x.decode('utf-8') for x in line_list]

# Split in two: sentiment and review
sentiment = [review.split("__label__")[1][0] for review in lines]
reviews = [review.split("__label__")[1][1:]  for review in lines]
newlist = []

for i in range(len(sentiment)):
    newlist.append([sentiment[i], reviews[i]])
df = pd.DataFrame(newlist, columns = ['score', 'review'])

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22416 entries, 0 to 22415
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   score   22416 non-null  object
 1   review  22416 non-null  object
dtypes: object(2)
memory usage: 350.4+ KB


In [4]:
df['score'].value_counts()

2    11568
1    10848
Name: score, dtype: int64

In [5]:
# Create a list of punchuation marks
punctuations = string.punctuation

# Create a list of stop words
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer
# parser = English()

# Creating tokenzer function
def spacy_tokenizer(sentence):
    # Creating token object 
    mytokens = nlp(sentence)
    
    # lemmatizing and converting each token in lower case
    mytokens = [ word.lemma_.strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    
    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations and word.isalpha()]
    
    return mytokens  

In [6]:
# generate preprocessed tokens (lowercase, lemmatize and remove stop-words)
df['tokens'] = df['review'].apply(spacy_tokenizer)

# number of tokens 
df['n_tokens'] = df['tokens'].apply(len)

In [7]:
# generate the language of each tokens
df['language'] = df['review'].apply(detect)
df['language'] = df['language'].apply(lambda x: 1 if x == 'en' else 0)
df['language'].value_counts()

1    22368
0       48
Name: language, dtype: int64

In [8]:
df.head()

Unnamed: 0,score,review,tokens,n_tokens,language
0,2,Stuning even for the non-gamer: This sound tr...,"[stuning, non, gamer, sound, track, beautiful,...",35,1
1,2,The best soundtrack ever to anything.: I'm re...,"[good, soundtrack, read, lot, review, good, ga...",36,1
2,2,Amazing!: This soundtrack is my favorite musi...,"[amazing, soundtrack, favorite, music, time, h...",59,1
3,2,Excellent Soundtrack: I truly like this sound...,"[Excellent, Soundtrack, truly, like, soundtrac...",71,1
4,2,"Remember, Pull Your Jaw Off The Floor After H...","[remember, pull, jaw, Floor, hear, play, game,...",44,1


In [9]:
df['review'][0]

' Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n'

In [10]:
print (df['tokens'][0])

['stuning', 'non', 'gamer', 'sound', 'track', 'beautiful', 'paint', 'senery', 'mind', 'recomend', 'people', 'hate', 'vid', 'game', 'music', 'play', 'game', 'Chrono', 'Cross', 'game', 'play', 'good', 'music', 'away', 'crude', 'keyboarding', 'fresh', 'step', 'grate', 'guitar', 'soulful', 'orchestra', 'impress', 'care', 'listen']


In [11]:
# in order to use the generated tokens in the vectorizer instead of using reviews sentenses
# we are creating a dummy function

def dummy_fun(doc):
    return doc

bow_vector = CountVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None, max_features=1000)  

In [12]:
# Create X & y
y = df['score']
X = sp.sparse.hstack((bow_vector.fit_transform(df['tokens']),df[['n_tokens','language']].values))
X_columns=bow_vector.get_feature_names()+df[['n_tokens','language']].columns.tolist()

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [13]:
print (X_columns[0:10])

['Aiken', 'Amazon', 'America', 'American', 'Baby', 'Batman', 'Big', 'Book', 'CD', 'Christian']


In [15]:
m = MultinomialNB()

# fit the train data into the model
m.fit(X_train, y_train)

# Predicting with test dataset
y_pred = m.predict(X_test)

#classification report & confusion matrix
print("Confusion Matrix\n",metrics.confusion_matrix(y_test,y_pred))
print("\n")
print("Classification Report\n",metrics.classification_report(y_test,y_pred))
print("\n")
print("Accuracy : ",metrics.accuracy_score(y_test,y_pred)*100)

Confusion Matrix
 [[2475  719]
 [ 610 2921]]


Classification Report
               precision    recall  f1-score   support

           1       0.80      0.77      0.79      3194
           2       0.80      0.83      0.81      3531

    accuracy                           0.80      6725
   macro avg       0.80      0.80      0.80      6725
weighted avg       0.80      0.80      0.80      6725



Accuracy :  80.23791821561338


### Using pipeline

In [16]:
# Create X & y
y = df['score']
X = df['review']

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [17]:
grid_param = {'vectorizer__ngram_range' : [(1,1),(1,2),(1,3)]}

bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, max_features=1000)

classifier = MultinomialNB()

# Create pipeline 
pipe = Pipeline ([("vectorizer", bow_vector),
                 ("classifier", classifier)])

cv = GridSearchCV(pipe, grid_param, cv=5)

# Model generation
cv.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vectorizer',
                                        CountVectorizer(max_features=1000,
                                                        tokenizer=<function spacy_tokenizer at 0x000000000F0AE5E0>)),
                                       ('classifier', MultinomialNB())]),
             param_grid={'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)]})

In [18]:
print (cv.best_params_)
print (cv.best_score_)

{'vectorizer__ngram_range': (1, 1)}
0.8126948111212565


In [19]:
m = cv.best_estimator_

# Predicting with test dataset
y_pred = m.predict(X_test)

#classification report & confusion matrix
print("Confusion Matrix\n",metrics.confusion_matrix(y_test,y_pred))
print("\n")
print("Classification Report\n",metrics.classification_report(y_test,y_pred))
print("\n")
print("Accuracy : ",metrics.accuracy_score(y_test,y_pred)*100)

Confusion Matrix
 [[2484  710]
 [ 620 2911]]


Classification Report
               precision    recall  f1-score   support

           1       0.80      0.78      0.79      3194
           2       0.80      0.82      0.81      3531

    accuracy                           0.80      6725
   macro avg       0.80      0.80      0.80      6725
weighted avg       0.80      0.80      0.80      6725



Accuracy :  80.22304832713755
