#Import and clean data

##Importing the data

In [1]:
!pip install -U spacy==3.*
!python -m spacy download en_core_web_sm
!python -m spacy info

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy==3.*
  Downloading spacy-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.4.4
    Uninstalling spacy-3.4.4:
      Successfully uninstalled spacy-3.4.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-sm 3.4.1 requires spacy<3.5.0,>=3.4.0, but you have spacy 3.5.0 which is incompatible.[0m[31m
[0mSuccessfully installed spacy-3.5.0
2023-02-27 02:18:11.503323: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library

In [74]:
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [75]:
df = pd.read_csv('Reviews.csv')

In [76]:
df['Score'].value_counts()

5    363122
4     80655
1     52268
3     42640
2     29769
Name: Score, dtype: int64

In [77]:
#On 10000 rows, count vectorizer gives f1 of 0.15
#On 10000 rows, count vectorizer gives f1 of 0.16
#One reason for above small f1 might be target data is imbalanced
#After removing target 5 which creates bias, F1 score on initial testing set: 0.46145917730330815 
# F1 score on initial testing set on Random Forest: 0.5051871671334535
# F1 score on initial testing set on Random Forest: 0.5051871671334535
# F1 score on initial testing set on Decision Tree: 0.46145917730330815
df = df[:100000]

##Feature Processing

-> Remove all the characters

-> Keep or remove stop words? We'll see

In [78]:
features = df['Text']
targets = df['Score']

In [79]:
new_features = []
new_targets = []

for i in range(0, features.shape[0]):
  if targets[i] != 5:
    new_features.append(features[i])
    new_targets.append(targets[i])

In [80]:
new_targets_df = pd.DataFrame(new_targets)
new_targets_df.value_counts()

4    14643
1     9318
3     8059
2     5568
dtype: int64

In [81]:
x_train, x_test, y_train, y_test = train_test_split(new_features, new_targets, random_state=1)

In [82]:
nlp = spacy.load('en_core_web_sm')

In [83]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [84]:
unwanted_pipes = ['ner', 'parser']

# Further remove stop words and take the lemma instead of token text.
def spacy_tokenizer(doc):
  with nlp.disable_pipes(*unwanted_pipes):
    return [t.lemma_ for t in nlp(doc) if \
            ##Try with and without punctuation, as punctuation holds some sentiment/emotion value
            not t.is_punct and \ 
            not t.is_space and \
            not t.is_stop and \
            t.is_alpha]

In [85]:
%%time
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer)
train_feature_vects = vectorizer.fit_transform(x_train)

CPU times: user 4min 3s, sys: 670 ms, total: 4min 4s
Wall time: 4min 6s


##Decision Trees

In [91]:
from sklearn.tree import DecisionTreeClassifier
# Instantiate a classifier with the default settings.
nb_classifier = DecisionTreeClassifier()
nb_classifier.fit(train_feature_vects, y_train)
nb_classifier.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [92]:
train_preds = nb_classifier.predict(train_feature_vects)
print('F1 score on initial training set: {}'.format(metrics.f1_score(y_train, train_preds, average='macro')))

F1 score on initial training set: 0.9997451781115644


In [93]:
test_feature_vects = vectorizer.transform(x_test)

In [94]:
test_preds = nb_classifier.predict(test_feature_vects)
print('F1 score on initial testing set: {}'.format(metrics.f1_score(y_test, test_preds, average='macro')))

F1 score on initial testing set: 0.46145917730330815


##Random Forest

In [95]:
from sklearn.ensemble import RandomForestClassifier

nb_classifier = RandomForestClassifier()
nb_classifier.fit(train_feature_vects, y_train)
nb_classifier.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [96]:
train_preds = nb_classifier.predict(train_feature_vects)
print('F1 score on initial training set: {}'.format(metrics.f1_score(y_train, train_preds, average='macro')))

F1 score on initial training set: 0.9997451569111642


In [97]:
test_preds = nb_classifier.predict(test_feature_vects)
print('F1 score on initial testing set: {}'.format(metrics.f1_score(y_test, test_preds, average='macro')))

F1 score on initial testing set: 0.5051871671334535
