In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import string
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
from tqdm.auto import tqdm
from nltk.probability import FreqDist
from nltk import bigrams
import csv
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import seaborn as sns

tqdm.pandas()

In [2]:
# Load the raw reviews CSV into a DataFrame.
# NOTE(review): the path is absolute ('/content/...'), so the notebook only runs
# where the file sits at exactly that location — consider a configurable data dir.
df = pd.read_csv('/content/Reviews.csv',encoding='utf-8')

In [3]:
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [4]:
# Columns that are not used by the rating classifier; after dropping them only
# 'Score' (the label) and 'Text' (the review body) remain.
not_needed = ['Id','ProductId','UserId','ProfileName','HelpfulnessNumerator','HelpfulnessDenominator','Time', 'Summary',]
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
df = df.drop(columns=not_needed, errors='ignore')


In [5]:
df.dtypes

Score     int64
Text     object
dtype: object

In [6]:
df['Score'].value_counts()

5    363122
4     80655
1     52268
3     42640
2     29769
Name: Score, dtype: int64

In [7]:
# First 100,000 rows (by position) as a smaller working subset.
# NOTE(review): the visible later cells split and train on the full df, not on this slice.
sample_data = df.iloc[:100000]

In [8]:
df.shape

(568454, 2)

In [9]:
# Fetch the NLTK resources the tokenizer defined later in this notebook relies on.
nltk.download('stopwords')  # English stop-word list read via stopwords.words('english')
nltk.download('punkt')      # presumably the model behind word_tokenize — confirm for the installed NLTK version
nltk.download('wordnet')    # corpus used by WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
test = "My husband is a Twizzlers addict.  We've bought these many times from Amazon because we're government employees living overseas and can't get them in the country we are assigned to.  They've always been fresh and tasty, packed well and arrive in a timely manner."

In [34]:
# Resources that are identical for every document. They are built once, on
# first use, instead of once per tokenized document.
_TOKENIZER_CACHE = {}


def _tokenizer_resources():
    '''Return the cached (punctuation table, stop-word set, lemmatizer) triple.'''
    if 'resources' not in _TOKENIZER_CACHE:
        # Build the whole tuple before storing it so a failure (e.g. a missing
        # NLTK corpus) never leaves a half-initialised cache behind.
        _TOKENIZER_CACHE['resources'] = (
            str.maketrans('', '', string.punctuation),
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _TOKENIZER_CACHE['resources']


def my_new_tokenizer(text):
    '''Turn a raw review string into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase the text, delete every character found in
    string.punctuation, split with word_tokenize, drop English stop words,
    and lemmatize each remaining token with WordNetLemmatizer.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    '''
    punct_table, stop_words, lemmatizer = _tokenizer_resources()
    # str.translate removes the same characters as a per-character filter,
    # in a single pass over the string.
    text = text.lower().translate(punct_table)
    tokens = word_tokenize(text)
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

In [35]:
# Split the review text (X) and star rating (y) into train and test sets:
# 20% held out for testing, fixed random_state so the split is repeatable.
# NOTE(review): this uses the full df, not the smaller sample_data slice created earlier.
scaler = StandardScaler()  # NOTE(review): created but not used by any active line in this cell
X = df['Text']
y = df['Score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state = 96)
# Scaling is currently disabled. NOTE(review): X_train / X_test hold raw text here,
# and if scaling is re-enabled the test set should go through scaler.transform
# (fitting again on the test set, as written below, would use test-set statistics).
#X_train_scaled = scaler.fit_transform(X_train)
#X_test_scaled = scaler.fit_transform(X_test)


In [36]:
y_test.shape

(113691,)

In [37]:
X_train


189580    This oil is just awesome! You can cook with it...
566157    I love this product, PB2 is great! I love bein...
481254    I bought these jimmies because I couldn't find...
173659    This little gem became a big hit with kids and...
503696    I've had a LOT of raw chocolate bars. Nothing ...
                                ...                        
415348    My cat takes two types of heart medication; on...
2412      I have been drinking Tully's Kona for the past...
337318    I really, really, really like pecans. These ar...
553946    This is a dark, chocolaty cofee. Grind works w...
374868    this drink mix is good but in civilian grocery...
Name: Text, Length: 454763, dtype: object

In [119]:
#X_train['Text'] = X_train['Text'].progress_apply(my_new_tokenizer)

Tokenizing X_train:   0%|          | 0/8000 [00:00<?, ?it/s]

In [121]:
#X_test['Text'] = X_test['Text'].progress_apply(my_new_tokenizer)

Tokenizing X_train:   0%|          | 0/2000 [00:00<?, ?it/s]

In [38]:
def generate_feature_extraction(X_train, X_test, max_features=200, ngram_range=(1, 2)):
  '''Build bag-of-words, TF-IDF and n-gram feature matrices for train and test text.

  Every vectorizer is fitted on X_train only and then applied to X_test.

  Parameters
  ----------
  X_train, X_test : iterable of str
      Raw training / testing documents.
  max_features : int, default 200
      Vocabulary size cap used by each vectorizer.
  ngram_range : tuple of (int, int), default (1, 2)
      n-gram range used for the n-gram features.

  Returns
  -------
  tuple
      (X_train_bow, X_test_bow, X_train_tfidf, X_test_tfidf,
       X_train_ngrams, X_test_ngrams) as sparse matrices.
  '''
  # Local import: the notebook's import cell only brings in the vectorizer classes.
  from sklearn.feature_extraction.text import TfidfTransformer

  # Load the stop-word list once rather than once per vectorizer.
  english_stop_words = stopwords.words('english')

  vectorizer_bow = CountVectorizer(tokenizer=my_new_tokenizer, stop_words=english_stop_words, max_features=max_features)
  X_train_bow = vectorizer_bow.fit_transform(X_train)
  X_test_bow = vectorizer_bow.transform(X_test)

  # TfidfVectorizer is CountVectorizer followed by TfidfTransformer, so with the
  # same tokenizer / stop words / max_features the TF-IDF weights can be computed
  # from the counts above. This avoids tokenizing and lemmatizing the whole
  # corpus a second time just to rebuild the same count matrix.
  tfidf_weighting = TfidfTransformer()
  X_train_tfidf = tfidf_weighting.fit_transform(X_train_bow)
  X_test_tfidf = tfidf_weighting.transform(X_test_bow)

  vectorizer_ngrams = CountVectorizer(tokenizer=my_new_tokenizer, stop_words=english_stop_words, ngram_range=ngram_range, max_features=max_features)
  X_train_ngrams = vectorizer_ngrams.fit_transform(X_train)
  X_test_ngrams = vectorizer_ngrams.transform(X_test)
  return X_train_bow, X_test_bow, X_train_tfidf, X_test_tfidf, X_train_ngrams, X_test_ngrams

In [39]:
# Build all three feature representations (bag of words, TF-IDF, n-grams)
# for the train and test text in one call.
(
    X_train_bag, X_test_bag,
    X_train_tfidf, X_test_tfidf,
    X_train_ngr, X_test_ngr,
) = generate_feature_extraction(X_train, X_test)



In [40]:
model_list = ['lr','SVM']
def modelling(training_data, testing_data, training_class, testing_class,model_type):
  '''Fit one classifier, print its test-set metrics, and return them.

  Parameters
  ----------
  training_data, testing_data :
      Feature matrices for training and testing (e.g. X_train / X_test features).
  training_class, testing_class :
      Labels for training and testing (e.g. y_train / y_test).
  model_type : str
      'lr' for LogisticRegression or 'SVM' for SVC.

  Returns
  -------
  dict
      Keys 'model' (the fitted estimator), 'confusion_matrix', 'accuracy'
      and 'report', so results can be compared without parsing printed text.

  Raises
  ------
  KeyError
      If model_type is not one of the supported keys.
  '''
  # Factories rather than instances: only the requested estimator is constructed.
  model_factories = {
      'lr': lambda: LogisticRegression(max_iter = 1000),
      # NOTE(review): max_iter=700 caps the SVC solver; if it stops before
      # converging, the scores printed for 'SVM' come from an unconverged model.
      'SVM': lambda: SVC(max_iter = 700),
  }
  if model_type not in model_factories:
    raise KeyError(f"Unknown model_type {model_type!r}; expected one of {sorted(model_factories)}")
  model = model_factories[model_type]()
  model.fit(training_data,training_class)
  predictions = model.predict(testing_data)
  conf_matrix = confusion_matrix(testing_class, predictions)
  accuracy = accuracy_score(testing_class, predictions)
  report = classification_report(testing_class, predictions)
  print(conf_matrix)
  print("\n")
  print(accuracy)
  print("\n")
  print(report)
  return {
      'model': model,
      'confusion_matrix': conf_matrix,
      'accuracy': accuracy,
      'report': report,
  }


In [41]:
def evaluation(training_data, testing_data, training_class, testing_class, flag):
    '''Run every supported model on one feature representation and print its metrics.

    flag selects the label used in the printed header:
    0 -> 'Bag of Words', 1 -> 'Tf-IDF', 2 -> 'N-grams'.
    The data is passed to modelling() exactly as received; no scaling happens here.
    '''
    feature_labels = ['Bag of Words','Tf-IDF','N-grams']
    feature_label = feature_labels[flag]
    for model_key in ['lr','SVM']:
        print(f'The data used in these models is based on {feature_label}')
        print(f'This is the Confusion Matrix, Accuracy and Classification report of {model_key}')
        modelling(
            training_data,
            testing_data,
            training_class,
            testing_class,
            model_key,
        )


In [42]:
# Results for the bag-of-words features (raw token counts; no standardization step is applied)
evaluation(X_train_bag, X_test_bag, y_train, y_test,0)

The data used in these models is based on Bag of Words
This is the Confusion Matrix, Accuracy and Classification report of lr
[[ 3053   101   225   173  6873]
 [  984   143   341   289  4211]
 [  784   128   773   818  6088]
 [  465   108   507  2009 13032]
 [ 1239   111   398  1163 69675]]


0.6654264629566105


              precision    recall  f1-score   support

           1       0.47      0.29      0.36     10425
           2       0.24      0.02      0.04      5968
           3       0.34      0.09      0.14      8591
           4       0.45      0.12      0.20     16121
           5       0.70      0.96      0.81     72586

    accuracy                           0.67    113691
   macro avg       0.44      0.30      0.31    113691
weighted avg       0.59      0.67      0.59    113691

The data used in these models is based on Bag of Words
This is the Confusion Matrix, Accuracy and Classification report of SVM




[[ 4410   383   175   208  5249]
 [ 2708   311    95   121  2733]
 [ 4167   343   163   215  3703]
 [ 7192   664   284   436  7545]
 [25151  2021   973  1831 42610]]


0.4215813037091766


              precision    recall  f1-score   support

           1       0.10      0.42      0.16     10425
           2       0.08      0.05      0.06      5968
           3       0.10      0.02      0.03      8591
           4       0.16      0.03      0.05     16121
           5       0.69      0.59      0.63     72586

    accuracy                           0.42    113691
   macro avg       0.23      0.22      0.19    113691
weighted avg       0.48      0.42      0.43    113691



In [43]:
# Results for the TF-IDF features (no StandardScaler step is applied)
evaluation(X_train_tfidf, X_test_tfidf, y_train, y_test,1)

The data used in these models is based on Tf-IDF
This is the Confusion Matrix, Accuracy and Classification report of lr
[[ 3675   127   271   235  6117]
 [ 1113   119   400   332  4004]
 [  878   129   758   917  5909]
 [  557    62   519  2090 12893]
 [ 1772    65   399  1212 69138]]


0.6665435258727603


              precision    recall  f1-score   support

           1       0.46      0.35      0.40     10425
           2       0.24      0.02      0.04      5968
           3       0.32      0.09      0.14      8591
           4       0.44      0.13      0.20     16121
           5       0.71      0.95      0.81     72586

    accuracy                           0.67    113691
   macro avg       0.43      0.31      0.32    113691
weighted avg       0.59      0.67      0.59    113691

The data used in these models is based on Tf-IDF
This is the Confusion Matrix, Accuracy and Classification report of SVM




[[ 5427  2228   730  1169   871]
 [ 2334  1609   567   946   512]
 [ 2650  2083  1168  1800   890]
 [ 3856  3194  2203  4334  2534]
 [18289 12612  7603 15881 18201]]


0.270373204563246


              precision    recall  f1-score   support

           1       0.17      0.52      0.25     10425
           2       0.07      0.27      0.12      5968
           3       0.10      0.14      0.11      8591
           4       0.18      0.27      0.22     16121
           5       0.79      0.25      0.38     72586

    accuracy                           0.27    113691
   macro avg       0.26      0.29      0.22    113691
weighted avg       0.56      0.27      0.31    113691



In [44]:
# Results for the n-gram count features (raw counts; no standardization step is applied)
evaluation(X_train_ngr, X_test_ngr, y_train, y_test,2)

The data used in these models is based on N-grams
This is the Confusion Matrix, Accuracy and Classification report of lr
[[ 3073   103   239   166  6844]
 [  991   150   338   278  4211]
 [  784   120   783   807  6097]
 [  452   105   529  2009 13026]
 [ 1232   108   400  1166 69680]]


0.6657958853383293


              precision    recall  f1-score   support

           1       0.47      0.29      0.36     10425
           2       0.26      0.03      0.05      5968
           3       0.34      0.09      0.14      8591
           4       0.45      0.12      0.20     16121
           5       0.70      0.96      0.81     72586

    accuracy                           0.67    113691
   macro avg       0.44      0.30      0.31    113691
weighted avg       0.59      0.67      0.59    113691

The data used in these models is based on N-grams
This is the Confusion Matrix, Accuracy and Classification report of SVM




[[ 4370   349   215   483  5008]
 [ 2724   197   133   284  2630]
 [ 4172   260   216   355  3588]
 [ 7188   422   415   772  7324]
 [24758  1727  1489  3099 41513]]


0.4139993491129465


              precision    recall  f1-score   support

           1       0.10      0.42      0.16     10425
           2       0.07      0.03      0.04      5968
           3       0.09      0.03      0.04      8591
           4       0.15      0.05      0.07     16121
           5       0.69      0.57      0.63     72586

    accuracy                           0.41    113691
   macro avg       0.22      0.22      0.19    113691
weighted avg       0.48      0.41      0.43    113691

