In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import warnings
warnings.filterwarnings("ignore")


# Loading the data

In [2]:
df = pd.read_csv('amazon_food_reviw.csv')

In [3]:
df['ReviewText']=df['ReviewText'].apply(lambda x: re.sub('<.*?>','',x))

In [4]:
df=df[df.Score!=3]

In [5]:
df.loc[df['Score']<3, 'Score'] = 0
df.loc[df['Score']>3, 'Score'] = 1


In [6]:
df.Score.value_counts()

1    443777
0     82037
Name: Score, dtype: int64

In [7]:
X=df[['ReviewText']]
y=df['Score']

In [8]:
X

Unnamed: 0,ReviewText
0,I have bought several of the Vitality canned d...
1,Product arrived labeled as Jumbo Salted Peanut...
2,This is a confection that has been around a fe...
3,If you are looking for the secret ingredient i...
4,Great taffy at a great price. There was a wid...
...,...
568449,Great for sesame chicken..this is a good if no...
568450,I'm disappointed with the flavor. The chocolat...
568451,"These stars are small, so you can give 10-15 o..."
568452,These are the BEST treats for training and rew...


# Data preprocessing

In [9]:
stemmer = PorterStemmer()

lemmatizer = WordNetLemmatizer()


In [10]:
stop_words = stopwords.words('english')

In [11]:
def preprocess(raw_text):
    text = str(raw_text).lower()
    tokens = word_tokenize(text)
   
    tokens = [word for word in tokens if word not in stop_words]
    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    words_corpus = ' '.join(tokens)
    
    return words_corpus

In [12]:
from tqdm import tqdm, tqdm_notebook
tqdm.pandas()


In [13]:
temp_df = X['ReviewText'].progress_apply(lambda x: preprocess(x))


100%|█████████████████████████████████████████████████████████████████████████| 525814/525814 [08:59<00:00, 975.11it/s]


In [14]:
X=pd.DataFrame(temp_df)

# splitting the data

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [16]:
X_train.shape,X_test.shape, y_train.shape, y_test.shape

((420651, 1), (105163, 1), (420651,), (105163,))

# Converting Text to Numerical vectors - BOW Representation

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

vocab = CountVectorizer()
vocab.fit(X_train['ReviewText'])

X_train_bow = vocab.transform(X_train['ReviewText'])

In [18]:
print("Total unique words:", len(vocab.vocabulary_))

print("Type of train features:", type(X_train_bow))

print("Shape of input data:", X_train_bow.shape)

Total unique words: 116101
Type of train features: <class 'scipy.sparse.csr.csr_matrix'>
Shape of input data: (420651, 116101)


In [19]:
X_test_bow = vocab.transform(X_test['ReviewText'])

# Logistic Rgression

In [20]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train_bow, y_train)

LogisticRegression()

In [21]:
y_test_pred = classifier.predict(X_test_bow)

In [22]:
y_test_pred = classifier.predict(X_test_bow)
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

0.9387332046442189
              precision    recall  f1-score   support

           0       0.84      0.74      0.79     16409
           1       0.95      0.97      0.96     88754

    accuracy                           0.94    105163
   macro avg       0.90      0.86      0.88    105163
weighted avg       0.94      0.94      0.94    105163



# Multinomial Naive Bayes

In [32]:
from sklearn.naive_bayes import MultinomialNB


In [33]:
nb = MultinomialNB()

In [34]:
nb.fit(X_train_bow, y_train)

MultinomialNB()

In [35]:
y_test_pred = nb.predict(X_test_bow)

In [36]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_test_pred)


0.9125072506489925

# Support Vector Machines: Linear SVC

In [28]:
from sklearn import svm
svc = svm.LinearSVC()

In [29]:
svc.fit(X_train_bow, y_train)

LinearSVC()

In [30]:
y_test_pred = svc.predict(X_test_bow)

In [31]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_test_pred)


0.9385715508306153

In [1]:
# considering linear svc with 0.93 accuracy