## Importing Libraries

### Basic Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3

### Pre-processing Libraries

In [2]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

## Importing Data

In [3]:
## Mounting google drive

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
data=sqlite3.connect("/content/drive/MyDrive/data/database.sqlite")

In [5]:
cursor=data.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") #getting table name

<sqlite3.Cursor at 0x7fac3bd096c0>

In [6]:
tables=cursor.fetchall()

In [7]:
for name in tables:
  print(name)

('Reviews',)


In [8]:
raw_data=pd.read_sql_query("SELECT * from Reviews",data)

In [9]:
raw_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## Exploring Data

In [10]:
raw_data.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [11]:
raw_data.shape

(568454, 10)

In [12]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568454 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568454 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [13]:
raw_data["Score"].value_counts()

5    363122
4     80655
1     52268
3     42640
2     29769
Name: Score, dtype: int64

## Data Preprocessing

In [14]:
req_data=raw_data[["Text","Score"]]

In [15]:
req_data.head()

Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


In [16]:
df=req_data[req_data["Score"]!=3] # removing records with neutral review

In [17]:
df["Score"]=df["Score"].apply(lambda x:"Good" if x>3 else "Bad") # splitting reviews into good and bad

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Score"]=df["Score"].apply(lambda x:"Good" if x>3 else "Bad") # splitting reviews into good and bad


In [18]:
df.head()

Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,Good
1,Product arrived labeled as Jumbo Salted Peanut...,Bad
2,This is a confection that has been around a fe...,Good
3,If you are looking for the secret ingredient i...,Bad
4,Great taffy at a great price. There was a wid...,Good


In [19]:
df.duplicated().sum()

161973

In [20]:
df.drop_duplicates()

Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,Good
1,Product arrived labeled as Jumbo Salted Peanut...,Bad
2,This is a confection that has been around a fe...,Good
3,If you are looking for the secret ingredient i...,Bad
4,Great taffy at a great price. There was a wid...,Good
...,...,...
568449,Great for sesame chicken..this is a good if no...,Good
568450,I'm disappointed with the flavor. The chocolat...,Bad
568451,"These stars are small, so you can give 10-15 o...",Good
568452,These are the BEST treats for training and rew...,Good


### Text Preprocessing

In [21]:
for i in range(5):
  print(np.random.choice(df["Text"].values)) # randomly checking few reviews
  print("*"*100)

These are only for people would want a little coffee with their cream and sugar. Will not purchase these again and Sorry I bought them.
****************************************************************************************************
Our golden retriever has so many allergies.  These treats fit the bill.  No additives, no preservatives and he loves them.  What could be better.
****************************************************************************************************
Barilla Plus pasta saved me when my oldest son was a toddler and would eat almost nothing. I searched for something he would eat that was high in protein and fiber. That's when I found this pasta.<br />It's high in protein (egg whites so not for vegans or egg-allergic), high in fiber AND has a bunch of healthy Omegas too!<br /><br />Most importantly--it doesn't TASTE "healthy". A lot of whole wheat pastas cook up mushy or too hard or taste grainy...just not like regular pasta. Barilla Plus doesn't. It really ta

In [22]:
for i in range(5):
  print(np.random.choice(df["Text"].values)) # randomly checking few more reviews
  print("*"*100)

I opened the package as soon as I got home wanting to eat a little something to give me an energy boost so I wouldn't bonk as I headed for the swimming pool right after work. The box had been sitting in 100 degree heat in my mail box for several hours and they where all moist and warm, almost like being fresh from the oven. They where delicious so you might want to try them in the microwave .. would go great with a glass of cold milk, and only 130 calories. Definetly going with me to work for mid afternoon snacks.
****************************************************************************************************
The bars have more peanuts that almonds...maybe it should be called<br />"Roasted Peanut Almond crunch"<br /><br />How do they expect me to eat 36 bars in a month?<br />Amazon should display the expiration date so that people know if they want to buy them or not.
****************************************************************************************************
Good alternati

#### Removing HTML tags

In [23]:
from bs4 import BeautifulSoup

In [24]:
def remove_html_tags(text_inp):
  soup=BeautifulSoup(text_inp,"html.parser")
  plain_txt=soup.get_text()
  return plain_txt


In [25]:
txt=df["Text"].apply(remove_html_tags)

  soup=BeautifulSoup(text_inp,"html.parser")


In [26]:
type(txt)

pandas.core.series.Series

In [27]:
for i in range(10):
  print(np.random.choice(txt))

Having tried alot of the K-Cups now on the market... this is the best yet!!
I have tried several brands of coconut oil, but find Nutiva to be the best.  I use it instead of butter on toast, for low temperature frying, as shortening for baking and is excellent as a skin moisturizer.  By ordering from Amazon sellers I can purchase 54 oz for less than the health food store price of a 29 oz.
I purchased this tea for a neighbor when she had mentioned that she loved the taste. I then went and purchased a pack of 120 (a bit much, I know) but it was totally worth it. The tea has a spicy, yet citrus-y taste that is both refreshing and aromatic. I highly recommend this tea for anyone whom is looking for a new 'everyday' flavor of tea.
Undoubtedly, the purchase and use of stool softeners would plummet if everyone, including physicians, considered magnesium.Fact: An insufficient amount of magnesium in the body is a primary cause of constipation and hemorrhoids, yet most physicians unwittingly reco

#### Removing URL

In [28]:
txt_af_url=txt.str.replace("http\S+|www\S+","",regex=True)

#### Removing non Text characters

In [29]:
txt_af_nontxt=txt_af_url.str.replace("[^\w\S]|_","",regex=True)

#### Handling contractions

In [30]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [31]:
import contractions

In [32]:
txt_af_contractions=txt_af_nontxt.apply(lambda x: contractions.fix(x))

#### Removing Numeric Values

In [33]:
txt_af_num=txt_af_contractions.str.replace("\d+","",regex=True)

#### Removing Punctuations

In [34]:
import string

In [35]:
txt_af_punctuation=txt_af_num.str.translate(str.maketrans("","",string.punctuation))

#### Lower Casing

In [36]:
txt_af_lower=txt_af_punctuation.str.lower()

#### Removing Stopwords

In [37]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
sw=stopwords.words("english")

In [39]:
print(sw)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [40]:
sw.remove("not") # removing "not" from stopword list

In [41]:
sw.remove("no") # removing "no" from stopword list

In [42]:
def remove_stopword(inp):
  li=inp.split()
  sent=[word for word in li if word not in sw]
  return " ".join(sent)

In [43]:
txt_af_sw=txt_af_lower.apply(remove_stopword)

## Data Preparation

In [44]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(txt_af_sw,df["Score"],test_size=0.2,random_state=0)

### Bag of words

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
count_vec=CountVectorizer()

In [46]:
X_train_bow=count_vec.fit_transform(X_train)
X_test_bow=count_vec.transform(X_test)

### TF-IDF

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()

In [48]:
X_train_tfidf=tfidf.fit_transform(X_train)
X_test_tfidf=tfidf.transform(X_test)

## Model Building

In [49]:
from sklearn.metrics import classification_report

### Naive Bayes

In [50]:
from sklearn.naive_bayes import MultinomialNB
bow_mod=MultinomialNB()
tfidf_mod=MultinomialNB()


#### BOW

In [51]:
bow_mod.fit(X_train_bow,y_train)

In [52]:
ypred_bow=bow_mod.predict(X_test_bow)

In [53]:
print(classification_report(y_test,ypred_bow))

              precision    recall  f1-score   support

         Bad       1.00      0.23      0.38     16449
        Good       0.88      1.00      0.93     88714

    accuracy                           0.88    105163
   macro avg       0.94      0.62      0.66    105163
weighted avg       0.89      0.88      0.85    105163



#### TFIDF

In [54]:
tfidf_mod.fit(X_train_tfidf,y_train)

In [55]:
ypred_tfidf=tfidf_mod.predict(X_test_tfidf)

In [56]:
print(classification_report(y_test,ypred_tfidf))

              precision    recall  f1-score   support

         Bad       1.00      0.23      0.38     16449
        Good       0.88      1.00      0.93     88714

    accuracy                           0.88    105163
   macro avg       0.94      0.62      0.66    105163
weighted avg       0.89      0.88      0.85    105163



### Decision Tree

In [57]:
from sklearn.feature_extraction.text import CountVectorizer
count_vec=CountVectorizer(max_features=1000)
X_train_bow=count_vec.fit_transform(X_train)
X_test_bow=count_vec.transform(X_test)

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(max_features=1000)
X_train_tfidf=tfidf.fit_transform(X_train)
X_test_tfidf=tfidf.transform(X_test)

In [59]:
from sklearn.tree import DecisionTreeClassifier
dt_bow=DecisionTreeClassifier()
dt_tfidf=DecisionTreeClassifier()

#### BOW

In [60]:
dt_bow.fit(X_train_bow,y_train)
ypred_bow=dt_bow.predict(X_test_bow)
print(classification_report(y_test,ypred_bow))

              precision    recall  f1-score   support

         Bad       0.99      0.03      0.06     16449
        Good       0.85      1.00      0.92     88714

    accuracy                           0.85    105163
   macro avg       0.92      0.51      0.49    105163
weighted avg       0.87      0.85      0.78    105163



#### TFIDF

In [62]:
dt_tfidf.fit(X_train_tfidf,y_train)
ypred_tfidf=dt_tfidf.predict(X_test_tfidf)
print(classification_report(y_test,ypred_tfidf))

              precision    recall  f1-score   support

         Bad       0.99      0.03      0.06     16449
        Good       0.85      1.00      0.92     88714

    accuracy                           0.85    105163
   macro avg       0.92      0.51      0.49    105163
weighted avg       0.87      0.85      0.78    105163

