In [51]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import pickle

In [2]:
df = pd.read_csv("balanced_reviews.csv")

# EDA Activities

In [3]:
df.head()

Unnamed: 0,overall,reviewText,summary
0,5,Great nephew loved!!! Very fast delivery!!!!!,Five Stars
1,5,"well made boot, fits perfectly",Five Stars
2,5,Perfect fit. Good quality. (Do not put in dryer.),Nice bra!
3,5,I bought these for my 2 year old girl and she ...,Better than I expected!
4,5,Great backpack. I wanted to get a backpack th...,Great backpack.


In [5]:
df.tail()

Unnamed: 0,overall,reviewText,summary
791995,1,One time use and did not perform well for me.,One Star
791996,1,"I ordered these shoes before, ad they were ama...",ad they were amazing. Cute with everything
791997,1,The size is too large,One Star
791998,1,Has cardboard like feeling while wearing them....,"Feel stiff and cheap, doesn't work with screen..."
791999,1,Its like sticking paper to your boobs. It show...,waste of money


In [6]:
df.shape

(792000, 3)

In [8]:
df.isnull().any(axis=0)

overall       False
reviewText     True
summary        True
dtype: bool

In [10]:
df[df.isnull().any(axis=1)]

Unnamed: 0,overall,reviewText,summary
276,5,,Five Stars
303,5,,Wonderful watch!
530,5,,Five Stars
1277,5,,Five Stars
1916,5,,Five Stars
...,...,...,...
784201,2,"Bracelet buttons are not ""Strong enough""..Keep...",
788806,1,,One Star
790074,1,Insta exact size just not what I expected the ...,
790897,1,Horrible.....I ordered the white and they were...,


##  Handling Missing Values

In [4]:
df.dropna(inplace =True)

In [5]:
df.overall.value_counts()

3    263836
2    131939
1    131876
4    131849
5    131719
Name: overall, dtype: int64

In [6]:
df['overall'] == 3

0         False
1         False
2         False
3         False
4         False
          ...  
791995    False
791996    False
791997    False
791998    False
791999    False
Name: overall, Length: 791219, dtype: bool

In [7]:
df = df[df['overall'] != 3]

In [8]:
df

Unnamed: 0,overall,reviewText,summary
0,5,Great nephew loved!!! Very fast delivery!!!!!,Five Stars
1,5,"well made boot, fits perfectly",Five Stars
2,5,Perfect fit. Good quality. (Do not put in dryer.),Nice bra!
3,5,I bought these for my 2 year old girl and she ...,Better than I expected!
4,5,Great backpack. I wanted to get a backpack th...,Great backpack.
...,...,...,...
791995,1,One time use and did not perform well for me.,One Star
791996,1,"I ordered these shoes before, ad they were ama...",ad they were amazing. Cute with everything
791997,1,The size is too large,One Star
791998,1,Has cardboard like feeling while wearing them....,"Feel stiff and cheap, doesn't work with screen..."


In [11]:
df['Positivity'] = np.where(df['overall'] > 3, 1, 0)

In [12]:
df

Unnamed: 0,overall,reviewText,summary,Positivity
0,5,Great nephew loved!!! Very fast delivery!!!!!,Five Stars,1
1,5,"well made boot, fits perfectly",Five Stars,1
2,5,Perfect fit. Good quality. (Do not put in dryer.),Nice bra!,1
3,5,I bought these for my 2 year old girl and she ...,Better than I expected!,1
4,5,Great backpack. I wanted to get a backpack th...,Great backpack.,1
...,...,...,...,...
791995,1,One time use and did not perform well for me.,One Star,0
791996,1,"I ordered these shoes before, ad they were ama...",ad they were amazing. Cute with everything,0
791997,1,The size is too large,One Star,0
791998,1,Has cardboard like feeling while wearing them....,"Feel stiff and cheap, doesn't work with screen...",0


In [13]:
df.Positivity.value_counts()

0    263815
1    263568
Name: Positivity, dtype: int64

# Data Cleaning

In [20]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
df['reviewText'][0]

'Great nephew loved!!! Very fast delivery!!!!!'

In [22]:
review = re.sub('[^a-zA-Z]',' ', df['reviewText'][0])

In [23]:
review = review.lower()
review = review.split()

In [24]:
review = [word for word in review if not word in set(stopwords.words('english'))]

In [25]:
review

['great', 'nephew', 'loved', 'fast', 'delivery']

In [26]:
ps = PorterStemmer()

In [27]:
review = [ps.stem(word) for word in review ]

In [28]:
review = " ".join(review)

In [29]:
review

'great nephew love fast deliveri'

In [30]:
corpus = []
for i in range(0, 527383):
    review = re.sub('[^a-zA-Z]',' ', df.iloc[i,1])
    review = review.lower()
    review = review.split()
    review = [word for word in review if not word in set(stopwords.words('english'))]
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review ]
    review = " ".join(review)
    corpus.append(review)

In [58]:
corpus

['great nephew love fast deliveri',
 'well made boot fit perfectli',
 'perfect fit good qualiti put dryer',
 'bought year old girl love love comfort love boot made sturdi rubber look realli cute price great',
 'great backpack want get backpack would last sever school year think go meet expect heavi duti pocket thing student need get back forth school',
 'use everi day smell great scent weaken sand bit brush use',
 'nice pant',
 'first pair jazz shoe daughter break fit comfort left blister even first wear danc approx hour time',
 'got shoe elderli mother fit great well made easi velcro closur let snug right keep sure foot walk',
 'love keen newport sandal durabl water proof part shoe ware year also nice come varieti color',
 'comfort',
 'overal bought husband christma one love got earli make sure fit give pair back wrap',
 'ask got order thank',
 'hubbi shirt get littl worn order coupl pretti nice wish pocket price care',
 'noth like real thing real leather',
 'wear birkenstock classic 

# TF_IDF (term frequency-inverse document frequency)

In [40]:
vect = TfidfVectorizer(min_df = 5).fit(corpus)
features = vect.transform(corpus)

In [41]:
labels = df.iloc[:, 3] 
#train test split
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, random_state = 42)

In [49]:
model = LogisticRegression()
model.fit(features_train, labels_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [52]:
predictions = model.predict(features_test)

roc_auc_score(labels_test, predictions)

0.8807043979921422

In [53]:
pkl_filename = "pickle_model.pkl"

with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

In [54]:
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)

In [57]:
predictions = pickle_model.predict(features_test)

roc_auc_score(labels_test, predictions)

0.8807043979921422