**Import Basic Libraries**

In [42]:
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

**Load Dataset**

In [43]:
# Location of the reviews CSV. Set the ELECTRONICS_CSV environment variable to
# point at your own copy of the file; when it is not set, the original local
# path is used so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    "ELECTRONICS_CSV",
    r"D:\My Items\Coding\SmartReviewFilter\data\electronics.csv",
)
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,_id,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,category,class
0,{'$oid': '5a1324e4741a2384e8c421a3'},ADKVDDYRW56UB,B003MVZ8QU,Charley,"[0, 0]",Can't really say much about this other than th...,5.0,Good CD's,1341446400,"07 5, 2012",Electronics,1.0
1,{'$oid': '5a13259b741a2384e8fb5682'},A2F8IHET2WY0XP,B00BCXF6H6,"Western Realm ""Miranda""","[0, 0]",This case came and it is really awesome. But i...,5.0,Great case and a good deal.,1390176000,"01 20, 2014",Electronics,1.0
2,{'$oid': '5a1324a5741a2384e8b1a7fa'},AYGHD53QKX480,B001QYNGDW,James Wright,"[0, 0]",I used these to change out the OEM fans in myN...,5.0,Quiet and powerful!,1344297600,"08 7, 2012",Electronics,1.0
3,{'$oid': '5a1324b7741a2384e8b71cc1'},AY16GWUDYFPPA,B002HU629E,Ryan Neff,"[0, 0]","Using it on my new MacBook 13"" with Thunderbol...",1.0,"You get what you pay for, performance is terrible",1303776000,"04 26, 2011",Electronics,0.0
4,{'$oid': '5a132538741a2384e8dd1a13'},A3I8H7F0MW5BX0,B005ND23Q8,Candace Meldrum,"[0, 0]",I bought this case for my husband because the ...,1.0,color different than shown.,1361145600,"02 18, 2013",Electronics,0.0


**Dropping Unnecessary Columns**

In [44]:
# Columns this notebook treats as unnecessary. The string-formatted
# `reviewTime` is removed here as well; a `reviewTime` column is rebuilt
# from `unixReviewTime` further down.
unneeded_columns = ['_id', 'asin', 'reviewerName', 'reviewTime', 'category']
df = df.drop(columns=unneeded_columns)
df

Unnamed: 0,reviewerID,helpful,reviewText,overall,summary,unixReviewTime,class
0,ADKVDDYRW56UB,"[0, 0]",Can't really say much about this other than th...,5.0,Good CD's,1341446400,1.0
1,A2F8IHET2WY0XP,"[0, 0]",This case came and it is really awesome. But i...,5.0,Great case and a good deal.,1390176000,1.0
2,AYGHD53QKX480,"[0, 0]",I used these to change out the OEM fans in myN...,5.0,Quiet and powerful!,1344297600,1.0
3,AY16GWUDYFPPA,"[0, 0]","Using it on my new MacBook 13"" with Thunderbol...",1.0,"You get what you pay for, performance is terrible",1303776000,0.0
4,A3I8H7F0MW5BX0,"[0, 0]",I bought this case for my husband because the ...,1.0,color different than shown.,1361145600,0.0
...,...,...,...,...,...,...,...
99995,A1GWIONPEVAOOC,"[0, 0]",i used this headphone for monitoring tracking ...,5.0,pretty decent,1353196800,1.0
99996,ASXY7UL8PDMU4,"[0, 0]",It's super tough and awesome. It's easy to put...,5.0,Love it,1374451200,1.0
99997,A3LHZQMWBJBSLU,"[1, 1]",This band implies that one size will fit all. ...,2.0,Beware. Not a Universal Fit,1370390400,0.0
99998,A34Z7D2RD0LNQW,"[12, 12]","My 1st GPS unit, so my rating of ""5"" is admitt...",5.0,Garmin Quest Pocket-sized GPS Navigator,1142640000,1.0


**DATA PREPROCESSING**

Converting the Unix review time (seconds since the epoch) to datetime format

In [45]:
# `unit='s'` makes pandas read the integers as seconds since the Unix epoch.
review_dates = pd.to_datetime(df['unixReviewTime'], unit='s')
df['reviewTime'] = review_dates
del df['unixReviewTime']  # the raw epoch column is replaced by `reviewTime`

Lowercase the text

In [46]:
# Lowercase both free-text columns.
# Missing entries are filled with "" first: `astype(str)` on its own turns NaN
# into the literal string "nan", which would then be lowercased and carried
# into the review text as if the reviewer had written it.
for text_col in ('reviewText', 'summary'):
    df[text_col] = df[text_col].fillna('').astype(str).str.lower()

Check for missing values

In [47]:
# Number of missing entries in each column.
# NOTE: `reviewText` and `summary` were cast with `astype(str)` in the previous
# cell, so they can no longer contain NaN; a zero for them here does not show
# that the raw file had no missing text.
missing_per_column = df.isna().sum()
missing_per_column

reviewerID    0
helpful       0
reviewText    0
overall       0
summary       0
class         0
reviewTime    0
dtype: int64

Combining review text and summary 

In [48]:
# Build one document per review in the form "<summary>. <reviewText>",
# then discard the two source columns.
df['text'] = df['summary'].str.cat(df['reviewText'], sep=". ")
df = df.drop(columns=['reviewText', 'summary'])
df

Unnamed: 0,reviewerID,helpful,overall,class,reviewTime,text
0,ADKVDDYRW56UB,"[0, 0]",5.0,1.0,2012-07-05,good cd's. can't really say much about this ot...
1,A2F8IHET2WY0XP,"[0, 0]",5.0,1.0,2014-01-20,great case and a good deal.. this case came an...
2,AYGHD53QKX480,"[0, 0]",5.0,1.0,2012-08-07,quiet and powerful!. i used these to change ou...
3,AY16GWUDYFPPA,"[0, 0]",1.0,0.0,2011-04-26,"you get what you pay for, performance is terri..."
4,A3I8H7F0MW5BX0,"[0, 0]",1.0,0.0,2013-02-18,color different than shown.. i bought this cas...
...,...,...,...,...,...,...
99995,A1GWIONPEVAOOC,"[0, 0]",5.0,1.0,2012-11-18,pretty decent. i used this headphone for monit...
99996,ASXY7UL8PDMU4,"[0, 0]",5.0,1.0,2013-07-22,love it. it's super tough and awesome. it's ea...
99997,A3LHZQMWBJBSLU,"[1, 1]",2.0,0.0,2013-06-05,beware. not a universal fit. this band implies...
99998,A34Z7D2RD0LNQW,"[12, 12]",5.0,1.0,2006-03-18,garmin quest pocket-sized gps navigator. my 1s...


Handling duplicates

In [49]:
# Keep only the first row for each distinct `text` value. Duplicates are judged
# on `text` alone, so identical text from different reviewers is collapsed too.
# `ignore_index=True` renumbers the rows 0..n-1, replacing the separate
# `reset_index(drop=True)` step, and no frame is mutated in place.
df = df.drop_duplicates(subset=['text'], ignore_index=True)

# Sanity check: every remaining review text is distinct.
assert df['text'].is_unique
df

Unnamed: 0,reviewerID,helpful,overall,class,reviewTime,text
0,ADKVDDYRW56UB,"[0, 0]",5.0,1.0,2012-07-05,good cd's. can't really say much about this ot...
1,A2F8IHET2WY0XP,"[0, 0]",5.0,1.0,2014-01-20,great case and a good deal.. this case came an...
2,AYGHD53QKX480,"[0, 0]",5.0,1.0,2012-08-07,quiet and powerful!. i used these to change ou...
3,AY16GWUDYFPPA,"[0, 0]",1.0,0.0,2011-04-26,"you get what you pay for, performance is terri..."
4,A3I8H7F0MW5BX0,"[0, 0]",1.0,0.0,2013-02-18,color different than shown.. i bought this cas...
...,...,...,...,...,...,...
99947,A1GWIONPEVAOOC,"[0, 0]",5.0,1.0,2012-11-18,pretty decent. i used this headphone for monit...
99948,ASXY7UL8PDMU4,"[0, 0]",5.0,1.0,2013-07-22,love it. it's super tough and awesome. it's ea...
99949,A3LHZQMWBJBSLU,"[1, 1]",2.0,0.0,2013-06-05,beware. not a universal fit. this band implies...
99950,A34Z7D2RD0LNQW,"[12, 12]",5.0,1.0,2006-03-18,garmin quest pocket-sized gps navigator. my 1s...


Class Balance

In [50]:
# Number of reviews per label, most frequent first.
class_counts = df.loc[:, 'class'].value_counts()
class_counts

class
1.0    75867
0.0    24085
Name: count, dtype: int64