In [55]:
import pandas as pd
import numpy as np
import nltk
import re

# initializing files

In [56]:
df = pd.read_csv('data.tsv', sep='\t',header=0,on_bad_lines='skip',low_memory=False)
df = df[['review_body','star_rating']]
df = df.dropna()#remove the null values
df = df.reset_index(drop=True)#reset the index

# labeling reviews

In [57]:
df['star_rating'] = df['star_rating'].astype(int)
df = df[df['star_rating'] != 3]
df['label'] = np.where(df['star_rating']>=4,1,0)# 1 = positive, 2 = negative
df['star_rating'].value_counts()

df = df.sample(frac=1).reset_index(drop=True) #shufle

Data = df[df['label'] == 0][:250000]
Data = Data._append(df[df['label'] == 1][:250000])
Data = Data.reset_index(drop=True)
display(Data['label'].value_counts())
Data

label
0    250000
1    250000
Name: count, dtype: int64

Unnamed: 0,review_body,star_rating,label
0,"Like it when it first came, but it started to ...",2,0
1,Not a damn thing like the picture it looks mor...,1,0
2,Ugly,1,0
3,I returned these earrings because they are fli...,1,0
4,This watch is nothing like the picture. I am v...,2,0
...,...,...,...
499995,Absolutely Beautiful!! Worth the money. Looks ...,5,1
499996,Nothing bad to say about this pendant/charm. I...,5,1
499997,Beautiful much nicer in person. Great seller,5,1
499998,My boyfriend bought me this ring for Christmas...,5,1


# pre-processing

In [58]:
Data['pre_process'] = Data['review_body'].apply(lambda x: " ".join(x.lower() for x in str(x).split())) #convert all the reviews into lower case
from bs4 import BeautifulSoup#remove the html tags and urls from the review
Data['pre_process']=Data['pre_process'].apply(lambda x: BeautifulSoup(x).get_text())
import re
Data['pre_process']=Data['pre_process'].apply(lambda x: re.sub(r"http\S+", "", x))

def contractions(s):#perform the contractions on the reviews
 s = re.sub(r"won't", "will not",s)
 s = re.sub(r"would't", "would not",s)
 s = re.sub(r"could't", "could not",s)
 s = re.sub(r"\'d", " would",s)
 s = re.sub(r"can\'t", "can not",s)
 s = re.sub(r"n\'t", " not", s)
 s= re.sub(r"\'re", " are", s)
 s = re.sub(r"\'s", " is", s)
 s = re.sub(r"\'ll", " will", s)
 s = re.sub(r"\'t", " not", s)
 s = re.sub(r"\'ve", " have", s)
 s = re.sub(r"\'m", " am", s)
 return s
Data['pre_process']=Data['pre_process'].apply(lambda x:contractions(x))
#remove non-alpha characters
Data['pre_process']=Data['pre_process'].apply(lambda x: " ".join([re.sub('[^A-Za-z]+','', x) for x in nltk.word_tokenize(x)]))

Data['pre_process']=Data['pre_process'].apply(lambda x: re.sub(' +', ' ', x))#remove extra spaces between the words

from nltk.corpus import stopwords#remove the stopwords
stop = stopwords.words('english')
Data['pre_process']=Data['pre_process'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
Data['pre_process']=Data['pre_process'].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(x)]))


  Data['pre_process']=Data['pre_process'].apply(lambda x: BeautifulSoup(x).get_text())
  Data['pre_process']=Data['pre_process'].apply(lambda x: BeautifulSoup(x).get_text())


In [64]:
Data

Unnamed: 0,review_body,star_rating,label,pre_process
0,"Like it when it first came, but it started to ...",2,0,like first came started fall apart day would g...
1,Not a damn thing like the picture it looks mor...,1,0,damn thing like picture look like swan
2,Ugly,1,0,ugly
3,I returned these earrings because they are fli...,1,0,returned earring flimsy calling black stone di...
4,This watch is nothing like the picture. I am v...,2,0,watch nothing like picture disappointed child ...
...,...,...,...,...
499995,Absolutely Beautiful!! Worth the money. Looks ...,5,1,absolutely beautiful worth money look like pic...
499996,Nothing bad to say about this pendant/charm. I...,5,1,nothing bad say pendantcharm wear everyday aro...
499997,Beautiful much nicer in person. Great seller,5,1,beautiful much nicer person great seller
499998,My boyfriend bought me this ring for Christmas...,5,1,boyfriend bought ring christmas love diamond l...


# feature extraction  tfidf

In [59]:
from sklearn.model_selection import train_test_split#spliting data into test and train
X_train,X_test,Y_train, Y_test = train_test_split(Data['pre_process'], Data['label'], test_size=0.25, random_state=30)
print("Train: ",X_train.shape,Y_train.shape,"Test: ",(X_test.shape,Y_test.shape))

print("TFIDF Vectorizer……")#tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer= TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)

Train:  (375000,) (375000,) Test:  ((125000,), (125000,))
TFIDF Vectorizer……


# training the model

In [60]:
from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=0)

clf.fit(tf_x_train,Y_train)#fitting the training data into the model
y_test_pred = clf.predict(tf_x_test)#predicting the test data

from sklearn.metrics import classification_report#analyzing the result
report=classification_report(Y_test, y_test_pred,output_dict=True)

In [61]:
report

{'0': {'precision': 0.9160804816223067,
  'recall': 0.9231297295571378,
  'f1-score': 0.9195915965584693,
  'support': 62638},
 '1': {'precision': 0.9221881060116355,
  'recall': 0.9150604534812867,
  'f1-score': 0.9186104537917935,
  'support': 62362},
 'accuracy': 0.919104,
 'macro avg': {'precision': 0.9191342938169711,
  'recall': 0.9190950915192122,
  'f1-score': 0.9191010251751315,
  'support': 125000},
 'weighted avg': {'precision': 0.9191275509996453,
  'recall': 0.919104,
  'f1-score': 0.9191021083567458,
  'support': 125000}}

In [71]:
clf.predict(pd.DataFrame([["good"]],columns=['pre_process']))



ValueError: could not convert string to float: 'good'