In [None]:
import pandas as pd
import numpy as np
import nltk
import re

In [None]:
# Read the reviews CSV (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer="amazon_reviews.csv")

In [None]:
data.head()

In [None]:
#df=pd.read_csv('data.tsv', sep='\t', header=0, error_bad_lines=False,nrows=100000)

In [None]:
#### Keep only the columns used below: the review text and its star rating
required_columns = ['review_body', 'star_rating']
data = data[required_columns]

In [None]:
# Discard rows with a missing value in any remaining column, then renumber
# the index from 0 so it has no gaps. The last line displays the frame.
data = data.dropna().reset_index(drop=True)
data

In [None]:
#Reviews with a star rating of 4 or 5 are labelled positive, and reviews with a star rating of 1 or 2 are labelled negative.
#Reviews with a star rating of 3 are considered neutral and are removed.

In [None]:
# Cast the ratings to int so the comparisons below are numeric.
data['star_rating'] = data['star_rating'].astype(int)
# Drop the neutral 3-star reviews. The explicit .copy() detaches the filtered
# frame from its parent, so the column assignments that follow (here and in the
# pre-processing cells) do not raise SettingWithCopyWarning.
data = data[data['star_rating'] != 3].copy()
# Binary sentiment label: 1 = positive (rating >= 4), 0 = negative (rating <= 2).
data['label'] = np.where(data['star_rating'] >= 4, 1, 0)

In [None]:
##### Number of reviews by star rating

In [None]:
data['star_rating'].value_counts()

In [None]:
##### Preprocessing

In [None]:
### Lower casing
def _lowercase_words(text):
    """Lower-case each whitespace-separated word of ``text`` and re-join with single spaces."""
    # str() guards against non-string cell values.
    return ' '.join(word.lower() for word in str(text).split())

data['pre_process'] = data['review_body'].apply(_lowercase_words)

In [None]:
data.head()

In [None]:
####Remove the HTML tags and URLs from the reviews.

In [None]:
from bs4 import BeautifulSoup
import re

# Strip HTML markup and keep only the text content. The parser is named
# explicitly: without it BeautifulSoup picks whichever parser happens to be
# installed (so results can differ between machines) and emits
# GuessedAtParserWarning.
data['pre_process'] = data['pre_process'].apply(
    lambda text: BeautifulSoup(text, 'html.parser').get_text()
)
# Remove URLs: "http" followed by any run of non-whitespace characters.
data['pre_process'] = data['pre_process'].apply(
    lambda text: re.sub(r'http\S+', '', text)
)

In [None]:
#Expand the contractions in the reviews.
#Example: "it won't be" is converted to "it will not be"

data.head()

In [None]:
def contractions(s):
    """Expand common English contractions in ``s``.

    The rules are plain regex substitutions applied in a fixed order, so the
    specific forms ("won't", "can't") are rewritten before the generic
    suffix rules ("n't", "'t") can touch them.

    Args:
        s: Input text. Matching is case-sensitive and the patterns are
            lower-case, so the text should already be lower-cased.

    Returns:
        The text with contractions expanded, e.g. "won't" -> "will not",
        "i'm" -> "i am". Every "'s" becomes " is", including possessives.
    """
    # Text may use the typographic apostrophe (U+2019) instead of the ASCII
    # one; normalise it so the rules below match both spellings.
    s = s.replace("\u2019", "'")
    rules = (
        (r"won't", "will not"),
        (r"wouldn't", "would not"),   # was the typo "would't", which never matched
        (r"couldn't", "could not"),   # was the typo "could't", which never matched
        (r"\'d", " would"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in rules:
        s = re.sub(pattern, replacement, s)
    return s
# Run the contraction expansion over every review.
data['pre_process'] = data['pre_process'].apply(contractions)

In [None]:
data.head()

In [None]:
#Remove non-alpha characters

In [None]:
def _keep_letters_only(text):
    """Tokenize ``text`` with NLTK and delete every non-letter character from each token."""
    cleaned_tokens = []
    for token in nltk.word_tokenize(text):
        cleaned_tokens.append(re.sub('[^A-Za-z]+', '', token))
    # Tokens that held no letters are now empty strings, so the joined text can
    # contain runs of several spaces.
    return " ".join(cleaned_tokens)

data['pre_process'] = data['pre_process'].apply(_keep_letters_only)

In [None]:
data.head(20)

In [None]:
##Remove the extra spaces between the words

In [None]:
def _collapse_spaces(text):
    """Replace every run of one or more space characters with a single space."""
    return re.sub(' +', ' ', text)

data['pre_process'] = data['pre_process'].apply(_collapse_spaces)

In [None]:
#### Remove stopwords

In [None]:
from nltk.corpus import stopwords

# `stop` stays a list, exactly as before, in case other cells use it.
stop = stopwords.words('english')
# Membership is tested once per word of every review; a set makes each test
# O(1) instead of a scan over the whole stop-word list.
stop_set = set(stop)
# NOTE(review): the NLTK English list includes negations such as "not", which
# the contraction step deliberately produces. Confirm that dropping them is
# intended for sentiment classification.
data['pre_process'] = data['pre_process'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_set)
)

In [None]:
data.head()

In [None]:
#### Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def _lemmatize_text(text):
    """Tokenize ``text`` with NLTK, lemmatize each token and re-join with spaces."""
    lemmas = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]
    return " ".join(lemmas)

data['pre_process'] = data['pre_process'].apply(_lemmatize_text)

In [None]:
data.head()

In [None]:
###Split the Data into Training and Testing sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing: features are the cleaned text, targets
# are the binary labels. random_state is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    data['pre_process'],
    data['label'],
    test_size=0.25,
    random_state=30,
)
test_shapes = (X_test.shape, Y_test.shape)
print("Train: ", X_train.shape, Y_train.shape, "Test: ", test_shapes)

In [None]:
from sklearn.model_selection import train_test_split

# Split the raw, un-processed review text too, but under its own names.
# This cell used to re-assign X_train / X_test / Y_train / Y_test, silently
# replacing the pre-processed split from the previous cell: every model below
# was then trained on raw text and the whole cleaning pipeline had no effect.
# The test_size and random_state match the previous cell.
X_train_raw, X_test_raw, Y_train_raw, Y_test_raw = train_test_split(
    data['review_body'],
    data['label'],
    test_size=0.25,
    random_state=30,
)
print("Train: ", X_train_raw.shape, Y_train_raw.shape, "Test: ", (X_test_raw.shape, Y_test_raw.shape))

In [None]:
####Feature extraction
##### TF-IDF vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

print("TFIDF Vectorizer……")
vectorizer = TfidfVectorizer()
# Fit the vectorizer on the training text only, then reuse that fitted
# vectorizer unchanged to transform the test text.
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)

In [None]:
tf_x_train

In [None]:
tf_x_train

In [None]:
##### Linear SVM classifier
from sklearn.svm import LinearSVC

# random_state is fixed at 0 for this estimator.
svm_settings = {"random_state": 0}
clf = LinearSVC(**svm_settings)

In [None]:
###Fitting the Training data into model

In [None]:
# Train the classifier on the TF-IDF training matrix and its labels.
clf.fit(X=tf_x_train, y=Y_train)

In [None]:
###Predicting the Test data

In [None]:
# Predict a label for every row of the TF-IDF test matrix.
y_test_pred = clf.predict(X=tf_x_test)

In [None]:
y_test_pred

In [None]:
#Analyzing the results

In [None]:
from sklearn.metrics import classification_report

# output_dict=True returns the metrics as a nested dict instead of a formatted string.
report = classification_report(
    y_true=Y_test,
    y_pred=y_test_pred,
    output_dict=True,
)

In [None]:
report

In [None]:
###### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# The 'saga' solver shuffles the data, so without a seed the fitted
# coefficients (and the reported metrics) can change from run to run.
# random_state=0 makes the run repeatable and matches the LinearSVC cell.
clf = LogisticRegression(max_iter=1000, solver='saga', random_state=0)

In [None]:
# Train the classifier on the TF-IDF training matrix and its labels.
clf.fit(X=tf_x_train, y=Y_train)

In [None]:
# Predict a label for every row of the TF-IDF test matrix.
y_test_pred = clf.predict(X=tf_x_test)

In [None]:
from sklearn.metrics import classification_report

# output_dict=True returns the metrics as a nested dict instead of a formatted string.
report = classification_report(
    y_true=Y_test,
    y_pred=y_test_pred,
    output_dict=True,
)

In [None]:
report