In [8]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import joblib
import warnings

In [9]:
# warnings.filterwarnings("ignore") #Use during rapid prototyping but avoid it in any other case
stop = stopwords.words("english")
lemmatizer = WordNetLemmatizer()

df=pd.read_csv("data.tsv", sep="\t", header=0, on_bad_lines="skip", low_memory = False)
df.head(3)
df=df[["review_body","star_rating"]]
df=df.dropna()
df = df.reset_index(drop=True)

df['star_rating']=df['star_rating'].astype(int)  #convert the star_rating column to int
df=df[df["star_rating"]!=3]
df['label']=np.where(df['star_rating']>=4, 1, 0) #1-Positve,0-Negative

df = df.sample(frac=1).reset_index(drop=True)    #shuffle
data = pd.concat([df[df['label']==0][:50000], df[df['label']==1][:50000]])
data = data.reset_index(drop=True)

In [10]:
def contractions(s):
 s = re.sub(r"won't", "will not",s)
 s = re.sub(r"would't", "would not",s)
 s = re.sub(r"could't", "could not",s)
 s = re.sub(r"\'d", " would",s)
 s = re.sub(r"can\'t", "can not",s)
 s = re.sub(r"n\'t", " not", s)
 s = re.sub(r"\'re", " are", s)
 s = re.sub(r"\'s", " is", s)
 s = re.sub(r"\'ll", " will", s)
 s = re.sub(r"\'t", " not", s)
 s = re.sub(r"\'ve", " have", s)
 s = re.sub(r"\'m", " am", s)
 return s

def purifier(x):
    x = " ".join(x.lower() for x in str(x).split())
    # x = BeautifulSoup(x, features="html.parser").get_text() #To be used for removing HTML Tags and Text Extraction
    # x = re.sub(r"http\S+", "", x)                           #Remove URLs from reviews
    x = contractions(x)
    x = " ".join([re.sub("[^A-Za-z]+","", x) for x in nltk.word_tokenize(x)])
    x = re.sub(" +", " ", x)
    x = " ".join([x for x in x.split() if x not in stop])
    x = " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(x)])
    return x

data["pre_process"] = data["review_body"].apply(lambda x: purifier(x))

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(data["pre_process"], data["label"], test_size=0.25, random_state=30)
print("Train: ", X_train.shape, Y_train.shape, "Test: ", (X_test.shape,Y_test.shape))
print("TFIDF Vectorizer……")

vectorizer = TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)

clf = LinearSVC(random_state=0)
clf.fit(tf_x_train,Y_train)

y_test_pred = clf.predict(tf_x_test)

print(tf_x_test)
print(X_test)

report = classification_report(Y_test, y_test_pred, output_dict=True)

filename = "review classifier model.joblib"
joblib.dump([vectorizer, clf], filename)

report

Train:  (75000,) (75000,) Test:  ((25000,), (25000,))
TFIDF Vectorizer……
  (0, 23759)	0.15383987447060984
  (0, 22027)	0.20352335359301046
  (0, 21618)	0.2081995001587626
  (0, 17805)	0.18598091686340384
  (0, 16269)	0.2184282785149355
  (0, 15637)	0.17594104187591852
  (0, 14276)	0.22559191092222844
  (0, 14260)	0.18583304469102221
  (0, 13960)	0.0860531233305212
  (0, 13843)	0.11506461876833764
  (0, 13595)	0.08452294594398924
  (0, 11523)	0.28634090538611306
  (0, 10806)	0.15399188557638005
  (0, 10143)	0.1430372248720883
  (0, 10119)	0.1325103042654712
  (0, 9707)	0.29953850175237057
  (0, 9068)	0.1161823744873653
  (0, 8971)	0.2177765409985894
  (0, 8590)	0.17345254191628448
  (0, 7505)	0.15453010045864246
  (0, 7389)	0.1297467354400615
  (0, 7302)	0.17918247484183206
  (0, 7193)	0.2656365174378374
  (0, 4610)	0.18797299607709808
  (0, 761)	0.13705917340676682
  :	:
  (24996, 3954)	0.3051921027386547
  (24997, 24816)	0.30007982820540885
  (24997, 24334)	0.4709617407339057
  (24997

{'0': {'precision': 0.909026240910528,
  'recall': 0.9145197201017812,
  'f1-score': 0.911764705882353,
  'support': 12576},
 '1': {'precision': 0.9129413670229997,
  'recall': 0.9073567289117836,
  'f1-score': 0.9101404811884385,
  'support': 12424},
 'accuracy': 0.91096,
 'macro avg': {'precision': 0.9109838039667638,
  'recall': 0.9109382245067824,
  'f1-score': 0.9109525935353957,
  'support': 25000},
 'weighted avg': {'precision': 0.9109719019833818,
  'recall': 0.91096,
  'f1-score': 0.9109575311784653,
  'support': 25000}}