# **Importing Libraries**



In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
import string

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
df=pd.read_csv("/content/labeledTrainData.tsv",sep='\t')
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [4]:
df.shape

(25000, 3)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [6]:
DF=df.copy()

## **Data Preprocessing**

## Removal of Punctuations:

In [7]:
PUNCH_TO_REMOVE=string.punctuation
def remove_punctuation(text):
  return text.translate(str.maketrans('','',PUNCH_TO_REMOVE))

df['review'] = df['review'].apply(lambda text: remove_punctuation(text))
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,The Classic War of the Worlds by Timothy Hines...
2,7759_3,0,The film starts with a manager Nicholas Bell g...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


## Removal of stopwords:

In [8]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
from nltk.corpus import stopwords
",".join(stopwords.words("english"))

"i,me,my,myself,we,our,ours,ourselves,you,you're,you've,you'll,you'd,your,yours,yourself,yourselves,he,him,his,himself,she,she's,her,hers,herself,it,it's,its,itself,they,them,their,theirs,themselves,what,which,who,whom,this,that,that'll,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,s,t,can,will,just,don,don't,should,should've,now,d,ll,m,o,re,ve,y,ain,aren,aren't,couldn,couldn't,didn,didn't,doesn,doesn't,hadn,hadn't,hasn,hasn't,haven,haven't,isn,isn't,ma,mightn,mightn't,mustn,mustn't,needn,needn't,shan,shan't,shouldn,shouldn't,wasn,wasn't,weren,weren't,won,won't,wouldn,wouldn't"

In [10]:
STOP_WORDS=set(stopwords.words("english"))
def remove_stopwords(text):
  return " ".join([word for word in str(text).split() if word not in STOP_WORDS])

df['review_without_stopwords']=df['review'].apply(lambda text:remove_stopwords(text))
df.head()


Unnamed: 0,id,sentiment,review,review_without_stopwords
0,5814_8,1,With all this stuff going down at the moment w...,With stuff going moment MJ ive started listeni...
1,2381_9,1,The Classic War of the Worlds by Timothy Hines...,The Classic War Worlds Timothy Hines entertain...
2,7759_3,0,The film starts with a manager Nicholas Bell g...,The film starts manager Nicholas Bell giving w...
3,3630_4,0,It must be assumed that those who praised this...,It must assumed praised film greatest filmed o...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy wondrously unpretentious 80s e...


In [11]:
#contractions
!pip install contractions



In [12]:
import contractions

In [13]:
def remove_contractions(text):
  return contractions.fix(text)


df['review_without_contractionss']=df['review_without_stopwords'].apply(lambda text:remove_contractions(text))
df.head()


Unnamed: 0,id,sentiment,review,review_without_stopwords,review_without_contractionss
0,5814_8,1,With all this stuff going down at the moment w...,With stuff going moment MJ ive started listeni...,With stuff going moment MJ i have started list...
1,2381_9,1,The Classic War of the Worlds by Timothy Hines...,The Classic War Worlds Timothy Hines entertain...,The Classic War Worlds Timothy Hines entertain...
2,7759_3,0,The film starts with a manager Nicholas Bell g...,The film starts manager Nicholas Bell giving w...,The film starts manager Nicholas Bell giving w...
3,3630_4,0,It must be assumed that those who praised this...,It must assumed praised film greatest filmed o...,It must assumed praised film greatest filmed o...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy wondrously unpretentious 80s e...,Superbly trashy wondrously unpretentious 80s e...


## Lemmatization

In [14]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
from nltk.stem import WordNetLemmatizer

lemmatizer=WordNetLemmatizer()
def lemmatize_words(text):
  return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

df['review_lemmatized']=df['review_without_contractionss'].apply(lambda text:lemmatize_words(text))
df.head()

Unnamed: 0,id,sentiment,review,review_without_stopwords,review_without_contractionss,review_lemmatized
0,5814_8,1,With all this stuff going down at the moment w...,With stuff going moment MJ ive started listeni...,With stuff going moment MJ i have started list...,With stuff going moment MJ i have started list...
1,2381_9,1,The Classic War of the Worlds by Timothy Hines...,The Classic War Worlds Timothy Hines entertain...,The Classic War Worlds Timothy Hines entertain...,The Classic War Worlds Timothy Hines entertain...
2,7759_3,0,The film starts with a manager Nicholas Bell g...,The film starts manager Nicholas Bell giving w...,The film starts manager Nicholas Bell giving w...,The film start manager Nicholas Bell giving we...
3,3630_4,0,It must be assumed that those who praised this...,It must assumed praised film greatest filmed o...,It must assumed praised film greatest filmed o...,It must assumed praised film greatest filmed o...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy wondrously unpretentious 80s e...,Superbly trashy wondrously unpretentious 80s e...,Superbly trashy wondrously unpretentious 80 ex...


# Feature Extraction

In [16]:
df = df.drop("id", axis=1)

In [17]:
df=df[['review_lemmatized','sentiment',]]
df=df.astype(str)


In [18]:
df= df.rename(columns = {'review_lemmatized': 'review'}, inplace = False)
df.head()

Unnamed: 0,review,sentiment
0,With stuff going moment MJ i have started list...,1
1,The Classic War Worlds Timothy Hines entertain...,1
2,The film start manager Nicholas Bell giving we...,0
3,It must assumed praised film greatest filmed o...,0
4,Superbly trashy wondrously unpretentious 80 ex...,1


In [19]:
from sklearn.model_selection import train_test_split
X = df.review
y = df.sentiment
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.6, random_state = 1)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer(stop_words = 'english',lowercase=False)
# fit the vectorizer on the training data
vector.fit(X_train)
vector.vocabulary_
X_transformed = vector.transform(X_train)
X_transformed.toarray()
# for test data
X_test_transformed = vector.transform(X_test)

# **Model Building and Evaluation**

# **Naive Bayes Model**

In [21]:
from sklearn.naive_bayes import MultinomialNB
naivebayes = MultinomialNB()
naivebayes.fit(X_transformed, y_train)

# **Evaluate the model's performance**

In [22]:
from sklearn.metrics import classification_report
print(classification_report(naivebayes.predict(X_test_transformed), y_test))

              precision    recall  f1-score   support

           0       0.88      0.85      0.86      5229
           1       0.84      0.87      0.86      4771

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [23]:
review1 = ['	The Classic War Worlds Timothy Hines entertaining film obviously go great effort length faithfully recreate H G Wells classic book Mr Hines succeeds I watched film appreciated fact standard predictable Hollywood fare come every year eg Spielberg version Tom Cruise slightest resemblance book Obviously everyone look different thing movie Those envision amateur critic look criticize everything Others rate movie important baseslike entertained people never agree critic We enjoyed effort Mr Hines put faithful HG Wells classic novel found entertaining This made easy overlook critic perceive shortcoming']
vec = vector.transform(review1).toarray()
print('review:', review1)
print(str(list(naivebayes.predict(vec))[0]).replace('0', 'NEGATIVE').replace('1', 'POSITIVE'))

review: ['\tThe Classic War Worlds Timothy Hines entertaining film obviously go great effort length faithfully recreate H G Wells classic book Mr Hines succeeds I watched film appreciated fact standard predictable Hollywood fare come every year eg Spielberg version Tom Cruise slightest resemblance book Obviously everyone look different thing movie Those envision amateur critic look criticize everything Others rate movie important baseslike entertained people never agree critic We enjoyed effort Mr Hines put faithful HG Wells classic novel found entertaining This made easy overlook critic perceive shortcoming']
POSITIVE


In [24]:
#to save the model
import pickle

saved_model = pickle.dumps(naivebayes)



In [25]:
#load saved model
s = pickle.loads(saved_model)

In [26]:


review2 = ['With stuff going moment MJ i have started listening music watching odd documentary watched The Wiz watched Moonwalker Maybe want get certain insight guy thought really cool eighty maybe make mind whether guilty innocent Moonwalker part biography part feature film remember going see cinema originally released Some subtle message MJs feeling towards press also obvious message drug bad mkaybr br Visually impressive course Michael Jackson unless remotely like MJ anyway going hate find boring Some may call MJ egotist consenting making movie BUT MJ fan would say made fan true really nice himbr br The actual feature film bit finally start 20 minute excluding Smooth Criminal sequence Joe Pesci convincing psychopathic powerful drug lord Why want MJ dead bad beyond Because MJ overheard plan Nah Joe Pescis character ranted wanted people know supplying drug etc do not know maybe hate MJs musicbr br Lots cool thing like MJ turning car robot whole Speed Demon sequence Also director must patience saint came filming kiddy Bad sequence usually director hate working one kid let alone whole bunch performing complex dance scenebr br Bottom line movie people like MJ one level another think people If stay away It try give wholesome message ironically MJs bestest buddy movie girl Michael Jackson truly one talented people ever grace planet guilty Well attention i have gave subjecthmmm well do not know people different behind closed door know fact He either extremely nice stupid guy one sickest liar I hope latter']
vec = vector.transform(review2).toarray()

s.predict(vec)[0]



'1'

# **SVM model**

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

In [28]:
# Convert text to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
train_features = tfidf_vectorizer.fit_transform(X_train)
test_features = tfidf_vectorizer.transform(X_test)

In [29]:
# Initialize and train the SVM classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(train_features, y_train)

In [30]:
# Predict on the test set
predictions = svm_classifier.predict(test_features)


In [31]:
# Print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88      5045
           1       0.88      0.89      0.88      4955

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

