In [9]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore')

# Statistical ML Approach

**Step 1**
- Gathering the data

In [10]:
df=pd.read_csv("IMDB Dataset.csv")

In [11]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [12]:
df.shape

(50000, 2)

**Changing sentiment into a number**

In [13]:
sentiments_1=df["sentiment"].unique()
sentiments_1

array(['positive', 'negative'], dtype=object)

In [14]:
sentiment_number={}
i=0
for senti in sentiments_1:
    sentiment_number[senti]=i
    i+=1
df["sentiment"]=df["sentiment"].map(sentiment_number)    

**Step 2**
- Text Cleaning

*Checking Lowercase*

In [15]:
df["review"]=df["review"].apply(lambda x:x.lower())

*Checking Punctuation*

In [16]:
import string
def remove_punc(txt):
    return txt.translate(str.maketrans("","",string.punctuation))

In [17]:
df["review"]=df["review"].apply(remove_punc)

*Removing Numbers*

In [18]:
def remove_numbers(txt):
    new=""
    for i in txt:
        if not i.isdigit():
            new+=i
    return new 

In [19]:
df["review"]=df["review"].apply(remove_numbers)

*Remove html tags*

In [20]:
import re

def remove_html_tags(txt):
    return re.sub(r'<.*?>', '', txt)

df["review"] = df["review"].apply(remove_html_tags)


*Removing URL/Links*

In [21]:
def remove_urls(txt):
    return re.sub(r'http\S+|www\S+|https\S+', '', txt, flags=re.MULTILINE)
df["review"] = df["review"].apply(remove_urls)

*Removing Emojis and Special Characters*

In [22]:
def remove_emojis(txt):
    new=""
    for i in txt:
        if i.isascii():
            new+=i
    return new
df["review"]=df["review"].apply(remove_emojis)     

*Removing Stopwords*

In [25]:
#Natural Language Toolkit
import nltk

In [26]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [27]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akshi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akshi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
stop_words=set(stopwords.words("english"))

In [29]:
def remove(txt):
    word=txt.split()
    cleaned=[]
    for i in word:
        if not i in stop_words:
            cleaned.append(i)
    return ' '.join(cleaned)     

In [30]:
df["review"]=df["review"].apply(remove)

In [31]:
df["review"].loc[0]

'one reviewers mentioned watching oz episode youll hooked right exactly happened mebr br first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use wordbr br called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em city home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgy dealings shady agreements never far awaybr br would say main appeal show due fact goes shows wouldnt dare forget pretty pictures painted mainstream audiences forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste oz got accustomed high levels graphic violence violence injustice crooked guards wholl sold nickel inmates wholl kill order get away well mannered 

# Splitting Data into Training and Testing Sets

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
X = df["review"]      
y = df["sentiment"]   

In [34]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Feature Extraction 
**TF-IDF(Term frequency-Inverse Document frequency)**

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
vectorizer=TfidfVectorizer()

In [37]:
X_train_tf=vectorizer.fit_transform(X_train)
X_test_tf=vectorizer.transform(X_test)

# Machine Learning Models for Sentiment Classification

**Logistic Regression**

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [39]:
model_LR=LogisticRegression()

In [40]:
model_LR.fit(X_train_tf,y_train)

In [32]:
pred_LR=model_LR.predict(X_test_tf)
print("Logistic Regression Accuracy:", round(accuracy_score(y_test, pred_LR), 3))

Logistic Regression Accuracy: 0.896


**Naive Bayes**

In [41]:
from sklearn.naive_bayes import MultinomialNB

In [42]:
model_NB=MultinomialNB()

In [43]:
model_NB.fit(X_train_tf,y_train)

In [44]:
pred_NB=model_NB.predict(X_test_tf)

In [45]:
print("Naive Bayes Accuracy:", round(accuracy_score(y_test, pred_NB), 3))

Naive Bayes Accuracy: 0.869
