## Importing what we need 

In [1]:
import pickle
import numpy as np
import pandas as pd  
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Download the NLTK stopwords corpus if it is not already installed:
# import nltk; nltk.download('stopwords')

## Loading the dataset and cleaning it

In [2]:
# Read the IMDB reviews CSV (path is relative to the notebook's working directory) into a DataFrame.
mov = pd.read_csv('IMDB Dataset.csv')

In [3]:
# Dimensions of the loaded frame as (rows, columns).
mov.shape

# Preview the first rows of the data.
mov.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Checking for missing values

In [4]:
# Number of missing values in each column.
mov.isnull().sum()

review       0
sentiment    0
dtype: int64

In [5]:
# Class balance: number of reviews per sentiment label.
mov['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [6]:
# Column dtypes, non-null counts and memory usage of the frame.
mov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [8]:
# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
# An explicit mapping followed by astype('int64') is used instead of
# Series.replace(), which relies on implicit object -> int downcasting (deprecated
# in recent pandas releases). Values that are already 0/1 pass through
# unchanged, so re-running this cell is harmless, and any unexpected label
# makes astype() raise instead of silently reaching the model.
label_codes = {'positive': 1, 'negative': 0}
mov['sentiment'] = (
    mov['sentiment']
    .map(lambda label: label_codes.get(label, label))
    .astype('int64')
)

In [9]:
# Re-check the label counts after the numeric encoding.
mov['sentiment'].value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

## Defining the stop words to remove before stemming

In [10]:
# NLTK's English stop-word list (needs the 'stopwords' corpus to be downloaded).
# stemming() below drops every token that appears in this list.
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## Initialize the PorterStemmer for stemming 

In [11]:
# One shared Porter stemmer instance, reused by stemming() for every token.
port_stem = PorterStemmer() 

## Define a function to clean, lowercase, and stem the text content

In [12]:
# Frozen-set copy of the stop words, built once when this cell runs, so each
# membership test is O(1) instead of a linear scan of the NLTK list for every
# token. Re-run this cell if `stop_words` is changed afterwards.
_STOP_WORD_SET = frozenset(stop_words)

# HTML line-break tags (<br>, <br/>, <br />) that are embedded in the raw reviews.
_BR_TAG = re.compile(r'<br\s*/?>', flags=re.IGNORECASE)


def stemming(content):
    """Clean, lowercase, stop-word-filter and stem one review.

    Steps:
      1. Replace HTML line-break tags with a space so they do not end up as a
         spurious "br" token in the vocabulary.
      2. Replace every character that is not an ASCII letter with a space.
      3. Lowercase the text and split it on whitespace.
      4. Drop tokens found in the stop-word list and Porter-stem the rest.

    Parameters
    ----------
    content : str
        Raw review text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string if no token
        survives the filtering).
    """
    text = _BR_TAG.sub(' ', content)
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    stemmed_tokens = [
        port_stem.stem(word)
        for word in text.split()
        if word not in _STOP_WORD_SET
    ]
    return ' '.join(stemmed_tokens)

In [13]:
# Clean and stem every review; the result goes into a new column so the raw text is kept.
mov['stemmed_content'] = mov['review'].apply(stemming)

In [14]:
# Dimensions after adding the 'stemmed_content' column.
mov.shape

(50000, 3)

## separating the data and label

In [15]:
# X: the preprocessed review texts, Y: the numeric sentiment labels (1 = positive, 0 = negative).
X = mov['stemmed_content'].values
Y = mov['sentiment'].values

In [16]:
# Quick look at the label array.
print(Y)

[1 1 1 ... 0 0 0]


## Splitting the data into training data and test data

In [17]:
# Hold out 20% of the reviews for testing. Stratifying on Y keeps the
# positive/negative ratio the same in both splits, and the fixed random_state
# makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [18]:
# Size of the held-out test set.
X_test.shape

(10000,)

## Initialize the TF-IDF vectorizer to convert the text data to numerical data

In [19]:
# Fit the TF-IDF vectorizer on the training reviews only, then reuse the fitted
# vectorizer for the test reviews so the test set does not influence the features.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)  # Fit and transform the training data
X_test = vectorizer.transform(X_test)  # Transform only; no re-fitting on the test data
# NOTE(review): X_train / X_test are overwritten with the vectorized data here, so
# this cell is presumably not safe to re-run without re-running the split cell first.

## Initializing & training the model

In [20]:
# Support-vector classifier created with its default settings and trained
# on the TF-IDF features of the training reviews.
model2 = SVC()
model2.fit(X_train, Y_train)

In [17]:
# Predicted labels for the held-out test reviews.
predictions = model2.predict(X_test) 

##  Accuracy Score

In [21]:
# Score the predictions computed in the previous cell rather than calling
# model2.score(X_test, Y_test), which would run the SVC prediction over the
# whole test set a second time. The value is the same: the fraction of test
# reviews whose predicted label matches the true label.
accuracy = accuracy_score(Y_test, predictions)
print("Model Accuracy:", accuracy)

Model Accuracy: 0.893


## Saving the trained model

In [23]:
# Persist both the fitted vectorizer and the trained classifier: inference
# needs the vectorizer to turn new text into the features the model was trained on.
# The `with` blocks guarantee each file is flushed and closed, even if pickling fails.
with open('vector.pkl', 'wb') as vector_file:
    pickle.dump(vectorizer, vector_file)
with open('trained_model.pkl', 'wb') as model_file:
    pickle.dump(model2, model_file)

## Loading saved model

In [24]:
# Reload the vectorizer and classifier from disk, closing each file afterwards.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# pickle files that come from a trusted source (here: the files written above).
with open('vector.pkl', 'rb') as vector_file:
    vector_form = pickle.load(vector_file)
with open('trained_model.pkl', 'rb') as model_file:
    load_model = pickle.load(model_file)

In [25]:
def sentiment(review):
    """Predict the sentiment label for a single raw review string.

    The text is passed through the same `stemming` preprocessing that was
    applied to the training data, vectorized with the reloaded TF-IDF
    vectorizer and classified by the reloaded model.

    Returns the array produced by `load_model.predict` for this one review;
    with the label encoding used in this notebook, 1 means positive and
    0 means negative.
    """
    cleaned_text = stemming(review)
    # transform() takes a collection of documents, hence the one-item list.
    tfidf_features = vector_form.transform([cleaned_text])
    return load_model.predict(tfidf_features)

In [26]:
# Smoke test of the reloaded pipeline on a sentence taken from a positive review.
value = sentiment('One of the other reviewers has mentioned that after watching just 1 Oz episode youll be hooked')
print(value)

[1]


In [29]:
# Smoke test of the reloaded pipeline on a clearly negative sentence.
neg = sentiment('There was piss everywhere, and the lock didn\'t work. Ended up being a very embarrassing and unhygienic poo')
print(neg)

[0]
