<a href="https://colab.research.google.com/github/ArthAgrawal/NLP_Concepts/blob/main/Movie_Reviews_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This code implements a Word2Vec model from scratch on a custom dataset from Kaggle to classify movie reviews as negative or positive. Also, the model is deployed online using gradio

In [9]:
import os
import zipfile

extract_dir = '/content/Movie_Reviews'

if os.path.exists(extract_dir):
    os.system(f'rm -rf {extract_dir}')
os.makedirs(extract_dir, exist_ok=True)


In [12]:
try:
    with zipfile.ZipFile('/content/Movie_Reviews.zip', 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
        print("Extraction successful using zipfile.")
except zipfile.BadZipFile:
    print("Error: File is not a valid zip file.")
except Exception as e:
    print(f"An error occurred: {e}")


Extraction successful using zipfile.


In [17]:
import pandas as pd
reviews = pd.read_csv('/content/Movie_Reviews/IMDB Dataset.csv', names=["message", "label"])

In [18]:
reviews.head()

Unnamed: 0,message,label
0,review,sentiment
1,One of the other reviewers has mentioned that ...,positive
2,A wonderful little production. <br /><br />The...,positive
3,I thought this was a wonderful way to spend ti...,positive
4,Basically there's a family where a little boy ...,negative


In [19]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [20]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [22]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [28]:
from nltk.corpus import stopwords
corpus = []
for i in range(0, len(reviews)):
    review = re.sub('[^a-zA-Z]', ' ', reviews['message'][i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [29]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [31]:
corpus[1]

'one reviewer mentioned watching oz episode hooked right exactly happened br br first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use word br br called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home many aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement never far away br br would say main appeal show due fact go show dare forget pretty picture painted mainstream audience forget charm forget romance oz mess around first episode ever saw struck nasty surreal say ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard sold nickel inmate kill order get away well mannered middle class inmate turned prison bitch due lack street skill prison experi

In [32]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [33]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [None]:
words

In [35]:
import gensim
model=gensim.models.Word2Vec(words,window=5,min_count=2)

In [None]:
model.wv.index_to_key

In [37]:
model.corpus_count

50001

In [39]:
from tqdm import tqdm

total_examples = model.corpus_count
epochs = 7

# Using tqdm as a wrapper around the loop
for _ in tqdm(range(epochs), desc='Training'):
    model.train(words, total_examples=total_examples, epochs=1)


Training: 100%|██████████| 7/7 [01:02<00:00,  8.86s/it]


In [40]:
model.wv.similar_by_word('good')

[('decent', 0.7148751020431519),
 ('great', 0.6938554048538208),
 ('bad', 0.691465437412262),
 ('okay', 0.6423685550689697),
 ('nice', 0.6283279657363892),
 ('alright', 0.5993855595588684),
 ('excellent', 0.5859102606773376),
 ('ok', 0.5846730470657349),
 ('fine', 0.5829504728317261),
 ('cool', 0.5540934205055237)]

In [41]:
model.wv.similar_by_word('horrible')

[('terrible', 0.8801614046096802),
 ('awful', 0.8272002339363098),
 ('horrendous', 0.7815074920654297),
 ('horrid', 0.7556914687156677),
 ('atrocious', 0.7403457760810852),
 ('bad', 0.7238848805427551),
 ('dreadful', 0.7117467522621155),
 ('lousy', 0.7095235586166382),
 ('pathetic', 0.6784472465515137),
 ('abysmal', 0.6733323931694031)]

In [43]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Converting the preprocessed text data into word vectors
X = [' '.join(words) for words in words]
y = reviews['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating a CountVectorizer to convert text into a matrix of token counts
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Training the Multinomial NB classifier
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

y_pred = clf.predict(X_test_counts)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.8602139786021398


In [49]:
new_text = input()
new_text = re.sub('[^a-zA-Z]', ' ', new_text)
new_text = new_text.lower()
new_text = new_text.split()
new_text = [lemmatizer.lemmatize(word) for word in new_text if not word in stopwords.words('english')]
new_text = ' '.join(new_text)

# Converting the preprocessed text data into word vectors
new_text_words = simple_preprocess(new_text)
new_text_vec = []
for word in new_text_words:
    if word in model.wv.index_to_key:
        new_text_vec.append(model.wv[word])

# Transforming word vectors into a format suitable for the Multinomial NB classifier
new_text_vec = vectorizer.transform([' '.join(new_text_words)])


new_text_pred = clf.predict(new_text_vec)

print(f"Predicted label for the new review: {new_text_pred[0]}")


This movie was perfectly paced and suitqble to all genertions
Predicted label for the new review: positive


In [51]:
model.save('word2vec_model.model')

In [None]:
import joblib

joblib.dump(clf, 'multinomial_nb_model.pkl')

In [None]:
# pip install gradio

In [None]:
from joblib import load

# Loading the trained Multinomial Naive Bayes model
clf = load('multinomial_nb_model.pkl')


In [58]:
import re
import nltk
import gensim
import numpy as np
import gradio as gr

# Loading the trained Word2Vec model
model = gensim.models.Word2Vec.load('word2vec_model.model')

# Loading the trained Multinomial Naive Bayes model
clf = load('multinomial_nb_model.pkl')

def preprocess_text(text):
    review = re.sub('[^a-zA-Z]', ' ', text)
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    return review

# Prediction function
def predict_sentiment(text):
    # Preprocessing the text
    processed_text = preprocess_text(text)

    # Converting the processed text into word vectors
    words = simple_preprocess(processed_text)
    text_vec = []
    for word in words:
        if word in model.wv.index_to_key:
            text_vec.append(model.wv[word])

    # Transforming word vectors into a format suitable for the Multinomial NB classifier
    text_vec = vectorizer.transform([' '.join(words)])

    # Making prediction
    prediction = clf.predict(text_vec)

    return prediction[0]

# Creating a Gradio interface
iface = gr.Interface(fn=predict_sentiment, inputs="text", outputs="text")

# Running the Gradio interface
iface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://479903bf3633cf35cb.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


