# Sentiment Analysis Model To Classify IMDB Movie Reviews

In [1]:
import pandas as pd


In [2]:
from google.colab import files
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [3]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 86% 22.0M/25.7M [00:00<00:00, 67.4MB/s]
100% 25.7M/25.7M [00:00<00:00, 70.1MB/s]


In [5]:
!unzip imdb-dataset-of-50k-movie-reviews.zip

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


## Exploratory Data Analysis (EDA)

In [18]:
# Load the DataFrame using pandas.
df_imdb = pd.read_csv('IMDB Dataset.csv')
df = df_imdb.iloc[:len(df_imdb) // 50]

In [19]:
# Check the DataFrame’s first lines, its size, and the column types.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [20]:
df.shape

(1000, 2)

In [21]:
df.dtypes

review       object
sentiment    object
dtype: object

In [22]:
# Check for NaN values and delete them if they exist.
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [23]:
# Print the first 5 reviews and their sentiments classification.
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [24]:
# Create a function to count the number of words in each review.
def count_words(text):
    words = text.split()
    return len(words)

# Apply the function to the 'review' column and create a new column 'words count'
df['words_count'] = df['review'].apply(count_words)

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['words_count'] = df['review'].apply(count_words)


Unnamed: 0,review,sentiment,words_count
0,One of the other reviewers has mentioned that ...,positive,307
1,A wonderful little production. <br /><br />The...,positive,162
2,I thought this was a wonderful way to spend ti...,positive,166
3,Basically there's a family where a little boy ...,negative,138
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,230


## Preprocessing

In [25]:
import re
import nltk
import spacy
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
def simple_preprocessing(text):
    # Make the text lowercase
    text = text.lower()

    # Remove HTML br tags
    text = re.sub(r'<br\s*\/?>', ' ', text)

    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # Remove hashtags and @ symbols
    text = re.sub(r'[@#]', '', text)

    # Remove punctuations
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize the text using spaCy
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    tokens = [token.text for token in doc]

    # Remove stopwords using nltk
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Return the preprocessed text as a string
    return ' '.join(tokens)


In [27]:
# Apply the preprocessing function to the 'review' column using .loc
df['review_preprocessed'] = df['review'].apply(lambda x: simple_preprocessing(x))

# Display the updated DataFrame
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review_preprocessed'] = df['review'].apply(lambda x: simple_preprocessing(x))


Unnamed: 0,review,sentiment,words_count,review_preprocessed
0,One of the other reviewers has mentioned that ...,positive,307,one reviewers mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,positive,162,wonderful little production filming techniq...
2,I thought this was a wonderful way to spend ti...,positive,166,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,138,basically family little boy jake thinks zombie...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,230,petter matteis love time money visually stunni...


In [29]:
# Check if there are duplicated reviews. If so, delete them and check that they were deleted.
df = df.drop_duplicates()
df.duplicated().sum()

0

In [30]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def stemming(text):
    # Tokenize the text using nltk
    words = word_tokenize(text)

    # Initialize Porter Stemmer
    porter = PorterStemmer()

    # Apply stemming to each word
    stemmed_words = [porter.stem(word) for word in words]

    # Return the stemmed text as a string
    return ' '.join(stemmed_words)

# Apply the stemming function to the 'review' column
df['review_stemmed'] = df['review'].apply(stemming)

# Display the updated DataFrame
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,review,sentiment,words_count,review_preprocessed,review_stemmed
0,One of the other reviewers has mentioned that ...,positive,307,one reviewers mentioned watching 1 oz episode ...,one of the other review ha mention that after ...
1,A wonderful little production. <br /><br />The...,positive,162,wonderful little production filming techniq...,a wonder littl product . < br / > < br / > the...
2,I thought this was a wonderful way to spend ti...,positive,166,thought wonderful way spend time hot summer we...,i thought thi wa a wonder way to spend time on...
3,Basically there's a family where a little boy ...,negative,138,basically family little boy jake thinks zombie...,basic there 's a famili where a littl boy ( ja...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,230,petter matteis love time money visually stunni...,petter mattei 's `` love in the time of money ...


## Preparing Data To Train The Model

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
label_binarizer = LabelBinarizer()
df['sentiment'] = label_binarizer.fit_transform(df['sentiment'])

# Split the data into features (X) and target variable (Y)
X = df['review_preprocessed']  # Assuming you have a 'review_preprocessed' column from the previous preprocessing
Y = df['sentiment']

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Display the shapes of the train and test sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Y_train shape:", Y_train.shape)
print("Y_test shape:", Y_test.shape)

X_train shape: (700,)
X_test shape: (300,)
Y_train shape: (700,)
Y_test shape: (300,)


In [33]:
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data using the same vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Display the shapes of the TF-IDF matrices
print("X_train_tfidf shape:", X_train_tfidf.shape)
print("X_test_tfidf shape:", X_test_tfidf.shape)

X_train_tfidf shape: (700, 5000)
X_test_tfidf shape: (300, 5000)


## Machine Learning Model: Instantiating, Training, Predicting And Evalueting

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [35]:
# Instantiate the Logistic Regression model
logreg_model = LogisticRegression()
logreg_model.fit(X_train_tfidf, Y_train)

# Predict on the test set
Y_pred = logreg_model.predict(X_test_tfidf)

In [36]:
# Calculate accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(Y_test, Y_pred))

# Print confusion matrix
conf_matrix = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.8033333333333333
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.77      0.81       161
           1       0.76      0.84      0.80       139

    accuracy                           0.80       300
   macro avg       0.80      0.81      0.80       300
weighted avg       0.81      0.80      0.80       300

Confusion Matrix:
[[124  37]
 [ 22 117]]


## Predicting A New Review

In [37]:
# Reviews to predict
reviews_to_predict = ["I loved this movie!", "This movie was a bad comedy movie!"]

In [38]:
# Preprocess the reviews
preprocessed_reviews = [simple_preprocessing(review) for review in reviews_to_predict]

# Vectorize the preprocessed reviews using the TF-IDF vectorizer
reviews_tfidf = tfidf_vectorizer.transform(preprocessed_reviews)

# Predict the sentiment
predicted_sentiments = logreg_model.predict(reviews_tfidf)

# Map the binary sentiment labels back to their original form
predicted_sentiments = label_binarizer.inverse_transform(predicted_sentiments)

# Display the predictions
for review, sentiment in zip(reviews_to_predict, predicted_sentiments):
    print(f"Review: '{review}'\nPredicted Sentiment: {sentiment}\n")

Review: 'I loved this movie!'
Predicted Sentiment: positive

Review: 'This movie was a bad comedy movie!'
Predicted Sentiment: negative



## Bonus:
Predict different phrases, using preprocessing and more complex sentences. What is the prediction? It still accurate? Improve the accuracy.

In [43]:
additional_phrases = [
    "I can't believe I wasted my time on this film.",
    "A masterpiece of cinematography and storytelling.",
    "Not worth the money, would not recommend.",
    "That was a great shit."
]

In [44]:
# Preprocess the additional phrases
preprocessed_phrases = [simple_preprocessing(phrase) for phrase in additional_phrases]

# Vectorize the preprocessed phrases using the TF-IDF vectorizer
phrases_tfidf = tfidf_vectorizer.transform(preprocessed_phrases)

# Predict the sentiment
predicted_sentiments_additional = logreg_model.predict(phrases_tfidf)

# Map the binary sentiment labels back to their original form
predicted_sentiments_additional = label_binarizer.inverse_transform(predicted_sentiments_additional)

# Display the predictions for additional phrases
for phrase, sentiment in zip(additional_phrases, predicted_sentiments_additional):
    print(f"Phrase: '{phrase}'\nPredicted Sentiment: {sentiment}\n")

Phrase: 'I can't believe I wasted my time on this film.'
Predicted Sentiment: negative

Phrase: 'A masterpiece of cinematography and storytelling.'
Predicted Sentiment: positive

Phrase: 'Not worth the money, would not recommend.'
Predicted Sentiment: negative

Phrase: 'That was a great shit.'
Predicted Sentiment: positive

