In [73]:
import numpy as np
import pandas as pd
import re
from urllib.parse import urlparse
from textblob import TextBlob
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [44]:
# Load the IMDB reviews CSV (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [45]:
# (rows, columns) of the loaded frame.
df.shape

(50000, 2)

In [46]:
# Show the full text of a single review, selected by its row label.
df.loc[5689, "review"]

'Everyone in a while, Disney makes one of thoes movies that surprises everyone. One that keeps you wondering until the very end. In the tradition of Pirates of the Caribbean, this movie is sure to turn into a ghost, and kill and rape your village. It\'s terrible. If you want a mindless, senseless, predictable "action" movie, go right ahead. I believe that young kids might enjoy this, as they like it when Good ALWAYS wins. But me, I like movies where it\'s a toss up who\'s going to win. This movie never lets the Bad Guys have the upper hand. By the end, when th heroes are left in an "inescapeable" pit, you just KNOW that they can get out. Everything works out perfect for Cage and his friends, he never has to think over a riddle or clue for more than 10 seconds, no matter how complex it is. See this movie if you want to see some impressive set designs, not if you want to see good acting, or a good film. Go watch a superman movie, it would be much shorter, and the kids would like it more.

In [47]:
# Column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [48]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
df.duplicated().sum()

418

In [49]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# The original row labels are kept (the index is not reset), so the labels
# are no longer contiguous after this step.
df = df.drop_duplicates()

In [50]:
# Display the de-duplicated frame.
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


## Cleaning Dataset


In [53]:
# Lowercase the review text (overwrites the 'review' column).

df['review'] = df['review'].str.lower()
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 49582, dtype: object

In [54]:
# Strip leading and trailing whitespace from each review
# (whitespace inside the text is left untouched).

df['review'] = df['review'].str.strip()
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 49582, dtype: object

In [55]:
# Remove HTML tags
#
# Each tag is replaced with a space rather than an empty string, so that words
# separated only by markup (e.g. "end.<br /><br />next") are not fused into a
# single token. Runs of spaces left behind are collapsed to one space and the
# ends are trimmed again.

df['review'] = (
    df['review']
    .str.replace(r'<.*?>', ' ', regex=True)
    .str.replace(r' {2,}', ' ', regex=True)
    .str.strip()
)
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 49582, dtype: object

In [56]:
# Remove URLs: anything starting with "http://" or "https://", or with "www.",
# up to the next whitespace character.

df['review'] = df['review'].str.replace(r"https?://\S+|www\.\S+",'', regex=True)
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 49582, dtype: object

In [57]:
# Sanity check: list any reviews that still contain a URL.
# An empty result means the URL removal above caught everything.
# (The previous call passed '' as a second positional argument, which
# str.contains interprets as `case`, not as a replacement string.)

df[df['review'].str.contains(r"https?://\S+|www\.\S+", regex=True)]

Unnamed: 0,review,sentiment


In [58]:
# Expanding Abbreviations

def remove_abb(text):
    """Expand common English contractions in ``text``.

    Rules are applied in order: irregular whole-word forms first, then the
    generic suffix rules. The order matters: "ain't" must be handled before
    the generic "n't" rule, otherwise it becomes "ai not".

    The generic suffix rules only fire when the apostrophe directly follows a
    word character and the suffix ends at a word boundary, so single-quoted
    words such as 'simple' or 'movie' are left alone.

    Known limitation: "'s" is always expanded to " is", so possessives
    ("the director's cut") also become "... is ...".

    Args:
        text: The string to process.

    Returns:
        The expanded string. Non-string, empty or whitespace-only input is
        returned unchanged.
    """
    if not isinstance(text, str) or not text.strip():
        return text

    # Ordered (pattern, replacement) pairs (add new entries here).
    replacements = (
        # Irregular whole-word forms; must precede the generic suffix rules.
        (r"\bwon't\b", "will not"),
        (r"\bcan't\b", "cannot"),
        (r"\bain't\b", "am not"),
        (r"\by'all\b", "you all"),
        (r"\bY'all\b", "You all"),

        # Mis-encoded apostrophes (Unicode artifacts)
        (r"\bdon\x89Ûªt\b", "do not"),
        (r"\bDon\x89Ûªt\b", "Do not"),
        (r"\byou\x89Ûªre\b", "you are"),
        (r"\bi\x89Ûªm\b", "I am"),

        # Generic suffix contractions. These also cover he's / it's / that's /
        # what's / here's / there's, so no per-word entries are needed.
        (r"(?<=\w)n't\b", " not"),
        (r"(?<=\w)'re\b", " are"),
        (r"(?<=\w)'s\b", " is"),
        (r"(?<=\w)'d\b", " would"),
        (r"(?<=\w)'ll\b", " will"),
        (r"(?<=\w)'ve\b", " have"),
        (r"(?<=\w)'m\b", " am"),
    )

    # Apply all replacements in the order given above.
    for pattern, replacement in replacements:
        text = re.sub(pattern, replacement, text)

    # Post-processing fixes: capitalise the pronoun in "i am" and split
    # "cannot" (including the one produced from "can't") into "can not".
    text = re.sub(r"\bi am\b", "I am", text)
    text = re.sub(r"\b(cannot)\b", "can not", text)

    return text

In [59]:
# Expand contractions in every review (overwrites the 'review' column).
df['review'] = df['review'].apply(remove_abb)
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there is a family where a little boy...
4        petter mattei is "love in the time of money" i...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a catholic taught in parochial elementary...
49998    I am going to have to disagree with the previo...
49999    no one expects the star trek movies to be high...
Name: review, Length: 49582, dtype: object

In [60]:
# Spelling Corrections

def spelling_correction(text):
    """Return ``text`` after running TextBlob's spelling correction on it."""
    blob = TextBlob(text)
    corrected = blob.correct()
    return corrected.string

In [None]:
# NOTE(review): the corrected Series is only displayed, not assigned back to
# df['review'], so later cells work on the uncorrected text. This cell has no
# execution count; presumably it was skipped because correcting every review
# is slow - confirm before relying on spelling correction.
df['review'].apply(spelling_correction)

In [61]:
# Punctuation

def remove_punctuation(text):
    """Delete every ASCII punctuation character from ``text``.

    The characters removed are exactly those in ``string.punctuation``.
    Nothing is inserted in their place, so "well-made" becomes "wellmade".

    Args:
        text: The string to clean.

    Returns:
        The string without punctuation. Non-string input (e.g. a missing
        value) is returned unchanged, matching ``remove_abb``.
    """
    if not isinstance(text, str):
        return text

    # One translate pass instead of one scan-and-replace per punctuation mark.
    return text.translate(str.maketrans('', '', string.punctuation))

In [62]:
# Strip punctuation from every review (overwrites the 'review' column).
df['review'] = df['review'].apply(remove_punctuation)
df['review'].head()

0    one of the other reviewers has mentioned that ...
1    a wonderful little production the filming tech...
2    i thought this was a wonderful way to spend ti...
3    basically there is a family where a little boy...
4    petter mattei is love in the time of money is ...
Name: review, dtype: object

## Preprocessing Dataset

In [None]:
# Tokenize: split each cleaned review into a list of word tokens, stored in a
# new 'tokenized_review' column.
# NOTE(review): word_tokenize presumably needs NLTK tokenizer data to be
# available locally; no download step is visible in this section - confirm.

df['tokenized_review'] = df['review'].apply(word_tokenize)
df.head()

Unnamed: 0,review,sentiment,tokenized_review
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,a wonderful little production the filming tech...,positive,"[a, wonderful, little, production, the, filmin..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,..."
3,basically there is a family where a little boy...,negative,"[basically, there, is, a, family, where, a, li..."
4,petter mattei is love in the time of money is ...,positive,"[petter, mattei, is, love, in, the, time, of, ..."


In [79]:
# Stop Word Removal
#
# NOTE(review): the NLTK English stop-word list presumably contains negations
# such as "not" and "no"; dropping them may hurt sentiment features - confirm.

# Build the set once so each token is checked with a set lookup, not a list scan.
stopwords_set = set(stopwords.words('english'))

# Per-row Python filter (not a vectorized operation). dtype=object keeps the
# tokens as ordinary Python strings and makes an empty token list an empty
# object array; without it NumPy builds fixed-width unicode arrays and turns
# an empty list into a float64 array.
df['tokenized_review'] = df['tokenized_review'].apply(
    lambda x: np.array([w for w in x if w not in stopwords_set], dtype=object)
)

In [80]:
# Preview the frame after stop-word removal.
df.head()

Unnamed: 0,review,sentiment,tokenized_review
0,one of the other reviewers has mentioned that ...,positive,"[one, reviewers, mentioned, watching, 1, oz, e..."
1,a wonderful little production the filming tech...,positive,"[wonderful, little, production, filming, techn..."
2,i thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,basically there is a family where a little boy...,negative,"[basically, family, little, boy, jake, thinks,..."
4,petter mattei is love in the time of money is ...,positive,"[petter, mattei, love, time, money, visually, ..."


### EDA and Feature Engineering