In [3]:
%pip install numpy pandas seaborn matplotlib plotly scikit-learn nltk wordcloud pyspellchecker beautifulsoup4

Collecting wordcloud
  Downloading wordcloud-1.9.4-cp311-cp311-win_amd64.whl.metadata (3.5 kB)
Downloading wordcloud-1.9.4-cp311-cp311-win_amd64.whl (299 kB)
   ---------------------------------------- 0.0/299.9 kB ? eta -:--:--
   - -------------------------------------- 10.2/299.9 kB ? eta -:--:--
   - -------------------------------------- 10.2/299.9 kB ? eta -:--:--
   -- ------------------------------------- 20.5/299.9 kB 108.9 kB/s eta 0:00:03
   ---- ----------------------------------- 30.7/299.9 kB 163.8 kB/s eta 0:00:02
   ---- ----------------------------------- 30.7/299.9 kB 163.8 kB/s eta 0:00:02
   ----- ---------------------------------- 41.0/299.9 kB 151.3 kB/s eta 0:00:02
   ----- ---------------------------------- 41.0/299.9 kB 151.3 kB/s eta 0:00:02
   ------------ --------------------------- 92.2/299.9 kB 261.7 kB/s eta 0:00:01
   ------------ --------------------------- 92.2/299.9 kB 261.7 kB/s eta 0:00:01
   ------------- -------------------------- 102.4/299.9 kB 2

### Loading of Data

In [1]:
import os
from pathlib import Path

import pandas as pd

# Location of the IMDB reviews CSV. Set the IMDB_DATASET_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original local path is used, so existing behaviour is unchanged.
DATA_PATH = Path(
    os.environ.get(
        "IMDB_DATASET_PATH",
        r'C:\Users\tiled\OneDrive\Desktop\DataSet\IMDB-Dataset.csv',
    )
)

# The file is decoded as latin-1, as in the original load.
df = pd.read_csv(DATA_PATH, encoding='latin-1')

In [4]:
# Preview the first ten entries of the "Reviews" column.
df["Reviews"].head(10)

0    *Disclaimer: I only watched this movie as a co...
1    I am writing this in hopes that this gets put ...
2    Really, I could write a scathing review of thi...
3    If you saw the other previous spoof movies by ...
4    This movie I saw a day early for free and I st...
5    Honestly, what is wrong with you, Hollywood? N...
6    I was given a free ticket to this film; so I c...
7    OK, so "Disastrous" isn't an imaginative barb ...
8    Jason Friedberg and Aaron Seltzer, the way eve...
9    Honestly the worst movie ever made. Theatre fu...
Name: Reviews, dtype: object

### Customizing Stopwords for Enhanced Text Preprocessing

*Extending the default stopword list with domain-specific words, and taking the negation “not” out of it so that negated phrases survive stopword removal and keep their sentiment.*

In [9]:
from nltk.corpus import stopwords
import nltk

# The stopword corpus has to be available locally before stopwords.words() is
# called; fetch it first so the cell also works on a fresh environment.
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger')

# Start from NLTK's English stopword list (a plain list, extended below).
stop_words = stopwords.words('english')

# Domain-specific words that appear in almost every movie review and therefore
# carry little signal, plus a few modal verbs.
new_stopwords = [
"would", "shall", "could", "might", "movie", "movies", "film", "films",
"cinema", "director", "directors", "actor", "actors", "actress", "actresses",
"cast", "screenplay", "plot", "story", "character", "characters", "scene",
"scenes", "soundtrack", "soundtracks","sequel", "prequel", "adaptation", "trailer", "genre", "genres", "release",
"theaters", "theatre", "performance", "performances", "acting", "blockbuster",
"indie", "oscars", "oscar", "award-winning", "box", "office", "premiere",
"rating", "ratings", "reviews", "animation", "cinematography", "editing", "script", "narrative",
"direction", "score"
]

stop_words.extend(new_stopwords)
# Keep "not" in the text: dropping it would erase negation.
stop_words.remove("not")

In [10]:
# Switch from a list to a set: duplicate entries collapse and the stopword
# filter gets hash-based membership checks.
stop_words = set(stop_words)
print(len(stop_words))

233


### Data Cleaning and Text Preprocessing Pipeline

*Comprehensive functions for cleaning and preparing text data, including removing special characters, URLs, stopwords, and expanding contractions to enhance the quality of the dataset.*

### Correction of Typos

*Written text often contains errors, such as “Fen” instead of “Fan.” To rectify these errors, a dictionary is employed to map words to their correct forms based on similarity.*

In [None]:
from spellchecker import SpellChecker

spell = SpellChecker()

def correct_typos(content):
    """
    Replace misspelled words in a text with the spell checker's best guess.

    The text is split on whitespace; every token the spell checker does not
    know is replaced by its suggested correction. Tokens for which no
    suggestion exists are kept unchanged.

    Parameters:
    - content (str): The input text to be corrected.

    Returns:
    - str: The text with corrected typos. If anything goes wrong (for example
      a non-string input), the error is printed and the input is returned
      unchanged.
    """
    try:
        words = content.split()
        # unknown() reports misspellings in lowercase, so membership has to be
        # tested on the lowercased token; otherwise capitalised typos are missed.
        misspelled = spell.unknown(words)

        # correction() is expensive; look each distinct word up only once.
        suggestions = {}
        corrected_words = []

        for word in words:
            key = word.lower()
            if key not in misspelled:
                corrected_words.append(word)
                continue
            if key not in suggestions:
                suggestions[key] = spell.correction(word)
            # correction() returns None when it has no candidate; keep the
            # original token instead of letting None break the join below.
            corrected_words.append(suggestions[key] or word)

        return ' '.join(corrected_words)
    except Exception as e:
        print(f"Error in correcting typos: {e}")
        return content

### Mapping and Replacement

*This involves mapping words to standardized language equivalents. For instance, words like “b4” and “ttyl,” commonly understood by humans as “before” and “talk to you later,” pose challenges for machines. Normalization entails mapping such words to their standardized counterparts.*

In [24]:
import re

def load_mapping_dictionary():
    """
    Build the lookup table used to normalize informal tokens.

    Returns:
    - dict: A new dictionary on every call, mapping lowercase non-standard
      words to their standardized replacements.
    """
    # Chat/SMS-style abbreviations and respellings.
    shorthand = {
        "b4": "before",
        "ttyl": "talk to you later",
        "u": "you",
        "r": "are",
        "lol": "laughing out loud",
        "idk": "i do not know",
        "btw": "by the way",
        "omg": "oh my god",
        "imo": "in my opinion",
        "np": "no problem",
        "gr8": "great",
        "l8r": "later",
        "gtg": "got to go",
        "thx": "thanks",
        "pls": "please",
        "plz": "please",
        "bc": "because",
        "cuz": "because",
        "y'all": "you all",
        "luv": "love",
    }
    # Spoken-language reductions written as single words.
    spoken_forms = {
        "wanna": "want to",
        "gonna": "going to",
        "hafta": "have to",
        "kinda": "kind of",
        "sorta": "sort of",
        "gimme": "give me",
        "lemme": "let me",
        "whatcha": "what are you",
        "whaddaya": "what do you",
    }
    return {**shorthand, **spoken_forms}

def normalize_text(content, mapping_dict):
    """
    Replace non-standard tokens (slang, abbreviations) with standard wording.

    Each whitespace-separated token is looked up in ``mapping_dict`` in three
    forms, in this order: lowercased as written, lowercased with surrounding
    punctuation removed, and lowercased with all punctuation removed. The
    first form found wins and the whole token (including any attached
    punctuation) is replaced by the mapped value. Tokens without a match are
    kept exactly as written.

    Parameters:
    - content (str): The input text to be normalized.
    - mapping_dict (dict): A dictionary mapping non-standard words to standardized words.

    Returns:
    - str: The normalized text. If an error occurs (for example a non-string
      input), the error is printed and the input is returned unchanged.
    """
    try:
        normalized_words = []

        for word in content.split():
            lowered = word.lower()
            candidates = (
                # As written: lets keys containing punctuation ("y'all") match.
                lowered,
                # Without leading/trailing punctuation: "gr8!" -> "gr8".
                re.sub(r'^[^\w\s]+|[^\w\s]+$', '', lowered),
                # Without any punctuation (the original lookup form).
                re.sub(r'[^\w\s]', '', lowered),
            )
            for candidate in candidates:
                if candidate in mapping_dict:
                    normalized_words.append(mapping_dict[candidate])
                    break
            else:
                normalized_words.append(word)

        return ' '.join(normalized_words)
    except Exception as e:
        print(f"Error in normalizing text: {e}")
        return content

### Expanding Contractions

*Convert contractions like “don’t” to “do not” or “I’ll” to “I will.”*

In [None]:
# Irregular contractions and apostrophe-free informal forms. These cannot be
# derived from a suffix rule and are only replaced when they stand as a whole
# word, so "lets" no longer fires inside "bullets".
_WHOLE_WORD_CONTRACTIONS = {
    "won't": "will not",
    "can't": "cannot",
    "shan't": "shall not",
    "ain't": "am not",
    "i'm": "i am",
    "let's": "let us",
    "ma'am": "madam",
    "maam": "madam",
    "y'all": "you all",
    "'bout": "about",
    "'til": "until",
    "lets": "let us",
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    "hafta": "have to",
    "gimme": "give me",
    "lemme": "let me",
    "whatcha": "what are you",
    "whaddaya": "what do you",
    "coulda": "could have",
    "woulda": "would have",
    "shoulda": "should have",
}

# Regular contractions are handled by their suffix. This covers every
# "<word>n't", "<word>'s", "<word>'ve", ... form (he's, don't, could've,
# wouldn't've, you'd've, ...) without listing each one.
_SUFFIX_CONTRACTIONS = {
    "n't": " not",
    "'re": " are",
    "'s": " is",
    "'d": " would",
    "'ll": " will",
    "'ve": " have",
    "'m": " am",
}

# Accept both the ASCII apostrophe and the typographic one (U+2019).
_APOSTROPHE_CLASS = "['\u2019]"


def _compile_contractions(forms, lookbehind):
    """Compile one case-insensitive alternation for ``forms``, longest first."""
    ordered = sorted(forms, key=len, reverse=True)
    alternatives = "|".join(
        _APOSTROPHE_CLASS.join(re.escape(piece) for piece in form.split("'"))
        for form in ordered
    )
    # (?!\w): the contraction must end where the word ends.
    return re.compile(lookbehind + "(" + alternatives + r")(?!\w)", flags=re.IGNORECASE)


def _expander(table):
    """Return a re.sub callback that maps a matched contraction via ``table``."""
    def replace(match):
        key = match.group(1).lower().replace("\u2019", "'")
        # Fall back to the matched text if case folding produced an unknown key.
        return table.get(key, match.group(0))
    return replace


# Whole-word forms must not be preceded by a word character; suffix forms
# must be attached to one.
_WHOLE_WORD_RE = _compile_contractions(_WHOLE_WORD_CONTRACTIONS, r"(?<!\w)")
_SUFFIX_RE = _compile_contractions(_SUFFIX_CONTRACTIONS, r"(?<=\w)")


def contraction_expansion(content):
    """
    Expand English contractions, e.g. "don't" -> "do not", "I'll" -> "I will".

    Irregular forms ("won't", "can't", "gonna", ...) are replaced first as
    whole words; the regular suffixes ("n't", "'s", "'re", "'d", "'ll", "'ve",
    "'m") are then expanded wherever they end a word. Matching is
    case-insensitive and the inserted expansion is always lowercase. Note that
    "'s" is always expanded to " is", so possessives are expanded too.

    Parameters:
    - content (str): The input text.

    Returns:
    - str: Text with expanded contractions.
    """
    content = _WHOLE_WORD_RE.sub(_expander(_WHOLE_WORD_CONTRACTIONS), content)
    content = _SUFFIX_RE.sub(_expander(_SUFFIX_CONTRACTIONS), content)
    return content

### Removing Accents and Diacritics
*Sometimes, people use accented characters like é, ö, etc. to signify emphasis on a particular letter during pronunciation. Normalize text by removing accents and diacritical marks from characters.*

In [25]:
import unicodedata

def remove_accents(content):
    """
    Strip accents and other combining marks from a text.

    The text is decomposed with Unicode NFKD normalization and every
    combining character is then dropped, leaving the base characters.

    Parameters:
    - content (str): The input text to be normalized.

    Returns:
    - str: The text with accents and diacritics removed. If normalization
      fails, the error is printed and the input is returned unchanged.
    """
    try:
        decomposed = unicodedata.normalize('NFKD', content)
        # combining() is 0 for base characters and non-zero for marks.
        base_chars = (ch for ch in decomposed if unicodedata.combining(ch) == 0)
        return ''.join(base_chars)
    except Exception as e:
        print(f"Error in removing accents: {e}")
        return content

### Removing Extra Whitespace

*Normalize text by removing extra spaces and leading/trailing spaces*

In [None]:
def remove_extra_whitespace(text):
    """
    Collapse every run of whitespace into a single space and trim both ends.

    Parameters:
    - text (str): The input text to be normalized.

    Returns:
    - str: The text with extra spaces removed.
    """
    # split() with no argument discards leading/trailing whitespace and treats
    # any run of whitespace as one separator.
    tokens = text.split()
    return ' '.join(tokens)

### Eliminating HTML Tags

*In cases where raw text originates from sources such as web scraping or screen capture, it often carries along HTML tags. These tags introduce unwanted noise and contribute little to the comprehension and analysis of the text. Therefore, it becomes necessary to strip them.*

In [None]:
from bs4 import BeautifulSoup

def remove_html_tags(text):
    """
    Strip HTML markup from a text and return only its textual content.

    Text fragments are joined with a single space so that words separated
    only by a tag (for example "great<br />movie") do not get glued together.
    This can leave repeated spaces; collapse them afterwards if needed.

    Parameters:
    - text (str): The input text containing HTML tags.

    Returns:
    - str: The text with HTML tags removed.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text(separator=" ")

### Handling URLs

*Frequently, individuals include URLs, particularly in social media content, to supplement context with additional information. However, URLs tend to vary across samples and can be considered noise.*

In [None]:
import re 

def remove_url(text):
    """
    Remove web addresses from a text.

    A URL is a run of non-space characters that starts with "http://",
    "https://" or "www." (case-insensitive). The scheme is required, so an
    ordinary colon such as in "Rating:8/10" is left untouched.

    Parameters:
    - text (str): The input text.

    Returns:
    - str: The text with URLs removed (surrounding whitespace is kept).
    """
    return re.sub(r"(?:https?://|www\.)\S+", "", text, flags=re.IGNORECASE)

### Case Standardising and Removing Special Characters

*Special characters are non-alphanumeric characters such as %, $, and &. In most NLP tasks they add no value to text understanding and only introduce noise, so we lowercase the text and use a regular expression to replace them with spaces.*

In [None]:
def remove_special_character(content):
    """
    Lowercase a text and replace special characters with spaces.

    Every run of non-word characters (anything other than letters, digits
    and underscore) becomes a single space.

    Parameters:
    - content (str): The input text.

    Returns:
    - str: The lowercased text with special characters replaced by spaces.
    """
    lowered = content.lower()
    non_word_run = re.compile(r'\W+')
    return non_word_run.sub(' ', lowered)

### Removing Stopwords

*In most cases, stopwords like I, am, me, etc. don’t add any information that can help in modeling. Keeping them in the text introduces unnecessary noise and can significantly increase the dimensionality of feature vectors, which can negatively impact both computation cost and model accuracy.*

In [None]:
def remove_stopwords(content):
    """
    Drop stopwords and non-alphabetic tokens from a text.

    The text is split on whitespace and each token is lowercased. A token is
    kept only if it is not in the notebook-level ``stop_words`` collection and
    consists solely of alphabetic characters.

    Parameters:
    - content (str): The input text.

    Returns:
    - str: The remaining lowercase tokens joined by single spaces.
    """
    lowered_tokens = (token.strip().lower() for token in content.split())
    kept = [
        token
        for token in lowered_tokens
        if token not in stop_words and token.isalpha()
    ]
    return " ".join(kept)

In [None]:
def data_cleaning(content):
    """
    Run the full text-cleaning pipeline on a single review.

    The order of the steps matters:
    1. HTML is stripped first, so URLs inside tag attributes do not swallow
       neighbouring markup when URLs are removed.
    2. URLs and accents are removed.
    3. Slang and contractions are expanded while apostrophes are still
       present; after special-character removal "don't" would already be
       "don t" and the negation would be lost.
    4. Special characters are removed and the text is lowercased.
    5. Typos are corrected on the cleaned tokens.
    6. Whitespace is collapsed and stopwords are dropped.

    Parameters:
    - content (str): The raw review text.

    Returns:
    - str: The cleaned text.
    """
    content = remove_html_tags(content)
    content = remove_url(content)
    content = remove_accents(content)

    # Expand informal wording before punctuation is stripped.
    mapping_dict = load_mapping_dictionary()
    content = normalize_text(content, mapping_dict)
    content = contraction_expansion(content)

    content = remove_special_character(content)
    # Spell-check last among the rewriting steps, so slang has already been
    # mapped and tokens no longer carry punctuation.
    content = correct_typos(content)

    content = remove_extra_whitespace(content)
    content = remove_stopwords(content)
    return content