<a href="https://colab.research.google.com/github/Ehtisham1053/Natural-Language-Processing/blob/main/Text_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import random

# Build a small synthetic movie-review dataset whose text deliberately contains
# HTML tags, URLs, chat abbreviations and emojis, so that each preprocessing
# step in this notebook has something to clean.

# Fixed seed so that "Restart & Run All" regenerates exactly the same reviews.
SEED = 42
N_REVIEWS = 100
# Private generator: reproducible without touching the global `random` state.
rng = random.Random(SEED)

# Define sample movie names, genres, and example review components
movie_names = ["Titanic", "The Conjuring", "Avatar", "Inception", "La La Land", "The Matrix", "Interstellar",
               "The Godfather", "Parasite", "Joker", "Shutter Island", "The Dark Knight", "Pulp Fiction",
               "Fight Club", "Forrest Gump", "Gladiator", "The Revenant", "Avengers", "Spider-Man", "Deadpool"]
genres = ["Romantic", "Horror", "Sci-Fi", "Thriller", "Musical", "Action", "Drama", "Mystery", "Comedy"]

# Sample text patterns to introduce preprocessing challenges
html_tags = ["<b>amazing</b>", "<i>scary</i>", "<div>must watch</div>"]
urls = ["https://movies.com", "www.watchnow.com", "http://imdb.com"]
abbreviations = ["gr8", "omg", "u", "btw", "luv", "wont", "dont", "hes", "shes", "cant", "im"]
# NOTE(review): these emoji literals look mis-encoded (mojibake) in this view of
# the file - confirm the notebook is stored as UTF-8. They must stay identical
# to the keys of `emoji_dict` used by the emoji-handling step.
emojis = ["üíñ", "üò≠", "üò®", "ü§Ø", "üé•", "‚ù§Ô∏è", "üåü", "üòÇ", "üî•", "üòé"]
reactions = ['mind-blowing!', 'terrifying!', 'boring!', 'thrilling!', 'unexpected!']

# Generate N_REVIEWS rows of [movie, review, genre]
data = []
for _ in range(N_REVIEWS):
    movie = rng.choice(movie_names)
    genre = rng.choice(genres)
    # Each review chains: HTML-wrapped adjective + emoji, a reaction, a URL,
    # and a chat abbreviation.
    review = f"This movie was {rng.choice(html_tags)} {rng.choice(emojis)}. "
    review += f"Totally {rng.choice(reactions)} "
    review += f"{rng.choice(urls)} "
    review += f"I think it's {rng.choice(abbreviations)}!"

    data.append([movie, review, genre])

# Convert to DataFrame
df = pd.DataFrame(data, columns=["Movie Name", "Review", "Genre"])

# Save to CSV (index=False keeps the row index out of the file)
file_path = "movie_reviews_dataset.csv"
df.to_csv(file_path, index=False)

# Display the path of the file that was written
file_path


'movie_reviews_dataset.csv'

In [2]:
# Reload the dataset from the CSV at `file_path`, replacing the in-memory frame,
# so the preprocessing steps below start from what was saved to disk.
df = pd.read_csv(file_path)
# Preview the first rows.
df.head()

Unnamed: 0,Movie Name,Review,Genre
0,Fight Club,This movie was <b>amazing</b> üòÇ. Totally terri...,Drama
1,Deadpool,This movie was <i>scary</i> ü§Ø. Totally mind-bl...,Horror
2,Interstellar,This movie was <b>amazing</b> ‚ù§Ô∏è. Totally thri...,Musical
3,Avengers,This movie was <i>scary</i> üåü. Totally unexpec...,Action
4,La La Land,This movie was <b>amazing</b> ‚ù§Ô∏è. Totally bori...,Drama


## 2. Lowercasing

In [3]:
# Lowercase every review, overwriting the column, so later lookups such as the
# lowercase keys of `short_words` match regardless of the original casing.
df["Review"] = df["Review"].str.lower()


## 3. Removing HTML Tags
Use BeautifulSoup (with Python's built-in `html.parser`) to strip the tags and keep only the visible text.

In [4]:
from bs4 import BeautifulSoup

def remove_html(text):
    """Return ``text`` with HTML markup stripped, keeping only its text content.

    The string is parsed with BeautifulSoup using the ``"html.parser"`` backend
    and the text of the parsed document is returned.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# Strip HTML tags from every review, overwriting the "Review" column.
df["Review"] = df["Review"].apply(remove_html)


## 4. Removing URLs

In [5]:
import re

# A URL starts at a word boundary with an explicit scheme ("http://" or
# "https://") or with "www." and runs up to the next whitespace character.
# Requiring "://" or the dot keeps ordinary words such as "awwww" or "httpd"
# from being mistaken for links.
_URL_PATTERN = re.compile(r"\b(?:https?://|www\.)\S+", flags=re.IGNORECASE)


def remove_urls(text):
    """Return ``text`` with every URL deleted.

    A URL is any run of non-whitespace characters that begins, at a word
    boundary, with ``http://``, ``https://`` or ``www.`` (case-insensitive).
    Only the URL itself is removed; the whitespace around it is left as is,
    so a URL in the middle of a sentence leaves two adjacent spaces.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` without URLs.
    """
    return _URL_PATTERN.sub("", text)

# Delete URLs from every review, overwriting the "Review" column.
df["Review"] = df["Review"].apply(remove_urls)


## 5. Removing Punctuation

In [6]:
import string

# Deletion table built once: every character of string.punctuation maps to None.
_PUNCTUATION_TABLE = dict.fromkeys(map(ord, string.punctuation))


def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Characters are removed, not replaced by a space, so ``"mind-blowing"``
    becomes ``"mindblowing"`` and ``"it's"`` becomes ``"its"``. Characters
    outside ``string.punctuation`` are left untouched.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Delete punctuation characters from every review, overwriting the "Review" column.
df["Review"] = df["Review"].apply(remove_punctuation)


## 6. Handling Short/Chat Words (Expanding Abbreviations)

In [7]:
# Chat/SMS abbreviations mapped to their expanded form. Keys are lowercase and
# contain no apostrophes, matching text that has already been lowercased and
# stripped of punctuation.
short_words = {
    "u": "you", "r": "are", "ur": "your", "btw": "by the way",
    "idk": "i do not know", "omg": "oh my god", "imo": "in my opinion",
    "lol": "laugh out loud", "gn": "good night", "brb": "be right back",
    # Abbreviations that this notebook's generated reviews contain (see the
    # `abbreviations` list) but that were missing here and so never expanded.
    "gr8": "great", "luv": "love", "im": "i am",
    "hes": "he is", "shes": "she is",
    "dont": "do not", "wont": "will not", "cant": "cannot",
}

def expand_short_words(text):
    """Return ``text`` with known chat abbreviations replaced by full phrases.

    The text is split on whitespace; each word that is a key of
    ``short_words`` is swapped for its expansion and every other word is kept
    unchanged. Matching is exact and case-sensitive. Words are re-joined with
    single spaces, so runs of whitespace collapse and an empty string stays
    empty.

    Parameters
    ----------
    text : str
        Lowercased, punctuation-free text.

    Returns
    -------
    str
        The text with abbreviations expanded.
    """
    return " ".join(short_words.get(word, word) for word in text.split())

# Expand chat abbreviations in every review, overwriting the "Review" column.
df["Review"] = df["Review"].apply(expand_short_words)


## 7. Spell Checking

In [8]:
from textblob import TextBlob

def correct_spelling(text):
    """Return ``text`` after TextBlob's spelling correction, as a plain ``str``.

    NOTE(review): in this notebook's saved output the review word "scary"
    appears as "scar" at the end of the pipeline - presumably changed by this
    step; verify before relying on the corrected text.
    """
    blob = TextBlob(text)
    corrected = blob.correct()
    return str(corrected)

# Run spelling correction on every review, overwriting the "Review" column.
df["Review"] = df["Review"].apply(correct_spelling)


## 8. Removing Stop Words

In [9]:
import nltk
from nltk.corpus import stopwords

# Fetch NLTK's stop-word corpus (the recorded output shows it being downloaded
# and unzipped under /root/nltk_data).
nltk.download("stopwords")
# Stored as a set so the per-word membership test in remove_stopwords is a hash lookup.
stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    """Return ``text`` without the words contained in ``stop_words``.

    The text is split on whitespace, words found in the module-level
    ``stop_words`` set are dropped (exact, case-sensitive match), and the
    remaining words are joined back with single spaces.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Drop stop words from every review, overwriting the "Review" column.
df["Review"] = df["Review"].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## 9. Handling Emojis

We replace each emoji with a descriptive word instead of removing it, so the meaning it carries is kept in the text.

In [10]:
# Emoji -> descriptive word used in its place.
# NOTE(review): the emoji keys look mis-encoded (mojibake) in this view of the
# file - confirm the notebook is stored as UTF-8. They must stay identical to
# the literals in the `emojis` list used to generate the reviews.
emoji_dict = {
    "üíñ": "love", "üò≠": "crying", "üò®": "scared", "ü§Ø": "mind-blown", "üé•": "movie",
    "‚ù§Ô∏è": "heart", "üåü": "star", "üòÇ": "laughing", "üî•": "fire", "üòé": "cool"
}

def replace_emojis(text):
    """Return ``text`` with each known emoji replaced by its descriptive word.

    Every key of ``emoji_dict`` found in ``text`` is replaced by the mapped
    word as a separate token, so an emoji written directly against a word or
    another emoji does not fuse with it (two adjacent "fire" emojis become
    "fire fire", not "firefire"). Whitespace is then normalised to single
    spaces with no leading or trailing space, the same normalisation
    ``expand_short_words`` and ``remove_stopwords`` apply.

    Parameters
    ----------
    text : str
        The text to process.

    Returns
    -------
    str
        The text with emojis spelled out as words.
    """
    for emoji_char, meaning in emoji_dict.items():
        # Pad with spaces so the inserted word is always its own token.
        text = text.replace(emoji_char, f" {meaning} ")
    # Collapse the padding (and any other whitespace runs) to single spaces.
    return " ".join(text.split())

# Replace emojis with words in every review, overwriting the "Review" column.
df["Review"] = df["Review"].apply(replace_emojis)


## 10. Tokenization

Word tokenization: split each cleaned review into a list of word tokens.

In [12]:
import nltk
from nltk.tokenize import word_tokenize

# Download the 'punkt_tab' tokenizer data before calling word_tokenize
# (the recorded output shows it being unzipped under tokenizers/).
nltk.download('punkt_tab')

# Tokenize each review into a list of tokens, stored in a new column;
# the "Review" column itself is left unchanged.
df["Tokenized Review"] = df["Review"].apply(word_tokenize)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


## 11. Stemming

Reduce words to their root form using PorterStemmer.

In [13]:
from nltk.stem import PorterStemmer

# One shared Porter stemmer instance, reused for every token.
stemmer = PorterStemmer()

def apply_stemming(words):
    """Return a new list holding the Porter stem of each token in ``words``.

    Stems are truncated root forms rather than dictionary words; this
    notebook's saved output shows 'movie' -> 'movi' and 'amazing' -> 'amaz'.
    """
    return list(map(stemmer.stem, words))

# Stem the token lists into a new "Stemmed Review" column; the tokens themselves are kept.
df["Stemmed Review"] = df["Tokenized Review"].apply(apply_stemming)


## 12. Lemmatization

Reduce words to their base form using WordNet Lemmatizer.

In [14]:
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet corpus, then create one lemmatizer instance that is
# reused for every token.
nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()

def apply_lemmatization(words):
    """Return a new list holding the WordNet lemma of each token in ``words``.

    NOTE(review): ``lemmatize`` is called without a part-of-speech argument,
    and this notebook's saved output shows 'amazing' and 'laughing' unchanged -
    presumably because tokens are treated as nouns by default; confirm against
    the NLTK documentation.
    """
    lemmas = []
    for token in words:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatize the token lists into a new "Lemmatized Review" column; the tokens themselves are kept.
df["Lemmatized Review"] = df["Tokenized Review"].apply(apply_lemmatization)


[nltk_data] Downloading package wordnet to /root/nltk_data...


In [15]:
# Save the preprocessed frame (cleaned text plus the token/stem/lemma columns)
# without the row index.
# NOTE(review): the recorded output of the re-read file shows the list columns
# as quoted strings such as "['movie', ...]", i.e. they do not come back as
# Python lists.
df.to_csv("movie_reviews_preprocessed.csv", index=False)


In [16]:
# Read the saved file back into a separate frame, leaving `df` untouched.
df1 = pd.read_csv("movie_reviews_preprocessed.csv")
# Preview the first rows of the re-read data.
df1.head()

Unnamed: 0,Movie Name,Review,Genre,Tokenized Review,Stemmed Review,Lemmatized Review
0,Fight Club,movie amazing laughing totally terrifying think,Drama,"['movie', 'amazing', 'laughing', 'totally', 't...","['movi', 'amaz', 'laugh', 'total', 'terrifi', ...","['movie', 'amazing', 'laughing', 'totally', 't..."
1,Deadpool,movie scar mind-blown totally mindblowing thin...,Horror,"['movie', 'scar', 'mind-blown', 'totally', 'mi...","['movi', 'scar', 'mind-blown', 'total', 'mindb...","['movie', 'scar', 'mind-blown', 'totally', 'mi..."
2,Interstellar,movie amazing heart totally thrilling think,Musical,"['movie', 'amazing', 'heart', 'totally', 'thri...","['movi', 'amaz', 'heart', 'total', 'thrill', '...","['movie', 'amazing', 'heart', 'totally', 'thri..."
3,Avengers,movie scar star totally unexpected think oh god,Action,"['movie', 'scar', 'star', 'totally', 'unexpect...","['movi', 'scar', 'star', 'total', 'unexpect', ...","['movie', 'scar', 'star', 'totally', 'unexpect..."
4,La La Land,movie amazing heart totally boring think way,Drama,"['movie', 'amazing', 'heart', 'totally', 'bori...","['movi', 'amaz', 'heart', 'total', 'bore', 'th...","['movie', 'amazing', 'heart', 'totally', 'bori..."
