In [1]:
# Data Preprocessing for NLP Pipeline
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.model_selection import train_test_split
import pickle

# Download required NLTK data
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [2]:
print("LOADING DATA")
print("="*50)
df = pd.read_csv('../data/raw/IMDB_Dataset.csv')
print(f"Original dataset shape: {df.shape}")

LOADING DATA
Original dataset shape: (50000, 2)


In [3]:
# Remove duplicates
print("\n" + "="*50)
print("REMOVING DUPLICATES")
print("="*50)
print(f"Duplicates found: {df.duplicated().sum()}")
df = df.drop_duplicates()
print(f"Shape after removing duplicates: {df.shape}")


REMOVING DUPLICATES
Duplicates found: 418
Shape after removing duplicates: (49582, 2)


In [4]:
# Handle missing values
print("\n" + "="*50)
print("HANDLING MISSING VALUES")
print("="*50)
print(f"Missing values:\n{df.isnull().sum()}")
df = df.dropna()
print(f"Shape after removing missing values: {df.shape}")


HANDLING MISSING VALUES
Missing values:
review       0
sentiment    0
dtype: int64
Shape after removing missing values: (49582, 2)


In [5]:
# Text Preprocessing Functions
def remove_html_tags(text):
    """Remove HTML tags from text"""
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def remove_urls(text):
    """Remove URLs from text"""
    return re.sub(r'http\S+|www.\S+', '', text)

def remove_special_chars(text):
    """Remove special characters and digits"""
    return re.sub(r'[^a-zA-Z\s]', '', text)

def convert_to_lowercase(text):
    """Convert text to lowercase"""
    return text.lower()

def remove_extra_whitespace(text):
    """Remove extra whitespace"""
    return ' '.join(text.split())

def remove_stopwords(text):
    """Remove stopwords"""
    stop_words = set(stopwords.words('english'))
    words = text.split()
    return ' '.join([word for word in words if word not in stop_words])

def lemmatize_text(text):
    """Lemmatize text"""
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    return ' '.join([lemmatizer.lemmatize(word) for word in words])

def stem_text(text):
    """Stem text"""
    stemmer = PorterStemmer()
    words = text.split()
    return ' '.join([stemmer.stem(word) for word in words])

In [6]:
# Apply preprocessing pipeline
print("\n" + "="*50)
print("PREPROCESSING TEXT")
print("="*50)

# Create a copy for preprocessing
df['cleaned_review'] = df['review'].copy()

print("Step 1: Removing HTML tags...")
df['cleaned_review'] = df['cleaned_review'].apply(remove_html_tags)

print("Step 2: Removing URLs...")
df['cleaned_review'] = df['cleaned_review'].apply(remove_urls)

print("Step 3: Converting to lowercase...")
df['cleaned_review'] = df['cleaned_review'].apply(convert_to_lowercase)

print("Step 4: Removing special characters and digits...")
df['cleaned_review'] = df['cleaned_review'].apply(remove_special_chars)

print("Step 5: Removing extra whitespace...")
df['cleaned_review'] = df['cleaned_review'].apply(remove_extra_whitespace)

print("Step 6: Removing stopwords...")
df['cleaned_review'] = df['cleaned_review'].apply(remove_stopwords)

print("Step 7: Lemmatizing text...")
df['cleaned_review'] = df['cleaned_review'].apply(lemmatize_text)

print("\n✓ Preprocessing completed!")


PREPROCESSING TEXT
Step 1: Removing HTML tags...
Step 2: Removing URLs...
Step 3: Converting to lowercase...
Step 4: Removing special characters and digits...
Step 5: Removing extra whitespace...
Step 6: Removing stopwords...
Step 7: Lemmatizing text...

✓ Preprocessing completed!


In [7]:
# Display sample
print("\n" + "="*50)
print("SAMPLE COMPARISON")
print("="*50)
print("\nOriginal Review:")
print(df['review'].iloc[0][:200] + "...")
print("\nCleaned Review:")
print(df['cleaned_review'].iloc[0][:200] + "...")


SAMPLE COMPARISON

Original Review:
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me abo...

Cleaned Review:
one reviewer mentioned watching oz episode youll hooked right exactly happened methe first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pu...


In [8]:
# Encode labels
print("\n" + "="*50)
print("ENCODING LABELS")
print("="*50)
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})
print(f"Label distribution:\n{df['label'].value_counts()}")


ENCODING LABELS
Label distribution:
label
1    24884
0    24698
Name: count, dtype: int64


In [None]:
# Remove empty reviews after preprocessing
df = df[df['cleaned_review'].str.strip().str.len() > 0]
print(f"\nShape after removing empty reviews: {df.shape}")

In [10]:
# Split data
print("\n" + "="*50)
print("SPLITTING DATA")
print("="*50)

X = df['cleaned_review']
y = df['label']

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

print(f"\nTraining set distribution:\n{y_train.value_counts()}")
print(f"\nValidation set distribution:\n{y_val.value_counts()}")
print(f"\nTest set distribution:\n{y_test.value_counts()}")


SPLITTING DATA
Training set size: 34707
Validation set size: 7437
Test set size: 7438

Training set distribution:
label
1    17419
0    17288
Name: count, dtype: int64

Validation set distribution:
label
1    3732
0    3705
Name: count, dtype: int64

Test set distribution:
label
1    3733
0    3705
Name: count, dtype: int64


In [11]:
# Save processed data
print("\n" + "="*50)
print("SAVING PROCESSED DATA")
print("="*50)

# Save full processed dataset
df[['review', 'cleaned_review', 'sentiment', 'label']].to_csv(
    '../data/processed/processed_data.csv', index=False
)
print("✓ Full processed data saved")


SAVING PROCESSED DATA
✓ Full processed data saved


In [12]:
# Save train/val/test splits
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
val_df = pd.DataFrame({'text': X_val, 'label': y_val})
test_df = pd.DataFrame({'text': X_test, 'label': y_test})

train_df.to_csv('../data/processed/train.csv', index=False)
val_df.to_csv('../data/processed/val.csv', index=False)
test_df.to_csv('../data/processed/test.csv', index=False)

print("✓ Train/Val/Test splits saved")

✓ Train/Val/Test splits saved


In [13]:
# Save preprocessing configuration
preprocessing_config = {
    'steps': [
        'remove_html_tags',
        'remove_urls',
        'convert_to_lowercase',
        'remove_special_chars',
        'remove_extra_whitespace',
        'remove_stopwords',
        'lemmatize_text'
    ],
    'label_encoding': {'positive': 1, 'negative': 0}
}

with open('../models/preprocessing_config.pkl', 'wb') as f:
    pickle.dump(preprocessing_config, f)

print("✓ Preprocessing configuration saved")

print("\n" + "="*50)
print("PREPROCESSING COMPLETE")
print("="*50)


✓ Preprocessing configuration saved

PREPROCESSING COMPLETE
