# Step 2: Data Preprocessing and Text Cleaning

This notebook covers the second step of our sentiment analysis project:
- Loading the dataset from Step 1
- Text cleaning and preprocessing
- Removing stopwords, punctuation, and special characters
- Converting text to lowercase
- Tokenization and stemming

## Import Required Libraries

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For text processing
import re
import string

# NLTK for natural language processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download required NLTK data.
# 'punkt_tab' is fetched alongside 'punkt' because recent NLTK releases make
# word_tokenize load the tokenizer model from 'punkt_tab'; with only 'punkt'
# installed, the first tokenize call fails with a LookupError on a fresh
# environment. Older releases keep using 'punkt', so both are requested.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Set random seed for reproducibility
np.random.seed(42)

print("Libraries imported successfully!")

## Load Dataset

In [None]:
# Read the review CSV produced by Step 1 into a DataFrame
df = pd.read_csv('../data/imdb_reviews.csv')

# Report success and the frame's dimensions, then preview the first rows
print(
    "Dataset loaded successfully!",
    f"Shape: {df.shape}",
    "\nFirst few rows:",
    sep="\n",
)
df.head()

## Text Preprocessing Functions

Let's create functions to clean and preprocess the text data.

In [None]:
# Shared module-level helpers used by the preprocessing functions below:
# the English stopword list (held as a set for membership tests) and a
# single Porter stemmer instance.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    """
    Clean and normalize a raw review string.

    Steps, in order: lowercase, replace HTML tags with a space, delete
    ASCII punctuation (``string.punctuation``), collapse whitespace runs
    into single spaces and trim the ends.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (the marker pandas uses for a
        missing CSV cell) are treated as empty text. Any other non-string
        value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text; an empty string for missing input.
    """
    # Missing values would otherwise crash on .lower(); NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = str(text).lower()

    # Replace HTML tags (e.g. "<br />") with a space BEFORE stripping
    # punctuation. Otherwise only the angle brackets and slash are removed
    # and the tag name is glued onto the neighbouring word
    # ("movie!<br />" -> "moviebr"). The pattern requires a letter right
    # after "<" or "</" so plain text such as "<3" is left alone.
    text = re.sub(r'</?[a-z][^<>]*>', ' ', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove extra whitespace
    return ' '.join(text.split())

def remove_stopwords(text):
    """
    Drop every token that appears in the module-level ``stop_words`` set.

    The text is split with ``word_tokenize``; the surviving tokens are
    returned joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

def stem_text(text):
    """
    Stem each token of ``text`` with the module-level ``stemmer``.

    The text is split with ``word_tokenize`` and the stemmed tokens are
    returned joined by single spaces.
    """
    return ' '.join(map(stemmer.stem, word_tokenize(text)))

def preprocess_text(text):
    """
    Run the full preprocessing pipeline on one piece of text.

    Applies, in order: ``clean_text``, ``remove_stopwords``, ``stem_text``,
    feeding each stage's output into the next, and returns the final string.
    """
    for stage in (clean_text, remove_stopwords, stem_text):
        text = stage(text)
    return text

# Confirmation message so the reader can see this definitions cell has run
print("Text preprocessing functions defined successfully!")

## Test Preprocessing Functions

Let's test our preprocessing functions on sample text.

In [None]:
# Walk one hand-written review through each stage individually, then
# through the complete pipeline, printing the result of every step.
sample_text = "This movie was absolutely fantastic! The acting was superb and the plot was engaging throughout."
print("Original text:", sample_text, sep="\n")

cleaned = clean_text(sample_text)
print("\nAfter cleaning:", cleaned, sep="\n")

no_stopwords = remove_stopwords(cleaned)
print("\nAfter removing stopwords:", no_stopwords, sep="\n")

stemmed = stem_text(no_stopwords)
print("\nAfter stemming:", stemmed, sep="\n")

preprocessed = preprocess_text(sample_text)
print("\nComplete preprocessing:", preprocessed, sep="\n")

## Apply Preprocessing to Dataset

In [None]:
# Run the full pipeline over every review
print("Applying preprocessing to all reviews...")

# assign() returns a new frame carrying the extra 'cleaned_review' column,
# so the originally loaded df is left as it was.
df_processed = df.assign(cleaned_review=df['review'].apply(preprocess_text))

print("Preprocessing completed!")
print(f"\nDataset shape: {df_processed.shape}")
print(f"New columns: {df_processed.columns.tolist()}")

In [None]:
# Show the first three reviews side by side: raw text vs. pipeline output
print("Comparison of Original vs Cleaned Reviews:", "=" * 50, sep="\n")

for i in range(3):
    print(f"\nExample {i+1}:")
    row = df_processed.iloc[i]  # positional lookup, independent of index labels
    print(f"Original: {row['review']}")
    print(f"Cleaned:  {row['cleaned_review']}")
    print("-" * 40)

## Text Statistics After Preprocessing

In [None]:
# Per-review size measures: character counts and whitespace-separated word
# counts, for both the raw and the cleaned text.
df_processed['original_length'] = df_processed['review'].str.len()
df_processed['cleaned_length'] = df_processed['cleaned_review'].str.len()
df_processed['original_words'] = df_processed['review'].str.split().str.len()
df_processed['cleaned_words'] = df_processed['cleaned_review'].str.split().str.len()

# Compute each average once and reuse it for both the report and the ratios
avg_original_length = df_processed['original_length'].mean()
avg_cleaned_length = df_processed['cleaned_length'].mean()
avg_original_words = df_processed['original_words'].mean()
avg_cleaned_words = df_processed['cleaned_words'].mean()

print("Text Statistics Summary:")
print("=" * 30)
print(f"Average original text length: {avg_original_length:.1f} characters")
print(f"Average cleaned text length: {avg_cleaned_length:.1f} characters")
print(f"Average original word count: {avg_original_words:.1f} words")
print(f"Average cleaned word count: {avg_cleaned_words:.1f} words")

# Percentage by which the average size shrank after preprocessing
char_reduction = (1 - avg_cleaned_length / avg_original_length) * 100
word_reduction = (1 - avg_cleaned_words / avg_original_words) * 100

print("\nReduction after preprocessing:")
print(f"Character reduction: {char_reduction:.1f}%")
print(f"Word reduction: {word_reduction:.1f}%")

In [None]:
# 2x2 grid of histograms: character lengths on the top row, word counts on
# the bottom row; original text on the left, cleaned text on the right.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (axis, column to plot, panel title, x-axis label, bar colour)
panels = [
    (axes[0, 0], 'original_length', 'Original Text Length Distribution', 'Characters', 'blue'),
    (axes[0, 1], 'cleaned_length', 'Cleaned Text Length Distribution', 'Characters', 'green'),
    (axes[1, 0], 'original_words', 'Original Word Count Distribution', 'Words', 'red'),
    (axes[1, 1], 'cleaned_words', 'Cleaned Word Count Distribution', 'Words', 'orange'),
]

for ax, column, title, xlabel, color in panels:
    ax.hist(df_processed[column], bins=20, alpha=0.7, color=color)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## Save Preprocessed Data

In [None]:
# Destination for the preprocessed dataset
output_file = '../data/preprocessed_reviews.csv'

# Keep just the raw text, the cleaned text and the label; the length and
# word-count helper columns are not written out.
final_df = df_processed.loc[:, ['review', 'cleaned_review', 'sentiment']].copy()

# Write without the index so the CSV holds only the three columns
final_df.to_csv(output_file, index=False)

print(
    f"Preprocessed dataset saved to: {output_file}",
    f"Final dataset shape: {final_df.shape}",
    f"Columns: {final_df.columns.tolist()}",
    "\nSample of preprocessed data:",
    sep="\n",
)
final_df.head()

In [None]:
# Closing recap of what this notebook did, one checklist line per item
print("\n=== STEP 2 COMPLETED ===")

completed_items = (
    "✓ Text cleaning and preprocessing functions created",
    "✓ Applied lowercase conversion",
    "✓ Removed punctuation and special characters",
    "✓ Removed stopwords using NLTK",
    "✓ Applied stemming to reduce words to root forms",
    "✓ Generated text statistics and visualizations",
    "✓ Saved preprocessed dataset for next steps",
    f"✓ Dataset ready with {len(final_df)} cleaned reviews",
)
print(*completed_items, sep="\n")

print("\nNext: Feature extraction using TF-IDF vectorization")