In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK data, but only the resources that are not already
# available locally.
# 'punkt_tab' is listed in addition to 'punkt' because recent NLTK releases
# (3.9+) load the Punkt sentence tokenizer from that resource; without it
# word_tokenize() raises LookupError on a fresh environment.
required_nltk_resources = [
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
]
for resource_path, package_name in required_nltk_resources:
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name, quiet=True)

# Load the dataset
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after it is pointed at a local copy of the CSV.
file_path = r'C:\Users\subha\Documents\m.tech\AIPP\assignment-17\movie_reviews-1.csv'
df = pd.read_csv(file_path)

# Quick inspection of the raw data: size, schema, a preview and null counts.
print("Original dataset shape:", df.shape)
print("\nOriginal columns:", df.columns.tolist())
print("\nFirst few rows of original data:")
print(df.head())
print("\nData types:")
print(df.dtypes)
print("\nMissing values:")
print(df.isnull().sum())


Original dataset shape: (15, 3)

Original columns: ['review_id', 'review_text', 'rating']

First few rows of original data:
   review_id                review_text  rating
0          1      <p>Amazing movie!</p>     8.0
1          2  Terrible acting & plot!!!     2.0
2          3      <p>Amazing movie!</p>     NaN
3          4  Terrible acting & plot!!!     8.0
4          5      <p>Amazing movie!</p>     5.0

Data types:
review_id        int64
review_text     object
rating         float64
dtype: object

Missing values:
review_id      0
review_text    0
rating         2
dtype: int64


In [6]:
# Get stopwords (built once as a set so membership tests are O(1))
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """
    Normalise a raw review into a lowercase, space-separated token string.

    Steps, in order: strip HTML tags, strip URLs, replace every character
    that is not an ASCII letter, digit or whitespace, lowercase, tokenize
    with NLTK, then drop English stopwords and single-character tokens.

    Tags, URLs and punctuation are replaced by a space rather than deleted,
    so neighbouring words are not fused together ("good<br>movie" stays
    "good movie", "well-made" becomes "well made"). It also lets the
    stopword list catch contractions: "don't" becomes "don" + "t", both of
    which are filtered out, instead of the unknown token "dont".

    Args:
        text: The raw review. Any value is accepted; it is converted with
            str() unless pandas considers it missing.

    Returns:
        The cleaned text as a single string, or "" when the input is
        missing (None / NaN).
    """
    if pd.isna(text):
        return ""

    # Convert to string
    text = str(text)

    # Remove HTML tags (replace with a space so adjacent words stay separate)
    text = re.sub(r'<[^>]+>', ' ', text)

    # Remove URLs; 'http\S+' already covers 'https...' links
    text = re.sub(r'http\S+|www\S+', ' ', text)

    # Replace special characters and punctuation with a space,
    # keeping only alphanumeric characters and whitespace
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    # Convert to lowercase (the stopword list is lowercase)
    text = text.lower()

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and single character tokens
    tokens = [word for word in tokens if word not in stop_words and len(word) > 1]

    # Join tokens back; this also collapses the extra spaces introduced above
    return ' '.join(tokens)

# Add a normalised version of every review next to the raw text
print("Cleaning review text...")
df['review_text_cleaned'] = df['review_text'].map(clean_text)

# Show raw and cleaned text side by side for the first ten reviews
print("\nSample of cleaned text:")
print(df.head(10)[['review_text', 'review_text_cleaned']])


Cleaning review text...

Sample of cleaned text:
                 review_text   review_text_cleaned
0      <p>Amazing movie!</p>         amazing movie
1  Terrible acting & plot!!!  terrible acting plot
2      <p>Amazing movie!</p>         amazing movie
3  Terrible acting & plot!!!  terrible acting plot
4      <p>Amazing movie!</p>         amazing movie
5  Terrible acting & plot!!!  terrible acting plot
6      <p>Amazing movie!</p>         amazing movie
7  Terrible acting & plot!!!  terrible acting plot
8      <p>Amazing movie!</p>         amazing movie
9  Terrible acting & plot!!!  terrible acting plot


In [8]:
# Handle missing values in numerical columns
print("Handling missing values in numerical columns...")

# Identify numerical columns (excluding review_id if it's just an identifier)
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if 'review_id' in numerical_cols:
    numerical_cols.remove('review_id')

print("Numerical columns found:", numerical_cols)

# Fill missing values in numerical columns with median (more robust than mean)
for col in numerical_cols:
    # Count the gaps BEFORE filling; counting afterwards always reports 0.
    missing_count = df[col].isnull().sum()
    if missing_count == 0:
        continue

    median_value = df[col].median()
    if pd.isna(median_value):
        # Every value is missing, so there is no median to impute with.
        print(f"Skipped '{col}': all {missing_count} values are missing, no median available")
        continue

    # Assign the result back to the frame instead of calling
    # df[col].fillna(..., inplace=True): that form is chained assignment,
    # which is deprecated and does not update `df` under Copy-on-Write.
    df[col] = df[col].fillna(median_value)
    print(f"Filled {missing_count} missing values in '{col}' with median: {median_value}")

print("\nMissing values after handling:")
print(df[numerical_cols].isnull().sum())


Handling missing values in numerical columns...
Numerical columns found: ['rating']

Missing values after handling:
rating    0
dtype: int64


In [10]:
# Convert date/timestamp columns to datetime format and extract features
print("Checking for date/timestamp columns...")

# Identifier and free-text columns are never inspected for date-like values.
excluded_cols = {'review_id', 'review_text', 'review_text_cleaned'}

# Names of the feature columns this cell creates (<col>_hour, ...). They are
# skipped during detection: their names contain the source column's keyword
# (e.g. 'timestamp_hour' contains 'time'), so a re-run of the cell would
# otherwise try to convert its own output to datetime.
derived_suffixes = ('_hour', '_weekday', '_weekday_name')
derived_cols = {f'{base}{suffix}' for base in df.columns for suffix in derived_suffixes}

# Value patterns that mark a column as date-like (defined once, outside the loop)
date_patterns = [
    r'\d{4}-\d{2}-\d{2}',  # YYYY-MM-DD
    r'\d{2}/\d{2}/\d{4}',  # MM/DD/YYYY
    r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',  # YYYY-MM-DD HH:MM:SS
]
# Column-name fragments that might indicate dates
date_keywords = ['date', 'time', 'timestamp', 'created', 'updated']

# Collect candidates in column order so the output is deterministic
date_columns = []
for col in df.columns:
    if col in derived_cols:
        continue

    # Name-based detection
    looks_like_date = any(keyword in col.lower() for keyword in date_keywords)

    # Value-based detection on the first few non-null values
    if not looks_like_date and col not in excluded_cols:
        sample_values = df[col].dropna().head(10)
        if len(sample_values) > 0:
            sample_text = sample_values.astype(str)
            looks_like_date = any(
                sample_text.str.contains(pattern, regex=True).any()
                for pattern in date_patterns
            )

    if looks_like_date:
        date_columns.append(col)

if date_columns:
    print(f"Found potential date columns: {date_columns}")
    for col in date_columns:
        try:
            parsed = pd.to_datetime(df[col], errors='coerce')
            # errors='coerce' turns every unparseable value into NaT. If nothing
            # parsed, the column was a false positive (e.g. 'updated_by'), so
            # leave the original data untouched instead of wiping it.
            if parsed.notna().sum() == 0 and df[col].notna().sum() > 0:
                print(f"Could not convert '{col}' to datetime: no values could be parsed")
                continue

            df[col] = parsed
            # Extract hour if it's a datetime
            df[f'{col}_hour'] = parsed.dt.hour
            # Extract weekday (0=Monday, 6=Sunday)
            df[f'{col}_weekday'] = parsed.dt.dayofweek
            # Extract day name
            df[f'{col}_weekday_name'] = parsed.dt.day_name()
            print(f"Converted '{col}' to datetime and extracted hour and weekday features")
        except Exception as e:
            # Best effort: report the problem and carry on with the next column
            print(f"Could not convert '{col}' to datetime: {e}")
else:
    print("No date/timestamp columns found in the dataset")


Checking for date/timestamp columns...
No date/timestamp columns found in the dataset


In [12]:
# Detect and remove duplicate reviews
print("Detecting duplicate reviews...")

# Rows whose raw text repeats an earlier row (the first occurrence is not flagged)
duplicate_text = df.duplicated(subset='review_text', keep='first')
print(f"Found {duplicate_text.sum()} exact duplicate reviews based on review_text")

# Same check on the cleaned text, which ignores markup and punctuation differences
duplicate_cleaned = df.duplicated(subset='review_text_cleaned', keep='first')
print(f"Found {duplicate_cleaned.sum()} duplicate reviews based on cleaned text")

# Keep only the first occurrence of each cleaned text
# (more effective for detecting spam than the raw-text comparison)
df_cleaned = df.loc[~duplicate_cleaned].copy()

print(f"\nOriginal shape: {df.shape}")
print(f"After removing duplicates: {df_cleaned.shape}")
print(f"Removed {len(df) - len(df_cleaned)} duplicate reviews")

# Detect potential spam reviews
# Spam indicators: very short reviews and one word dominating the text.
def detect_spam(text, min_words=3, max_repeat_ratio=0.5):
    """
    Flag a review as potential spam using two simple heuristics.

    A review is flagged when it is missing or empty, when it has fewer than
    `min_words` whitespace-separated words, or when its most frequent word
    accounts for more than `max_repeat_ratio` of all words.

    Args:
        text: The review text; converted with str() unless it is missing.
        min_words: Minimum number of words a review needs to not be
            considered too short. Defaults to 3.
        max_repeat_ratio: Largest share of the words (0-1) a single word may
            take up before the review counts as repetitive. Defaults to 0.5.

    Returns:
        True if the review looks like spam, False otherwise.
    """
    if pd.isna(text) or text == "":
        return True

    # Split once and reuse the list for both checks
    words = str(text).split()

    # Too short. The explicit emptiness check keeps whitespace-only text
    # flagged (and max() below safe) even when min_words is set to 0.
    if not words or len(words) < min_words:
        return True

    # Excessive repetition: the same word making up most of the review
    word_counts = {}
    for word in words:
        word_counts[word] = word_counts.get(word, 0) + 1

    return max(word_counts.values()) > len(words) * max_repeat_ratio

# Flag likely spam, judged on the cleaned text of each remaining review
df_cleaned['is_spam'] = df_cleaned['review_text_cleaned'].apply(detect_spam)
spam_count = df_cleaned['is_spam'].sum()
print(f"\nDetected {spam_count} potential spam reviews")

# Keep the non-spam rows and discard the helper flag column in one step
df_final = df_cleaned.loc[~df_cleaned['is_spam']].drop(columns=['is_spam'])

print(f"After removing spam: {df_final.shape}")
print(f"Total removed (duplicates + spam): {len(df) - len(df_final)} reviews")


Detecting duplicate reviews...
Found 13 exact duplicate reviews based on review_text
Found 13 duplicate reviews based on cleaned text

Original shape: (15, 4)
After removing duplicates: (2, 4)
Removed 13 duplicate reviews

Detected 1 potential spam reviews
After removing spam: (1, 4)
Total removed (duplicates + spam): 14 reviews


In [13]:
# Display the processed dataset
print("=" * 80)
print("PROCESSED DATASET")
print("=" * 80)
print(f"\nFinal dataset shape: {df_final.shape}")
print(f"\nColumns: {df_final.columns.tolist()}")
print("\n" + "=" * 80)
print("First few rows of processed dataset:")
print("=" * 80)
print(df_final.head(10))

print("\n" + "=" * 80)
print("Dataset Summary:")
print("=" * 80)
# info() writes its report to stdout itself and returns None, so it must not
# be wrapped in print(); doing so appends a stray "None" line to the output.
df_final.info()

print("\n" + "=" * 80)
print("Missing values in final dataset:")
print("=" * 80)
print(df_final.isnull().sum())

# Keep an independent copy of the result under a stable name for later use
processed_dataset = df_final.copy()
print("\n" + "=" * 80)
print("Dataset cleaning completed successfully!")
print("=" * 80)


PROCESSED DATASET

Final dataset shape: (1, 4)

Columns: ['review_id', 'review_text', 'rating', 'review_text_cleaned']

First few rows of processed dataset:
   review_id                review_text  rating   review_text_cleaned
1          2  Terrible acting & plot!!!     2.0  terrible acting plot

Dataset Summary:
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, 1 to 1
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   review_id            1 non-null      int64  
 1   review_text          1 non-null      object 
 2   rating               1 non-null      float64
 3   review_text_cleaned  1 non-null      object 
dtypes: float64(1), int64(1), object(2)
memory usage: 40.0+ bytes
None

Missing values in final dataset:
review_id              0
review_text            0
rating                 0
review_text_cleaned    0
dtype: int64

Dataset cleaning completed successfully!
