# Movie Dataset - Exploratory Data Analysis (EDA)




## 1. Data Loading and Initial Exploration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print("Loading dataset...")
# The whole CSV is read: no `nrows` limit is applied here. (An earlier note
# described a 10,000-row sample of a ~946K-row dataset; pass nrows=10_000 to
# read_csv if a sample is what you want.)
# latin-1 encoding is used to handle special characters; malformed rows are
# skipped instead of aborting the load.
df = pd.read_csv('data_movies_clean.csv', on_bad_lines='skip', encoding='latin-1', low_memory=False)

# Normalise the headers BEFORE any column is referenced by name: keep only the
# text in front of the first ';' so a header such as 'title;;' becomes 'title'.
df.columns = [col.split(';')[0] if isinstance(col, str) else col for col in df.columns]

# CLEANING STEP: Remove rows where the title is missing.
# The hashing step stringifies titles, so every missing title would become the
# same "nan" key and pile up on a single hash ID.
df = df.dropna(subset=['title'])

# Coerce numeric columns to fix mixed types; unparseable values become NaN.
# 'popularity' is included because later cells plot it, average it and rank by it.
numeric_cols = ['budget', 'revenue', 'runtime', 'vote_average', 'popularity']
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
print(f"Loaded {len(df)} movies (after cleaning missing titles)")


### Dataset Overview

In [None]:
# Structural overview: dimensions, then column names and dtypes, then a preview
print("Dataset Shape:", df.shape)
for heading, payload in (
    ("\nColumn Names:", df.columns.tolist()),
    ("\nData Types:", df.dtypes),
):
    print(heading)
    print(payload)
print("\nFirst 5 rows:")
# Kept as the final expression so the notebook renders the preview table
df.head()

In [None]:
# Descriptive statistics for the DataFrame (describe() defaults)
print("Summary Statistics:")
# Last expression of the cell, so the notebook displays the resulting table
df.describe()

In [None]:
# Per-column count and percentage of missing values
print("Missing Values:")
missing = df.isna().sum()
missing_pct = missing / len(df) * 100
missing_df = pd.DataFrame({'Missing Count': missing, 'Percentage': missing_pct})
# Show only the columns that actually have gaps, worst first
has_gaps = missing_df['Missing Count'] > 0
missing_df.loc[has_gaps].sort_values('Missing Count', ascending=False)

## 2. Key Columns for DHT Implementation

In [None]:
# Describe which dataset columns play which role in the DHT
dht_column_notes = (
    "Key Columns for DHT:",
    "\n1. PRIMARY KEY: 'title' - Used as DHT key for hashing",
    "\n2. SEARCHABLE ATTRIBUTES (B+ tree indexed):",
    "   - popularity (numeric)",
    "   - vote_average (numeric)",
    "   - release_date (temporal)",
    "\n3. METADATA ATTRIBUTES:",
    "   - budget, revenue, runtime",
    "   - genre_names, production_company_names",
    "   - original_language, origin_country",
)
# One print of the joined lines emits the same text as one print per line
print("\n".join(dht_column_notes))

## 3. Data Distribution Analysis

### 3.1 Popularity Distribution

In [None]:
# Popularity distribution: the same histogram with linear and log-scaled counts
if 'popularity' not in df.columns:
    print("'popularity' column not found")
else:
    popularity = df['popularity']
    observed = popularity.dropna()

    fig, (ax_linear, ax_log) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: raw frequencies
    ax_linear.hist(observed, bins=50, edgecolor='black', alpha=0.7)
    ax_linear.set_xlabel('Popularity Score')
    ax_linear.set_ylabel('Frequency')
    ax_linear.set_title('Popularity Distribution (Full Range)')
    ax_linear.grid(True, alpha=0.3)

    # Right panel: identical bins, log-scaled y axis so sparse bins stay visible
    ax_log.hist(observed, bins=50, edgecolor='black', alpha=0.7)
    ax_log.set_xlabel('Popularity Score')
    ax_log.set_ylabel('Frequency (log scale)')
    ax_log.set_title('Popularity Distribution (Log Scale)')
    ax_log.set_yscale('log')
    ax_log.grid(True, alpha=0.3)

    fig.tight_layout()
    plt.show()

    print("Popularity Statistics:")
    for label, value in (
        ("Mean", popularity.mean()),
        ("Median", popularity.median()),
        ("Min", popularity.min()),
        ("Max", popularity.max()),
    ):
        print(f"  {label}: {value:.2f}")

### 3.2 Rating Distribution

In [None]:
# Rating distribution: histogram of vote_average with the mean marked
if 'vote_average' not in df.columns:
    print("'vote_average' column not found")
else:
    ratings = df['vote_average']
    mean_rating = ratings.mean()

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.hist(ratings.dropna(), bins=30, edgecolor='black', alpha=0.7, color='coral')
    ax.set_xlabel('Average Rating')
    ax.set_ylabel('Frequency')
    ax.set_title('Movie Rating Distribution')
    # Dashed vertical line at the mean; its value is shown in the legend
    ax.axvline(mean_rating, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_rating:.2f}')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.show()

    print("Rating Statistics:")
    print(f"  Mean: {mean_rating:.2f}")
    print(f"  Median: {ratings.median():.2f}")
    print(f"  Std Dev: {ratings.std():.2f}")

### 3.3 Release Year Analysis

In [None]:
# Derive a 'year' column from release_date and chart the number of movies per year
if 'release_date' not in df.columns:
    print("'release_date' column not found")
else:
    # Unparseable dates are coerced to NaT, which yields a missing year
    release_dates = pd.to_datetime(df['release_date'], errors='coerce')
    df['year'] = release_dates.dt.year

    # Movies per year, in chronological order
    year_counts = df['year'].value_counts().sort_index()

    fig, ax = plt.subplots(figsize=(14, 5))
    ax.plot(year_counts.index, year_counts.values, linewidth=2, color='steelblue')
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Movies')
    ax.set_title('Movie Production Over Time')
    ax.grid(True, alpha=0.3)
    plt.show()

    earliest, latest = df['year'].min(), df['year'].max()
    print(f"Year Range: {earliest:.0f} - {latest:.0f}")
    busiest_year = year_counts.idxmax()
    print(f"Most productive year: {busiest_year:.0f} ({year_counts.max()} movies)")

### 3.4 Budget and Revenue Analysis

In [None]:
# Budget vs revenue on log-log axes, followed by return-on-investment figures
if not ('budget' in df.columns and 'revenue' in df.columns):
    print("'budget' or 'revenue' columns not found")
else:
    # Keep rows where both amounts are strictly positive; comparisons with NaN
    # are False, so rows with missing values drop out as well.
    has_both = (df['budget'] > 0) & (df['revenue'] > 0)
    df_filtered = df[has_both].copy()

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(df_filtered['budget'], df_filtered['revenue'], alpha=0.5, s=30)
    ax.set_xlabel('Budget ($)')
    ax.set_ylabel('Revenue ($)')
    ax.set_title('Budget vs Revenue')
    ax.set_xscale('log')
    ax.set_yscale('log')

    # Break-even diagonal (revenue == budget), drawn up to the largest amount seen
    max_val = max(df_filtered['budget'].max(), df_filtered['revenue'].max())
    ax.plot([1, max_val], [1, max_val], 'r--', linewidth=2, label='Break-even')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.show()

    # ROI as a percentage of budget
    df_filtered['roi'] = (df_filtered['revenue'] - df_filtered['budget']) / df_filtered['budget'] * 100
    roi = df_filtered['roi']
    profitable = (roi > 0).sum()
    total = len(df_filtered)
    print("\nROI Statistics (non-zero budget/revenue):")
    print(f"  Average ROI: {roi.mean():.1f}%")
    print(f"  Median ROI: {roi.median():.1f}%")
    print(f"  Profitable movies: {profitable} / {total} ({profitable / total * 100:.1f}%)")

## 4. DHT Hash Distribution Simulation

In [None]:
# Simulate DHT hashing
import hashlib

def hash_title(title, m_bits=160):
    """Map a movie title to an identifier on an ``m_bits``-bit hash ring.

    The title is converted to ``str``, stripped of surrounding whitespace and
    lower-cased, then UTF-8 encoded and hashed with SHA-1. The digest, read as
    an unsigned big-endian integer, is reduced modulo ``2 ** m_bits``.

    Args:
        title: Value to hash; anything accepted by ``str()``.
        m_bits: Number of bits in the identifier space (default 160).

    Returns:
        The integer ring position ``sha1(title) mod 2 ** m_bits``.
    """
    canonical = str(title).strip().lower().encode('utf-8')
    # Big-endian interpretation of the raw digest equals int(hexdigest, 16)
    digest_value = int.from_bytes(hashlib.sha1(canonical).digest(), 'big')
    return digest_value % (2 ** m_bits)

# Hash every title onto a 32-bit ring (small enough to visualise) in a copy of df
sample_df = df.copy()
sample_df['hash_id'] = sample_df['title'].map(lambda movie_title: hash_title(movie_title, m_bits=32))

largest_hash_id = 2**32 - 1
print(f"Hashed {len(sample_df)} movie titles")
print(f"Hash ID range: 0 to {largest_hash_id}")
print("\nSample hashes:")
print(sample_df[['title', 'hash_id']].head(10))

In [None]:
# Visualise how the hashed titles spread over the 32-bit identifier space
fig, (ax_hist, ax_sorted) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: histogram of the hash IDs
ax_hist.hist(sample_df['hash_id'], bins=50, edgecolor='black', alpha=0.7, color='purple')
ax_hist.set_xlabel('Hash ID (32-bit)')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Hashed Movie Titles')
ax_hist.annotate(
    'Cleaning NaN titles ensures uniformity',
    xy=(0.5, 0.95),
    xycoords='axes fraction',
    ha='center',
    color='darkred',
)
ax_hist.grid(True, alpha=0.3)

# Right panel: hash IDs in ascending order; an even spread traces a straight line
sorted_hashes = sorted(sample_df['hash_id'])
ax_sorted.plot(range(len(sorted_hashes)), sorted_hashes, linewidth=1)
ax_sorted.set_xlabel('Movie Index (sorted)')
ax_sorted.set_ylabel('Hash ID')
ax_sorted.set_title('Sorted Hash IDs (Should be Linear for Uniform Distribution)')
ax_sorted.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

# NOTE(review): this message is printed unconditionally; no uniformity statistic is computed.
print("\nHash distribution is uniform")

## 5. Top Movies Analysis

In [None]:
# Horizontal bar chart and table of the 15 highest-popularity movies
if all(column in df.columns for column in ('popularity', 'title')):
    top_popular = df.nlargest(15, 'popularity')[['title', 'popularity']]
    bar_positions = range(len(top_popular))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(bar_positions, top_popular['popularity'], color='skyblue', edgecolor='black')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_popular['title'], fontsize=10)
    ax.set_xlabel('Popularity Score')
    ax.set_title('Top 15 Most Popular Movies')
    # Flip the y axis so the most popular movie sits at the top
    ax.invert_yaxis()
    ax.grid(True, alpha=0.3, axis='x')
    fig.tight_layout()
    plt.show()

    print("\nTop 15 Most Popular Movies:")
    print(top_popular.to_string(index=False))

## 6. Data Suitability for DHT

In [None]:
# Text report on how well the dataset fits a title-keyed DHT
banner = "=" * 70
print(banner)
print("DATASET SUITABILITY FOR DHT IMPLEMENTATION")
print(banner)

total_movies = len(df)
unique_titles = df['title'].nunique()
print("\n PRIMARY KEY (title):")
print(f"  - Total movies: {total_movies:,}")
print(f"  - Unique titles: {unique_titles:,}")
print(f"  - Uniqueness: {unique_titles / total_movies * 100:.1f}%")

if 'popularity' in df.columns:
    popularity = df['popularity']
    print("\n  SEARCHABLE ATTRIBUTE (popularity):")
    print(f"  - Non-null values: {popularity.notna().sum():,}")
    print(f"  - Range: {popularity.min():.2f} to {popularity.max():.2f}")
    print("  - Good for range queries: YES")

# NOTE(review): the two verdicts below are fixed strings, not measurements, and
# the hash_id column in sample_df was produced with m_bits=32 rather than 160 -
# confirm the wording before it goes into a report.
print("\n HASH DISTRIBUTION:")
print(f"  - Titles hashed: {len(sample_df):,}")
print("  - Distribution: UNIFORM (good for load balancing)")
print("  - Hash collisions: MINIMAL (SHA-1 160-bit)")

## 7. Summary Statistics for Report


In [None]:
# Collect headline figures for the experimental report and print them as a table
summary = dict([
    ('Total Movies (sample)', len(df)),
    ('Unique Titles', df['title'].nunique()),
    ('Columns', len(df.columns)),
])

# Optional entries, added only when the underlying column exists
if 'popularity' in df.columns:
    avg_popularity = df['popularity'].mean()
    summary['Avg Popularity'] = f"{avg_popularity:.2f}"

if 'year' in df.columns:
    years = df['year']
    summary['Year Range'] = f"{years.min():.0f} - {years.max():.0f}"

rule = "-" * 50
print("\nDataset Summary for Report:")
print(rule)
for label in summary:
    print(f"{label:<30}: {summary[label]}")
print(rule)

## 8. DHT Performance & Experimental Evaluation


### 8.1 Overall Latency Comparison (Chord vs Pastry)


In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os

# Show the pre-rendered Chord vs Pastry latency chart, if it exists on disk
comparison_plot = 'instances/performance_comparison_bars.png'
if not os.path.exists(comparison_plot):
    print(f"Performance plot '{comparison_plot}' not found.")
else:
    fig, ax = plt.subplots(figsize=(15, 10))
    ax.imshow(mpimg.imread(comparison_plot))
    # The image already carries its own axes, so hide matplotlib's
    ax.axis('off')
    ax.set_title("Chord vs Pastry: Operation Latency Comparison", fontsize=16)
    plt.show()

### 8.2 Scaling Analysis


In [None]:
# Show the pre-rendered node-join and lookup scaling charts side by side.
# A missing image is reported by name and its panel is hidden, instead of
# leaving an empty framed axes in the figure with no explanation.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

scaling_panels = (
    (ax1, 'instances/scaling_node_join.png', "Node Join Scaling"),
    (ax2, 'instances/scaling_lookup.png', "Lookup Latency Scaling"),
)
for panel_ax, image_path, panel_title in scaling_panels:
    if os.path.exists(image_path):
        panel_ax.imshow(mpimg.imread(image_path))
        panel_ax.set_title(panel_title)
    else:
        print(f"Performance plot '{image_path}' not found.")
    # Hidden in both cases: the images carry their own axes, and an absent
    # image should not leave a blank frame behind.
    panel_ax.axis('off')

plt.tight_layout()
plt.show()

### 8.3 Concurrency & K-Parameter Evaluation


In [None]:
# Show the pre-rendered concurrency charts (total time and average latency vs K)
# side by side. A missing image is reported by name and its panel is hidden,
# instead of leaving an empty framed axes in the figure with no explanation.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

concurrency_panels = (
    (ax1, 'instances/concurrency_scaling.png', "Total Time vs K (Concurrent Lookups)"),
    (ax2, 'instances/concurrency_avg_latency.png', "Average Latency per Movie vs K"),
)
for panel_ax, image_path, panel_title in concurrency_panels:
    if os.path.exists(image_path):
        panel_ax.imshow(mpimg.imread(image_path))
        panel_ax.set_title(panel_title)
    else:
        print(f"Performance plot '{image_path}' not found.")
    # Hidden in both cases: the images carry their own axes, and an absent
    # image should not leave a blank frame behind.
    panel_ax.axis('off')

plt.tight_layout()
plt.show()