In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Hybrid Recommendation System - Data Exploration\n",
    "\n",
    "**Author:** DevanshSrajput  \n",
    "**Date:** 2025-06-17  \n",
    "**Description:** Comprehensive exploration of the MovieLens dataset for building hybrid recommendation systems.\n",
    "\n",
    "## Table of Contents\n",
    "1. [Data Loading and Overview](#data-loading)\n",
    "2. [Rating Distribution Analysis](#rating-analysis)\n",
    "3. [User Behavior Patterns](#user-behavior)\n",
    "4. [Movie Popularity Analysis](#movie-analysis)\n",
    "5. [Genre Analysis](#genre-analysis)\n",
    "6. [Temporal Patterns](#temporal-analysis)\n",
    "7. [Sparsity and Cold Start Analysis](#sparsity-analysis)\n",
    "8. [Data Quality Assessment](#data-quality)\n",
    "9. [Insights and Recommendations](#insights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import plotly.express as px\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "import warnings\n",
    "from datetime import datetime\n",
    "import os\n",
    "import sys\n",
    "\n",
    "# Add src to path\n",
    "sys.path.append('../src')\n",
    "from data_preprocessing import DataPreprocessor\n",
    "from utils import calculate_sparsity\n",
    "\n",
    "# Notebook-wide settings: mute warnings, choose the plot theme and palette,\n",
    "# and let pandas show every column (up to 100 rows) when rendering frames.\n",
    "warnings.filterwarnings('ignore')\n",
    "plt.style.use('seaborn-v0_8-darkgrid')\n",
    "sns.set_palette('husl')\n",
    "for option_name, option_value in (('display.max_columns', None), ('display.max_rows', 100)):\n",
    "    pd.set_option(option_name, option_value)\n",
    "\n",
    "# Run banner; the timestamp is taken at the moment this cell executes.\n",
    "run_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')\n",
    "banner_lines = [\n",
    "    \"📊 Data Exploration Notebook\",\n",
    "    \"👤 Author: DevanshSrajput\",\n",
    "    f\"📅 Date: {run_stamp}\",\n",
    "    \"=\" * 60,\n",
    "]\n",
    "print(\"\\n\".join(banner_lines))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<a id=\"data-loading\"></a>\n",
    "## 1. Data Loading and Overview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the project's preprocessing pipeline on the '100k' dataset variant\n",
    "print(\"📂 Loading MovieLens dataset...\")\n",
    "preprocessor = DataPreprocessor()\n",
    "data = preprocessor.preprocess_all('100k')\n",
    "\n",
    "# Unpack the frames used throughout the notebook from the returned container\n",
    "ratings_df, movies_df, train_ratings, test_ratings, user_item_matrix = (\n",
    "    data[key]\n",
    "    for key in ('all_ratings', 'movies', 'train_ratings', 'test_ratings', 'user_item_matrix')\n",
    ")\n",
    "\n",
    "print(\"✅ Data loaded successfully!\")\n",
    "print(\"📊 Dataset Statistics:\")\n",
    "\n",
    "# Headline figures, printed one bullet per line\n",
    "rating_values = ratings_df['rating']\n",
    "summary_bullets = [\n",
    "    f\"Total users: {ratings_df['user_id'].nunique():,}\",\n",
    "    f\"Total movies: {ratings_df['item_id'].nunique():,}\",\n",
    "    f\"Total ratings: {len(ratings_df):,}\",\n",
    "    f\"Rating range: {rating_values.min()} - {rating_values.max()}\",\n",
    "    f\"Average rating: {rating_values.mean():.2f}\",\n",
    "    f\"Data sparsity: {calculate_sparsity(user_item_matrix.values):.2%}\",\n",
    "    f\"Train/Test split: {len(train_ratings):,} / {len(test_ratings):,}\",\n",
    "]\n",
    "for bullet in summary_bullets:\n",
    "    print(f\"   • {bullet}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first rows of each table (display() gives the rich HTML rendering)\n",
    "print(\"📋 Sample Ratings Data:\")\n",
    "display(ratings_df.head(10))\n",
    "\n",
    "print(\"\\n🎬 Sample Movies Data:\")\n",
    "display(movies_df.head(10))\n",
    "\n",
    "# DataFrame.info() writes its report to stdout itself and returns None, so it is\n",
    "# called directly; wrapping it in print() would append a stray 'None' line.\n",
    "print(\"\\n📊 Data Types and Info:\")\n",
    "ratings_df.info()\n",
    "print(\"\\n\")\n",
    "movies_df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<a id=\"rating-analysis\"></a>\n",
    "## 2. Rating Distribution Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rating distribution analysis: four views of the raw rating values\n",
    "fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n",
    "fig.suptitle('📊 Rating Distribution Analysis', fontsize=16, fontweight='bold')\n",
    "\n",
    "# Histogram of ratings. Unit-wide bins with edges on half-integers keep every bar\n",
    "# centred on the rating value it counts. (With bins=5 on a 1-5 scale the edges land\n",
    "# on 1.0, 1.8, 2.6, ... and the bars drift away from the integer tick labels.)\n",
    "rating_bins = np.arange(ratings_df['rating'].min() - 0.5, ratings_df['rating'].max() + 1.5, 1)\n",
    "axes[0,0].hist(ratings_df['rating'], bins=rating_bins, alpha=0.7, color='skyblue', edgecolor='black')\n",
    "axes[0,0].set_title('Rating Distribution (Histogram)')\n",
    "axes[0,0].set_xlabel('Rating')\n",
    "axes[0,0].set_ylabel('Frequency')\n",
    "axes[0,0].grid(True, alpha=0.3)\n",
    "\n",
    "# Bar plot of rating percentages (share of all ratings per rating value)\n",
    "rating_counts = ratings_df['rating'].value_counts().sort_index()\n",
    "rating_percentages = (rating_counts / len(ratings_df)) * 100\n",
    "axes[0,1].bar(rating_percentages.index, rating_percentages.values, alpha=0.7, color='lightcoral')\n",
    "axes[0,1].set_title('Rating Distribution (Percentage)')\n",
    "axes[0,1].set_xlabel('Rating')\n",
    "axes[0,1].set_ylabel('Percentage (%)')\n",
    "axes[0,1].grid(True, alpha=0.3)\n",
    "\n",
    "# Add percentage labels just above each bar\n",
    "for rating, pct in rating_percentages.items():\n",
    "    axes[0,1].text(rating, pct + 0.5, f'{pct:.1f}%', ha='center', va='bottom')\n",
    "\n",
    "# Box plot\n",
    "axes[1,0].boxplot(ratings_df['rating'], vert=True, patch_artist=True,\n",
    "                  boxprops=dict(facecolor='lightgreen', alpha=0.7))\n",
    "axes[1,0].set_title('Rating Distribution (Box Plot)')\n",
    "axes[1,0].set_ylabel('Rating')\n",
    "axes[1,0].grid(True, alpha=0.3)\n",
    "\n",
    "# Empirical cumulative distribution: sorted ratings against their rank fraction\n",
    "sorted_ratings = np.sort(ratings_df['rating'])\n",
    "cumulative_prob = np.arange(1, len(sorted_ratings) + 1) / len(sorted_ratings)\n",
    "axes[1,1].plot(sorted_ratings, cumulative_prob, marker='o', markersize=2, alpha=0.7)\n",
    "axes[1,1].set_title('Cumulative Distribution Function')\n",
    "axes[1,1].set_xlabel('Rating')\n",
    "axes[1,1].set_ylabel('Cumulative Probability')\n",
    "axes[1,1].grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Statistical summary\n",
    "print(\"📈 Rating Statistics:\")\n",
    "rating_stats = ratings_df['rating'].describe()\n",
    "for stat, value in rating_stats.items():\n",
    "    print(f\"   • {stat.capitalize()}: {value:.3f}\")\n",
    "\n",
    "print(\"\\n📊 Rating Distribution:\")\n",
    "for rating, count in rating_counts.items():\n",
    "    percentage = (count / len(ratings_df)) * 100\n",
    "    print(f\"   • {rating} stars: {count:,} ratings ({percentage:.1f}%)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<a id=\"user-behavior\"></a>\n",
    "## 3. User Behavior Patterns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# User behavior analysis: per-user rating counts and summary statistics\n",
    "user_stats = ratings_df.groupby('user_id').agg({\n",
    "    'rating': ['count', 'mean', 'std', 'min', 'max']\n",
    "}).round(3)\n",
    "user_stats.columns = ['num_ratings', 'avg_rating', 'rating_std', 'min_rating', 'max_rating']\n",
    "# The sample std is NaN for a user with a single rating; treat that as zero spread\n",
    "user_stats['rating_std'] = user_stats['rating_std'].fillna(0)\n",
    "\n",
    "# User activity categorization (right-closed bins: (0, 5], (5, 20], (20, 50], (50, inf))\n",
    "user_stats['activity_level'] = pd.cut(\n",
    "    user_stats['num_ratings'],\n",
    "    bins=[0, 5, 20, 50, float('inf')],\n",
    "    labels=['Low (≤5)', 'Medium (6-20)', 'High (21-50)', 'Very High (>50)']\n",
    ")\n",
    "\n",
    "print(\"👥 User Activity Analysis:\")\n",
    "activity_dist = user_stats['activity_level'].value_counts()\n",
    "for level, count in activity_dist.items():\n",
    "    percentage = (count / len(user_stats)) * 100\n",
    "    print(f\"   • {level}: {count:,} users ({percentage:.1f}%)\")\n",
    "\n",
    "# Visualizations\n",
    "fig, axes = plt.subplots(2, 3, figsize=(18, 12))\n",
    "fig.suptitle('👥 User Behavior Analysis', fontsize=16, fontweight='bold')\n",
    "\n",
    "# Top row: three histograms sharing one layout, drawn from a spec table instead of\n",
    "# three copy-pasted blocks.\n",
    "# (axis, column, bins, colour, title, x-label, mean label format, draw median line?)\n",
    "hist_specs = [\n",
    "    (axes[0,0], 'num_ratings', 50, 'lightblue', 'Distribution of Ratings per User', 'Number of Ratings', '.1f', True),\n",
    "    (axes[0,1], 'avg_rating', 30, 'lightgreen', 'Distribution of Average User Ratings', 'Average Rating', '.2f', False),\n",
    "    (axes[0,2], 'rating_std', 30, 'lightcoral', 'User Rating Variability', 'Rating Standard Deviation', '.2f', False),\n",
    "]\n",
    "for ax, column, n_bins, colour, title, xlabel, mean_fmt, show_median in hist_specs:\n",
    "    values = user_stats[column]\n",
    "    ax.hist(values, bins=n_bins, alpha=0.7, color=colour, edgecolor='black')\n",
    "    ax.axvline(values.mean(), color='red', linestyle='--', linewidth=2,\n",
    "               label=f'Mean: {values.mean():{mean_fmt}}')\n",
    "    if show_median:\n",
    "        ax.axvline(values.median(), color='orange', linestyle='--', linewidth=2,\n",
    "                   label=f'Median: {values.median():.1f}')\n",
    "    ax.set_title(title)\n",
    "    ax.set_xlabel(xlabel)\n",
    "    ax.set_ylabel('Number of Users')\n",
    "    ax.legend()\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "# Activity level pie chart. pd.cut yields a categorical, so value_counts() also\n",
    "# reports levels nobody falls into (count 0); those are dropped here because\n",
    "# zero-width wedges only add overlapping '0.0%' labels.\n",
    "observed_activity = activity_dist[activity_dist > 0]\n",
    "axes[1,0].pie(observed_activity.values, labels=observed_activity.index, autopct='%1.1f%%', startangle=90)\n",
    "axes[1,0].set_title('User Activity Levels')\n",
    "\n",
    "# Correlation: activity vs average rating\n",
    "axes[1,1].scatter(user_stats['num_ratings'], user_stats['avg_rating'], alpha=0.6, s=20)\n",
    "axes[1,1].set_title('User Activity vs Average Rating')\n",
    "axes[1,1].set_xlabel('Number of Ratings')\n",
    "axes[1,1].set_ylabel('Average Rating')\n",
    "axes[1,1].grid(True, alpha=0.3)\n",
    "\n",
    "# Pearson correlation coefficient, shown in the top-left corner of the scatter plot\n",
    "correlation = user_stats['num_ratings'].corr(user_stats['avg_rating'])\n",
    "axes[1,1].text(0.05, 0.95, f'Correlation: {correlation:.3f}', transform=axes[1,1].transAxes,\n",
    "               bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))\n",
    "\n",
    "# Rating range per user. Unit-wide bins centred on each whole-number range avoid the\n",
    "# artificial empty bin that an arbitrary bin count produces on discrete values.\n",
    "user_stats['rating_range'] = user_stats['max_rating'] - user_stats['min_rating']\n",
    "range_bins = np.arange(-0.5, user_stats['rating_range'].max() + 1.5, 1)\n",
    "axes[1,2].hist(user_stats['rating_range'], bins=range_bins, alpha=0.7, color='gold', edgecolor='black')\n",
    "axes[1,2].set_title('User Rating Range Distribution')\n",
    "axes[1,2].set_xlabel('Rating Range (Max - Min)')\n",
    "axes[1,2].set_ylabel('Number of Users')\n",
    "axes[1,2].grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "print(\"\\n📊 User Behavior Statistics:\")\n",
    "print(user_stats.describe())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<a id=\"movie-analysis\"></a>\n",
    "## 4. Movie Popularity Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Movie popularity analysis: per-movie rating counts and summary statistics\n",
    "movie_stats = ratings_df.groupby('item_id').agg({\n",
    "    'rating': ['count', 'mean', 'std', 'min', 'max']\n",
    "}).round(3)\n",
    "movie_stats.columns = ['num_ratings', 'avg_rating', 'rating_std', 'min_rating', 'max_rating']\n",
    "# The sample std is NaN for a movie with a single rating; treat that as zero spread\n",
    "movie_stats['rating_std'] = movie_stats['rating_std'].fillna(0)\n",
    "\n",
    "# Merge with movie information (left join: rated movies without metadata are kept)\n",
    "movie_analysis = movie_stats.merge(\n",
    "    movies_df[['item_id', 'title', 'genres', 'year']],\n",
    "    left_index=True, right_on='item_id', how='left'\n",
    ")\n",
    "\n",
    "# Movie popularity categorization (right-closed bins on the rating count)\n",
    "movie_analysis['popularity_level'] = pd.cut(\n",
    "    movie_analysis['num_ratings'],\n",
    "    bins=[0, 10, 50, 200, float('inf')],\n",
    "    labels=['Low (≤10)', 'Medium (11-50)', 'High (51-200)', 'Very High (>200)']\n",
    ")\n",
    "\n",
    "print(\"🎬 Movie Popularity Analysis:\")\n",
    "popularity_dist = movie_analysis['popularity_level'].value_counts()\n",
    "for level, count in popularity_dist.items():\n",
    "    percentage = (count / len(movie_analysis)) * 100\n",
    "    print(f\"   • {level}: {count:,} movies ({percentage:.1f}%)\")\n",
    "\n",
    "# Visualizations\n",
    "fig, axes = plt.subplots(2, 3, figsize=(18, 12))\n",
    "fig.suptitle('🎬 Movie Popularity Analysis', fontsize=16, fontweight='bold')\n",
    "\n",
    "# Three histograms sharing one layout, drawn from a spec table instead of\n",
    "# three copy-pasted blocks.\n",
    "# (axis, column, bins, colour, title, x-label, mean label format, draw median line?)\n",
    "hist_specs = [\n",
    "    (axes[0,0], 'num_ratings', 50, 'lightblue', 'Distribution of Ratings per Movie', 'Number of Ratings', '.1f', True),\n",
    "    (axes[0,1], 'avg_rating', 30, 'lightgreen', 'Distribution of Average Movie Ratings', 'Average Rating', '.2f', False),\n",
    "    (axes[1,1], 'rating_std', 30, 'orange', 'Movie Rating Variability', 'Rating Standard Deviation', '.2f', False),\n",
    "]\n",
    "for ax, column, n_bins, colour, title, xlabel, mean_fmt, show_median in hist_specs:\n",
    "    values = movie_stats[column]\n",
    "    ax.hist(values, bins=n_bins, alpha=0.7, color=colour, edgecolor='black')\n",
    "    ax.axvline(values.mean(), color='red', linestyle='--', linewidth=2,\n",
    "               label=f'Mean: {values.mean():{mean_fmt}}')\n",
    "    if show_median:\n",
    "        ax.axvline(values.median(), color='orange', linestyle='--', linewidth=2,\n",
    "                   label=f'Median: {values.median():.1f}')\n",
    "    ax.set_title(title)\n",
    "    ax.set_xlabel(xlabel)\n",
    "    ax.set_ylabel('Number of Movies')\n",
    "    ax.legend()\n",
    "    ax.grid(True, alpha=0.3)\n",
    "axes[0,0].set_xlim(0, 500)  # Focus on main distribution\n",
    "\n",
    "# Popularity vs Rating scatter plot\n",
    "axes[0,2].scatter(movie_stats['num_ratings'], movie_stats['avg_rating'], alpha=0.6, s=20, color='purple')\n",
    "axes[0,2].set_title('Movie Popularity vs Average Rating')\n",
    "axes[0,2].set_xlabel('Number of Ratings (Popularity)')\n",
    "axes[0,2].set_ylabel('Average Rating')\n",
    "axes[0,2].grid(True, alpha=0.3)\n",
    "axes[0,2].set_xlim(0, 1000)  # Focus on main distribution\n",
    "\n",
    "# Pearson correlation coefficient, shown in the top-left corner of the scatter plot\n",
    "correlation = movie_stats['num_ratings'].corr(movie_stats['avg_rating'])\n",
    "axes[0,2].text(0.05, 0.95, f'Correlation: {correlation:.3f}', transform=axes[0,2].transAxes,\n",
    "               bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))\n",
    "\n",
    "# Popularity level pie chart. pd.cut yields a categorical, so value_counts() also\n",
    "# reports empty levels (count 0); drop them to avoid overlapping '0.0%' wedges.\n",
    "observed_popularity = popularity_dist[popularity_dist > 0]\n",
    "axes[1,0].pie(observed_popularity.values, labels=observed_popularity.index, autopct='%1.1f%%', startangle=90)\n",
    "axes[1,0].set_title('Movie Popularity Levels')\n",
    "\n",
    "# Box plot of ratings by popularity level. Levels are taken in their categorical\n",
    "# order (Low -> Very High) rather than in value_counts() order, and a level is only\n",
    "# plotted when it still has data after dropping missing averages.\n",
    "ratings_by_level = {}\n",
    "for level in movie_analysis['popularity_level'].cat.categories:\n",
    "    level_ratings = movie_analysis.loc[movie_analysis['popularity_level'] == level, 'avg_rating'].dropna()\n",
    "    if len(level_ratings) > 0:\n",
    "        ratings_by_level[level] = level_ratings\n",
    "popularity_data = list(ratings_by_level.values())\n",
    "box_labels = list(ratings_by_level.keys())\n",
    "\n",
    "if popularity_data:\n",
    "    axes[1,2].boxplot(popularity_data, labels=box_labels, patch_artist=True,\n",
    "                      boxprops=dict(facecolor='lightcoral', alpha=0.7))\n",
    "    axes[1,2].set_title('Rating Distribution by Popularity Level')\n",
    "    axes[1,2].set_xlabel('Popularity Level')\n",
    "    axes[1,2].set_ylabel('Average Rating')\n",
    "    axes[1,2].tick_params(axis='x', rotation=45)\n",
    "    axes[1,2].grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "\n",
    "def _short_title(title, limit=50):\n",
    "    \"\"\"Return the title as text, adding '...' only when it is actually cut to `limit` characters.\n",
    "\n",
    "    A missing title (possible after the left merge above) becomes 'Unknown title'\n",
    "    instead of raising on the slice.\n",
    "    \"\"\"\n",
    "    text = 'Unknown title' if pd.isna(title) else str(title)\n",
    "    return text if len(text) <= limit else text[:limit] + '...'\n",
    "\n",
    "\n",
    "# Top movies analysis\n",
    "print(\"\\n🏆 Top 10 Most Popular Movies:\")\n",
    "top_popular = movie_analysis.nlargest(10, 'num_ratings')[['title', 'num_ratings', 'avg_rating', 'genres']]\n",
    "for i, (_, movie) in enumerate(top_popular.iterrows(), 1):\n",
    "    print(f\"   {i:2d}. {_short_title(movie['title'])} - {movie['num_ratings']:,} ratings (avg: {movie['avg_rating']:.2f})\")\n",
    "\n",
    "# The 50-rating floor keeps movies with only a handful of ratings out of the ranking\n",
    "print(\"\\n⭐ Top 10 Highest Rated Movies (min 50 ratings):\")\n",
    "top_rated = movie_analysis[movie_analysis['num_ratings'] >= 50].nlargest(10, 'avg_rating')[['title', 'num_ratings', 'avg_rating', 'genres']]\n",
    "for i, (_, movie) in enumerate(top_rated.iterrows(), 1):\n",
    "    print(f\"   {i:2d}. {_short_title(movie['title'])} - {movie['avg_rating']:.2f}/5 ({movie['num_ratings']:,} ratings)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<a id=\"genre-analysis\"></a>\n",
    "## 5. Genre Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Genre analysis\n",
    "if 'genres' in movies_df.columns:\n",
    "    # One row per (movie, genre): split the pipe-separated genre string and explode it.\n",
    "    # This replaces a nested iterrows() loop that rescanned ratings_df once per movie.\n",
    "    movie_cols = ['item_id', 'genres'] + (['year'] if 'year' in movies_df.columns else [])\n",
    "    movie_genres = movies_df.loc[movies_df['genres'].notna(), movie_cols].copy()\n",
    "    movie_genres['genre'] = movie_genres['genres'].astype(str).str.split('|')\n",
    "    movie_genres = movie_genres.explode('genre').reset_index(drop=True)\n",
    "    movie_genres['genre'] = movie_genres['genre'].str.strip()\n",
    "    movie_genres = movie_genres[movie_genres['genre'] != '']  # Skip empty genres\n",
    "\n",
    "    # One row per (rating, genre): a rating of a multi-genre movie counts once per genre\n",
    "    genre_df = (\n",
    "        ratings_df[['user_id', 'item_id', 'rating']]\n",
    "        .merge(movie_genres[['item_id', 'genre']], on='item_id', how='inner')\n",
    "        .loc[:, ['genre', 'rating', 'user_id', 'item_id']]\n",
    "    )\n",
    "\n",
    "    # Genre statistics\n",
    "    genre_stats = genre_df.groupby('genre').agg({\n",
    "        'rating': ['count', 'mean', 'std'],\n",
    "        'item_id': 'nunique',\n",
    "        'user_id': 'nunique'\n",
    "    }).round(3)\n",
    "    genre_stats.columns = ['num_ratings', 'avg_rating', 'rating_std', 'num_movies', 'num_users']\n",
    "    genre_stats['rating_std'] = genre_stats['rating_std'].fillna(0)\n",
    "    genre_stats = genre_stats.sort_values('num_ratings', ascending=False)\n",
    "\n",
    "    # Read the count from its own column: genre_stats.iloc[0] would upcast the whole\n",
    "    # row to float and print the count with a trailing '.0'.\n",
    "    most_popular_genre = genre_stats.index[0]\n",
    "    most_popular_count = int(genre_stats['num_ratings'].iloc[0])\n",
    "    best_rated_genre = genre_stats['avg_rating'].idxmax()\n",
    "\n",
    "    print(\"🎭 Genre Analysis:\")\n",
    "    print(f\"   • Total unique genres: {len(genre_stats)}\")\n",
    "    print(f\"   • Most popular genre: {most_popular_genre} ({most_popular_count:,} ratings)\")\n",
    "    print(f\"   • Highest rated genre: {best_rated_genre} ({genre_stats['avg_rating'].max():.2f}/5)\")\n",
    "\n",
    "    # Visualizations\n",
    "    fig, axes = plt.subplots(2, 3, figsize=(20, 12))\n",
    "    fig.suptitle('🎭 Genre Analysis', fontsize=16, fontweight='bold')\n",
    "\n",
    "    # Top 15 genres by popularity (y-axis inverted so the largest bar is on top)\n",
    "    top_15_genres = genre_stats.head(15)\n",
    "    axes[0,0].barh(range(len(top_15_genres)), top_15_genres['num_ratings'], color='lightblue', alpha=0.8)\n",
    "    axes[0,0].set_yticks(range(len(top_15_genres)))\n",
    "    axes[0,0].set_yticklabels(top_15_genres.index)\n",
    "    axes[0,0].set_title('Top 15 Genres by Number of Ratings')\n",
    "    axes[0,0].set_xlabel('Number of Ratings')\n",
    "    axes[0,0].invert_yaxis()\n",
    "    axes[0,0].grid(True, alpha=0.3)\n",
    "\n",
    "    # Top 15 genres by average rating (minimum 1000 ratings)\n",
    "    top_rated_genres = genre_stats[genre_stats['num_ratings'] >= 1000].nlargest(15, 'avg_rating')\n",
    "    if not top_rated_genres.empty:\n",
    "        axes[0,1].barh(range(len(top_rated_genres)), top_rated_genres['avg_rating'], color='lightgreen', alpha=0.8)\n",
    "        axes[0,1].set_yticks(range(len(top_rated_genres)))\n",
    "        axes[0,1].set_yticklabels(top_rated_genres.index)\n",
    "        axes[0,1].set_title('Top 15 Highest Rated Genres (min 1000 ratings)')\n",
    "        axes[0,1].set_xlabel('Average Rating')\n",
    "        axes[0,1].invert_yaxis()\n",
    "        axes[0,1].grid(True, alpha=0.3)\n",
    "\n",
    "    # Genre diversity (number of movies per genre); bubble area scales with rating count\n",
    "    axes[0,2].scatter(genre_stats['num_movies'], genre_stats['avg_rating'],\n",
    "                      s=genre_stats['num_ratings']/100, alpha=0.6, color='purple')\n",
    "    axes[0,2].set_title('Genre Diversity vs Quality')\n",
    "    axes[0,2].set_xlabel('Number of Movies in Genre')\n",
    "    axes[0,2].set_ylabel('Average Rating')\n",
    "    axes[0,2].grid(True, alpha=0.3)\n",
    "\n",
    "    # Add size legend\n",
    "    axes[0,2].text(0.05, 0.95, 'Bubble size = # ratings', transform=axes[0,2].transAxes,\n",
    "                   bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))\n",
    "\n",
    "    # Rating distribution for top 6 genres (overlaid semi-transparent histograms)\n",
    "    top_6_genres = genre_stats.head(6).index\n",
    "    colors = plt.cm.Set3(np.linspace(0, 1, len(top_6_genres)))\n",
    "\n",
    "    for i, genre in enumerate(top_6_genres):\n",
    "        genre_ratings = genre_df[genre_df['genre'] == genre]['rating']\n",
    "        axes[1,0].hist(genre_ratings, alpha=0.6, label=genre, bins=5, color=colors[i])\n",
    "\n",
    "    axes[1,0].set_title('Rating Distribution by Top 6 Genres')\n",
    "    axes[1,0].set_xlabel('Rating')\n",
    "    axes[1,0].set_ylabel('Frequency')\n",
    "    axes[1,0].legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n",
    "    axes[1,0].grid(True, alpha=0.3)\n",
    "\n",
    "    # Box plot of ratings by top 8 genres\n",
    "    top_8_genres = genre_stats.head(8).index\n",
    "    genre_box_data = [genre_df[genre_df['genre'] == genre]['rating'] for genre in top_8_genres]\n",
    "\n",
    "    box_plot = axes[1,1].boxplot(genre_box_data, labels=top_8_genres, patch_artist=True)\n",
    "\n",
    "    # Color the boxes\n",
    "    colors = plt.cm.Set3(np.linspace(0, 1, len(box_plot['boxes'])))\n",
    "    for patch, color in zip(box_plot['boxes'], colors):\n",
    "        patch.set_facecolor(color)\n",
    "        patch.set_alpha(0.7)\n",
    "\n",
    "    axes[1,1].set_title('Rating Distribution by Top 8 Genres')\n",
    "    axes[1,1].set_xlabel('Genre')\n",
    "    axes[1,1].set_ylabel('Rating')\n",
    "    axes[1,1].tick_params(axis='x', rotation=45)\n",
    "    axes[1,1].grid(True, alpha=0.3)\n",
    "\n",
    "    # Genre popularity over time (if year data is available): number of movies per\n",
    "    # year for the top 6 genres, restricted to 1980-2000.\n",
    "    if 'year' in movies_df.columns:\n",
    "        # between() is False for missing years, so those rows drop out here\n",
    "        in_window = movie_genres['year'].between(1980, 2000)  # Focus on specific range\n",
    "        in_top_genres = movie_genres['genre'].isin(top_6_genres)\n",
    "        year_genre_df = movie_genres.loc[in_window & in_top_genres, ['year', 'genre']]\n",
    "\n",
    "        if not year_genre_df.empty:\n",
    "            year_genre_counts = year_genre_df.groupby(['year', 'genre']).size().unstack(fill_value=0)\n",
    "\n",
    "            for genre in year_genre_counts.columns:\n",
    "                axes[1,2].plot(year_genre_counts.index, year_genre_counts[genre],\n",
    "                              marker='o', label=genre, linewidth=2, markersize=4)\n",
    "\n",
    "            axes[1,2].set_title('Genre Popularity Trends Over Time')\n",
    "            axes[1,2].set_xlabel('Year')\n",
    "            axes[1,2].set_ylabel('Number of Movies')\n",
    "            axes[1,2].legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n",
    "            axes[1,2].grid(True, alpha=0.3)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    # Genre statistics table\n",
    "    print(\"\\n📊 Top 15 Genre Statistics:\")\n",
    "    display(genre_stats.head(15))\n",
    "\n",
    "else:\n",
    "    print(\"⚠️ Genre information not available in the dataset.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<a id=\"temporal-analysis\"></a>\n",
    "## 6. Temporal Patterns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Temporal analysis (if timestamp is available)\n",
    "if 'timestamp' in ratings_df.columns:\n",
    "    print(\"📅 Temporal Analysis:\")\n",
    "\n",
    "    # Convert timestamp to datetime. unit='s' reads the values as seconds since the\n",
    "    # Unix epoch, so the derived hour/day fields are in UTC, not users' local time.\n",
    "    # NOTE: these columns are added to ratings_df in place; 'year' here is the year\n",
    "    # the rating was given.\n",
    "    ratings_df['datetime'] = pd.to_datetime(ratings_df['timestamp'], unit='s')\n",
    "    ratings_df['date'] = ratings_df['datetime'].dt.date\n",
    "    ratings_df['hour'] = ratings_df['datetime'].dt.hour\n",
    "    ratings_df['day_of_week'] = ratings_df['datetime'].dt.day_name()\n",
    "    ratings_df['month'] = ratings_df['datetime'].dt.month\n",
    "    ratings_df['year'] = ratings_df['datetime'].dt.year\n",
    "\n",
    "    # Time range\n",
    "    print(f\"   • Time range: {ratings_df['datetime'].min()} to {ratings_df['datetime'].max()}\")\n",
    "    print(f\"   • Duration: {(ratings_df['datetime'].max() - ratings_df['datetime'].min()).days} days\")\n",
    "\n",
    "    # Visualizations\n",
    "    fig, axes = plt.subplots(2, 3, figsize=(20, 12))\n",
    "    fig.suptitle('📅 Temporal Patterns Analysis', fontsize=16, fontweight='bold')\n",
    "\n",
    "    # Ratings over time (daily)\n",
    "    daily_ratings = ratings_df.groupby('date').size()\n",
    "    axes[0,0].plot(daily_ratings.index, daily_ratings.values, alpha=0.7, linewidth=1)\n",
    "    axes[0,0].set_title('Daily Rating Activity')\n",
    "    axes[0,0].set_xlabel('Date')\n",
    "    axes[0,0].set_ylabel('Number of Ratings')\n",
    "    axes[0,0].tick_params(axis='x', rotation=45)\n",
    "    axes[0,0].grid(True, alpha=0.3)\n",
    "\n",
    "    # Add trend line: least-squares straight line fitted against the day's position\n",
    "    # in the series (0, 1, 2, ...), not against the calendar date itself\n",
    "    z = np.polyfit(range(len(daily_ratings)), daily_ratings.values, 1)\n",
    "    p = np.poly1d(z)\n",
    "    axes[0,0].plot(daily_ratings.index, p(range(len(daily_ratings))), \"r--\", alpha=0.8, linewidth=2, label='Trend')\n",
    "    axes[0,0].legend()\n",
    "\n",
    "    # Hourly distribution\n",
    "    hourly_ratings = ratings_df.groupby('hour').size()\n",
    "    axes[0,1].bar(hourly_ratings.index, hourly_ratings.values, alpha=0.7, color='orange')\n",
    "    axes[0,1].set_title('Hourly Rating Distribution')\n",
    "    axes[0,1].set_xlabel('Hour of Day')\n",
    "    axes[0,1].set_ylabel('Number of Ratings')\n",
    "    axes[0,1].grid(True, alpha=0.3)\n",
    "\n",
    "    # Add peak hour annotation\n",
    "    peak_hour = hourly_ratings.idxmax()\n",
    "    peak_count = hourly_ratings.max()\n",
    "    axes[0,1].annotate(f'Peak: {peak_hour}:00\\n({peak_count:,} ratings)',\n",
    "                       xy=(peak_hour, peak_count), xytext=(peak_hour+2, peak_count+1000),\n",
    "                       arrowprops=dict(arrowstyle='->', color='red'),\n",
    "                       bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.7))\n",
    "\n",
    "    # Day of week distribution. fill_value=0 keeps a weekday with no ratings as an\n",
    "    # integer zero instead of NaN, so the bars and the ':,' formatting below stay valid.\n",
    "    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']\n",
    "    weekly_ratings = ratings_df.groupby('day_of_week').size().reindex(day_order, fill_value=0)\n",
    "\n",
    "    # Weekend bars are highlighted in red\n",
    "    colors = ['red' if day in ['Saturday', 'Sunday'] else 'skyblue' for day in day_order]\n",
    "    axes[0,2].bar(range(len(weekly_ratings)), weekly_ratings.values, color=colors, alpha=0.7)\n",
    "    axes[0,2].set_xticks(range(len(weekly_ratings)))\n",
    "    axes[0,2].set_xticklabels([day[:3] for day in day_order])\n",
    "    axes[0,2].set_title('Weekly Rating Distribution')\n",
    "    axes[0,2].set_xlabel('Day of Week')\n",
    "    axes[0,2].set_ylabel('Number of Ratings')\n",
    "    axes[0,2].grid(True, alpha=0.3)\n",
    "\n",
    "    # Average rating over time\n",
    "    daily_avg_rating = ratings_df.groupby('date')['rating'].mean()\n",
    "    axes[1,0].plot(daily_avg_rating.index, daily_avg_rating.values, alpha=0.7, color='green', linewidth=1)\n",
    "    axes[1,0].axhline(y=ratings_df['rating'].mean(), color='red', linestyle='--', alpha=0.8,\n",
    "                      label=f'Overall Avg: {ratings_df[\"rating\"].mean():.2f}')\n",
    "    axes[1,0].set_title('Average Rating Over Time')\n",
    "    axes[1,0].set_xlabel('Date')\n",
    "    axes[1,0].set_ylabel('Average Rating')\n",
    "    axes[1,0].tick_params(axis='x', rotation=45)\n",
    "    axes[1,0].legend()\n",
    "    axes[1,0].grid(True, alpha=0.3)\n",
    "\n",
    "    # Monthly distribution\n",
    "    monthly_ratings = ratings_df.groupby('month').size()\n",
    "    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',\n",
    "                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']\n",
    "    axes[1,1].bar(monthly_ratings.index, monthly_ratings.values, alpha=0.7, color='purple')\n",
    "    axes[1,1].set_xticks(range(1, 13))\n",
    "    axes[1,1].set_xticklabels(month_names)\n",
    "    axes[1,1].set_title('Monthly Rating Distribution')\n",
    "    axes[1,1].set_xlabel('Month')\n",
    "    axes[1,1].set_ylabel('Number of Ratings')\n",
    "    axes[1,1].grid(True, alpha=0.3)\n",
    "\n",
    "    # Heatmap: Hour vs Day of Week. The grid is forced to 7 days x 24 hours so that\n",
    "    # image column k is always hour k (matching the fixed tick labels below), even\n",
    "    # if some hour or weekday has no ratings at all.\n",
    "    hourly_weekly = ratings_df.groupby(['day_of_week', 'hour']).size().unstack(fill_value=0)\n",
    "    hourly_weekly = hourly_weekly.reindex(index=day_order, columns=range(24), fill_value=0)\n",
    "\n",
    "    im = axes[1,2].imshow(hourly_weekly.values, cmap='YlOrRd', aspect='auto')\n",
    "    axes[1,2].set_title('Rating Activity Heatmap')\n",
    "    axes[1,2].set_xlabel('Hour of Day')\n",
    "    axes[1,2].set_ylabel('Day of Week')\n",
    "    axes[1,2].set_xticks(range(0, 24, 4))\n",
    "    axes[1,2].set_xticklabels(range(0, 24, 4))\n",
    "    axes[1,2].set_yticks(range(len(day_order)))\n",
    "    axes[1,2].set_yticklabels([day[:3] for day in day_order])\n",
    "\n",
    "    # Add colorbar\n",
    "    plt.colorbar(im, ax=axes[1,2], label='Number of Ratings')\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    # Temporal statistics. The weekend/weekday figure compares total ratings on the\n",
    "    # two weekend days with the total on the five weekdays (not per-day averages).\n",
    "    print(\"\\n📊 Temporal Statistics:\")\n",
    "    print(f\"   • Most active day: {weekly_ratings.idxmax()} ({weekly_ratings.max():,} ratings)\")\n",
    "    print(f\"   • Most active hour: {peak_hour}:00 ({peak_count:,} ratings)\")\n",
    "    print(f\"   • Most active month: {month_names[monthly_ratings.idxmax()-1]} ({monthly_ratings.max():,} ratings)\")\n",
    "    print(f\"   • Weekend vs Weekday ratio: {(weekly_ratings['Saturday'] + weekly_ratings['Sunday']) / weekly_ratings[['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']].sum():.2f}\")\n",
    "\n",
    "else:\n",
    "    print(\"⚠️ Timestamp information not available for temporal analysis.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<a id='sparsity-analysis'></a>\n",
    "## 7. Sparsity and Cold Start Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sparsity and cold start analysis\n",
    "print(\"🕳️ Sparsity and Cold Start Analysis:\")\n",
    "\n",
    "# Overall sparsity: 1 - (number of rating rows / number of cells in the user-item matrix)\n",
    "total_possible_ratings = user_item_matrix.shape[0] * user_item_matrix.shape[1]\n",
    "actual_ratings = len(ratings_df)\n",
    "sparsity = 1 - (actual_ratings / total_possible_ratings)\n",
    "\n",
    "print(f\"   • Matrix dimensions: {user_item_matrix.shape[0]:,} users × {user_item_matrix.shape[1]:,} movies\")\n",
    "print(f\"   • Total possible ratings: {total_possible_ratings:,}\")\n",
    "print(f\"   • Actual ratings: {actual_ratings:,}\")\n",
    "print(f\"   • Sparsity level: {sparsity:.4f} ({sparsity*100:.2f}%)\")\n",
    "print(f\"   • Density level: {1-sparsity:.4f} ({(1-sparsity)*100:.2f}%)\")\n",
    "\n",
    "# Cold start analysis: number of ratings per user and per movie\n",
    "user_rating_counts = ratings_df.groupby('user_id').size()\n",
    "movie_rating_counts = ratings_df.groupby('item_id').size()\n",
    "\n",
    "# User cold start categories (≤5, 6-20, >20 ratings)\n",
    "cold_start_users = (user_rating_counts <= 5).sum()\n",
    "warm_start_users = ((user_rating_counts > 5) & (user_rating_counts <= 20)).sum()\n",
    "active_users = (user_rating_counts > 20).sum()\n",
    "\n",
    "# Movie cold start categories (≤10, 11-50, >50 ratings)\n",
    "cold_start_movies = (movie_rating_counts <= 10).sum()\n",
    "warm_start_movies = ((movie_rating_counts > 10) & (movie_rating_counts <= 50)).sum()\n",
    "popular_movies = (movie_rating_counts > 50).sum()\n",
    "\n",
    "print(f\"\\n👥 User Cold Start Analysis:\")\n",
    "print(f\"   • Cold start users (≤5 ratings): {cold_start_users:,} ({cold_start_users/len(user_rating_counts)*100:.1f}%)\")\n",
    "print(f\"   • Warm start users (6-20 ratings): {warm_start_users:,} ({warm_start_users/len(user_rating_counts)*100:.1f}%)\")\n",
    "print(f\"   • Active users (>20 ratings): {active_users:,} ({active_users/len(user_rating_counts)*100:.1f}%)\")\n",
    "\n",
    "print(f\"\\n🎬 Movie Cold Start Analysis:\")\n",
    "print(f\"   • Cold start movies (≤10 ratings): {cold_start_movies:,} ({cold_start_movies/len(movie_rating_counts)*100:.1f}%)\")\n",
    "print(f\"   • Warm start movies (11-50 ratings): {warm_start_movies:,} ({warm_start_movies/len(movie_rating_counts)*100:.1f}%)\")\n",
    "print(f\"   • Popular movies (>50 ratings): {popular_movies:,} ({popular_movies/len(movie_rating_counts)*100:.1f}%)\")\n",
    "\n",
    "# Visualizations: 2x3 grid of panels\n",
    "fig, axes = plt.subplots(2, 3, figsize=(18, 12))\n",
    "fig.suptitle('🕳️ Sparsity and Cold Start Analysis', fontsize=16, fontweight='bold')\n",
    "\n",
    "# Sparsity visualization (sample): only the first sample_size rows and columns are drawn\n",
    "sample_size = 100\n",
    "sample_matrix = user_item_matrix.iloc[:sample_size, :sample_size]\n",
    "im1 = axes[0,0].imshow(sample_matrix, cmap='Blues', aspect='auto')\n",
    "axes[0,0].set_title(f'User-Item Matrix Sample ({sample_size}×{sample_size})')\n",
    "axes[0,0].set_xlabel('Movie ID')\n",
    "axes[0,0].set_ylabel('User ID')\n",
    "plt.colorbar(im1, ax=axes[0,0], label='Rating')\n",
    "\n",
    "# User activity distribution with median and mean markers\n",
    "axes[0,1].hist(user_rating_counts, bins=50, alpha=0.7, color='lightblue', edgecolor='black')\n",
    "axes[0,1].axvline(user_rating_counts.median(), color='red', linestyle='--', linewidth=2,\n",
    "                  label=f'Median: {user_rating_counts.median():.0f}')\n",
    "axes[0,1].axvline(user_rating_counts.mean(), color='orange', linestyle='--', linewidth=2,\n",
    "                  label=f'Mean: {user_rating_counts.mean():.1f}')\n",
    "axes[0,1].set_title('User Activity Distribution')\n",
    "axes[0,1].set_xlabel('Number of Ratings per User')\n",
    "axes[0,1].set_ylabel('Number of Users')\n",
    "axes[0,1].legend()\n",
    "axes[0,1].grid(True, alpha=0.3)\n",
    "\n",
    "# User cold start pie chart\n",
    "user_categories = ['Cold Start\\n(≤5 ratings)', 'Warm Start\\n(6-20 ratings)', 'Active\\n(>20 ratings)']\n",
    "user_counts = [cold_start_users, warm_start_users, active_users]\n",
    "colors_users = ['red', 'orange', 'green']\n",
    "\n",
    "axes[0,2].pie(user_counts, labels=user_categories, colors=colors_users, autopct='%1.1f%%', startangle=90)\n",
    "axes[0,2].set_title('User Activity Categories')\n",
    "\n",
    "# Movie popularity distribution with median and mean markers\n",
    "axes[1,0].hist(movie_rating_counts, bins=50, alpha=0.7, color='lightgreen', edgecolor='black')\n",
    "axes[1,0].axvline(movie_rating_counts.median(), color='red', linestyle='--', linewidth=2,\n",
    "                  label=f'Median: {movie_rating_counts.median():.0f}')\n",
    "axes[1,0].axvline(movie_rating_counts.mean(), color='orange', linestyle='--', linewidth=2,\n",
    "                  label=f'Mean: {movie_rating_counts.mean():.1f}')\n",
    "axes[1,0].set_title('Movie Popularity Distribution')\n",
    "axes[1,0].set_xlabel('Number of Ratings per Movie')\n",
    "axes[1,0].set_ylabel('Number of Movies')\n",
    "axes[1,0].legend()\n",
    "axes[1,0].grid(True, alpha=0.3)\n",
    "axes[1,0].set_xlim(0, 500)  # Focus on main distribution\n",
    "\n",
    "# Movie cold start pie chart\n",
    "movie_categories = ['Cold Start\\n(≤10 ratings)', 'Warm Start\\n(11-50 ratings)', 'Popular\\n(>50 ratings)']\n",
    "movie_counts = [cold_start_movies, warm_start_movies, popular_movies]\n",
    "colors_movies = ['red', 'orange', 'green']\n",
    "\n",
    "axes[1,1].pie(movie_counts, labels=movie_categories, colors=colors_movies, autopct='%1.1f%%', startangle=90)\n",
    "axes[1,1].set_title('Movie Popularity Categories')\n",
    "\n",
    "# Sparsity by user activity level\n",
    "# Inclusive (min, max) rating-count bounds; they mirror the cold/warm/active\n",
    "# categories counted above, so users with exactly 21 ratings count as active.\n",
    "user_sparsity_data = []\n",
    "activity_levels = ['Cold Start (≤5)', 'Warm Start (6-20)', 'Active (>20)']\n",
    "thresholds = [(0, 5), (6, 20), (21, float('inf'))]\n",
    "\n",
    "for (min_ratings, max_ratings), level in zip(thresholds, activity_levels):\n",
    "    # Both bounds are inclusive; `<= float('inf')` is always True, so the\n",
    "    # open-ended top bucket needs no special case.\n",
    "    users_in_level = user_rating_counts[\n",
    "        (user_rating_counts >= min_ratings) & (user_rating_counts <= max_ratings)\n",
    "    ].index\n",
    "    \n",
    "    if len(users_in_level) > 0:\n",
    "        user_subset = user_item_matrix.loc[users_in_level]\n",
    "        subset_sparsity = calculate_sparsity(user_subset.values)\n",
    "        user_sparsity_data.append(subset_sparsity)\n",
    "    else:\n",
    "        # No users in this bucket: keep a zero-height placeholder bar\n",
    "        user_sparsity_data.append(0)\n",
    "\n",
    "axes[1,2].bar(activity_levels, user_sparsity_data, color=['red', 'orange', 'green'], alpha=0.7)\n",
    "axes[1,2].set_title('Sparsity by User Activity Level')\n",
    "axes[1,2].set_xlabel('User Activity Level')\n",
    "axes[1,2].set_ylabel('Sparsity Level')\n",
    "axes[1,2].tick_params(axis='x', rotation=45)\n",
    "axes[1,2].grid(True, alpha=0.3)\n",
    "\n",
    "# Add percentage labels\n",
    "# A dedicated loop variable keeps the overall `sparsity` computed above from being overwritten.\n",
    "# NOTE(review): the '.1%' format assumes calculate_sparsity returns a fraction in [0, 1] - confirm.\n",
    "for i, level_sparsity in enumerate(user_sparsity_data):\n",
    "    axes[1,2].text(i, level_sparsity + 0.01, f'{level_sparsity:.1%}', ha='center', va='bottom', fontweight='bold')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Long tail analysis: share of all ratings held by the top 20% / bottom 50% of users and movies\n",
    "print(f\"\\n📈 Long Tail Analysis:\")\n",
    "print(f\"   • Top 20% of users account for {(user_rating_counts.nlargest(int(len(user_rating_counts)*0.2)).sum() / user_rating_counts.sum())*100:.1f}% of all ratings\")\n",
    "print(f\"   • Top 20% of movies account for {(movie_rating_counts.nlargest(int(len(movie_rating_counts)*0.2)).sum() / movie_rating_counts.sum())*100:.1f}% of all ratings\")\n",
    "print(f\"   • Bottom 50% of users account for {(user_rating_counts.nsmallest(int(len(user_rating_counts)*0.5)).sum() / user_rating_counts.sum())*100:.1f}% of all ratings\")\n",
    "print(f\"   • Bottom 50% of movies account for {(movie_rating_counts.nsmallest(int(len(movie_rating_counts)*0.5)).sum() / movie_rating_counts.sum())*100:.1f}% of all ratings\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<a id='data-quality'></a>\n",
    "## 8. Data Quality Assessment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data quality assessment\n",
    "print(\"🔍 Data Quality Assessment:\")\n",
    "print(\"=\" * 60)\n",
    "\n",
    "# Missing values analysis\n",
    "print(\"\\n📋 Missing Values Analysis:\")\n",
    "print(\"\\nRatings Dataset:\")\n",
    "ratings_missing = ratings_df.isnull().sum()\n",