In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Movie Recommendation System - Exploratory Data Analysis\n",
    "\n",
    "This notebook explores the TMDB movie dataset, preprocesses it, and demonstrates the content-based and collaborative recommendation models."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "import sys\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "\n",
    "# Put the parent directory on the import path so the `src` package resolves\n",
    "# (assumes the kernel's working directory sits one level below the project root).\n",
    "sys.path.append('..')\n",
    "\n",
    "from src.utils.data_loader import load_data, merge_datasets\n",
    "from src.utils.preprocessor import preprocess_movies, create_soup\n",
    "\n",
    "# Select the inline backend BEFORE customising rcParams: on older ipykernel\n",
    "# releases, activating it applies its own rc defaults (including\n",
    "# figure.figsize), which would silently undo the settings below.\n",
    "%matplotlib inline\n",
    "\n",
    "# Plot styling shared by every figure in this notebook\n",
    "sns.set_style('whitegrid')\n",
    "plt.rcParams['figure.figsize'] = (12, 6)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Load the two datasets returned by load_data from the raw-data directory\n",
    "movies, credits = load_data('../data/raw/')\n",
    "\n",
    "# Report the shape of each dataset, one per line\n",
    "print(f\"Movies shape: {movies.shape}\", f\"Credits shape: {credits.shape}\", sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Peek at the first five movie records\n",
    "movies.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Data Overview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Summary statistics (describe() reports on the numeric columns by default)\n",
    "movies.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Count nulls per column, then list only the affected columns, largest count first\n",
    "missing = movies.isna().sum()\n",
    "missing.loc[missing.gt(0)].sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Visualizations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Two side-by-side histograms: average rating (left), log-scaled vote count (right)\n",
    "fig, (ax_rating, ax_votes) = plt.subplots(1, 2, figsize=(12, 5))\n",
    "\n",
    "ax_rating.hist(movies['vote_average'], bins=30, edgecolor='black')\n",
    "ax_rating.set_xlabel('Rating')\n",
    "ax_rating.set_ylabel('Count')\n",
    "ax_rating.set_title('Distribution of Movie Ratings')\n",
    "\n",
    "# log1p(x) = log(1 + x), so a vote count of 0 maps to 0 instead of -inf\n",
    "ax_votes.hist(np.log1p(movies['vote_count']), bins=30, edgecolor='black')\n",
    "ax_votes.set_xlabel('Log(Vote Count + 1)')\n",
    "ax_votes.set_ylabel('Count')\n",
    "ax_votes.set_title('Distribution of Vote Counts (Log Scale)')\n",
    "\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Merge and Preprocess"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Combine the movies and credits datasets into a single frame\n",
    "merged_df = merge_datasets(movies, credits)\n",
    "\n",
    "# Run preprocess_movies first, then create_soup on its result\n",
    "# (presumably create_soup adds the 'soup' column used for training -- confirm in src.utils.preprocessor)\n",
    "processed_df = create_soup(preprocess_movies(merged_df))\n",
    "\n",
    "print(f\"Processed shape: {processed_df.shape}\")\n",
    "processed_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Train Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "from src.models.content_based import ContentBasedRecommender\n",
    "from src.models.collaborative import CollaborativeRecommender\n",
    "\n",
    "N_NEIGHBORS = 10  # value passed as n_neighbors to the collaborative model\n",
    "\n",
    "# Fit the content-based recommender, pointing it at the 'soup' column\n",
    "content_model = ContentBasedRecommender()\n",
    "content_model.fit(processed_df, soup_column='soup')\n",
    "\n",
    "# Fit the collaborative recommender on the same processed frame\n",
    "collab_model = CollaborativeRecommender(n_neighbors=N_NEIGHBORS)\n",
    "collab_model.fit(processed_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Test Recommendations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Title used to query both recommenders\n",
    "test_movie = 'The Dark Knight'\n",
    "\n",
    "\n",
    "def print_ranked(recs):\n",
    "    \"\"\"Print (title, score) pairs as a numbered, column-aligned list starting at 1.\"\"\"\n",
    "    for i, (title, score) in enumerate(recs, 1):\n",
    "        print(f\"{i:2d}. {title:<50} ({score:.3f})\")\n",
    "\n",
    "\n",
    "print(f\"Recommendations for: {test_movie}\")\n",
    "\n",
    "# Each header is printed before its model is queried, so any output from the\n",
    "# model call appears under the matching heading.\n",
    "print(\"\\nContent-Based:\")\n",
    "content_recs = content_model.get_recommendations(test_movie, n=10)\n",
    "print_ranked(content_recs)\n",
    "\n",
    "print(\"\\nCollaborative:\")\n",
    "collab_recs = collab_model.get_recommendations(test_movie, n=10)\n",
    "print_ranked(collab_recs)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}