In [8]:
# 1. Problem Definition
# Build a system to recommend movies based on genre using clustering and content-based filtering.

In [9]:
# 2. Data Collection
import pandas as pd
import ast
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

In [10]:
# Load dataset
# Path to the movies CSV, resolved relative to the notebook's working directory.
DATA_PATH = 'tmdb_5000_movies.csv'

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as exc:
    # Re-raise with guidance: every later cell depends on `df`, so a bare
    # "No such file" here otherwise surfaces as confusing NameErrors downstream.
    raise FileNotFoundError(
        f"Could not find '{DATA_PATH}'. Place the CSV in the notebook's working "
        "directory or point DATA_PATH at its location, then re-run from this cell."
    ) from exc


FileNotFoundError: [Errno 2] No such file or directory: 'tmdb_5000_movies.csv'

In [None]:
# 3. Data Cleaning
def _parse_genres(raw):
    """Return the list of genre names encoded in one raw 'genres' cell.

    Args:
        raw: The cell value. A string is parsed with ``ast.literal_eval`` and
            is expected to evaluate to a sequence of dicts with a 'name' key.
            A list is assumed to be already parsed and is returned unchanged,
            which keeps this cell safe to re-run. Missing values (None/NaN)
            yield an empty list.

    Returns:
        list: The genre names for that row.
    """
    if isinstance(raw, list):
        # Must be checked before pd.isnull: on a list, pd.isnull returns an
        # array whose truth value is ambiguous.
        return raw
    if pd.isnull(raw):
        return []
    return [g['name'] for g in ast.literal_eval(raw)]


# .copy() yields an independent frame, so the column assignment below does not
# raise pandas' SettingWithCopyWarning.
df = df[['title', 'genres']].copy()
df['genres'] = df['genres'].apply(_parse_genres)


In [None]:
# Drop rows with no genre
# Keep only movies that have at least one genre. The explicit .copy() makes the
# filtered result an independent frame, so adding columns to `df` afterwards
# does not raise SettingWithCopyWarning. The original index labels are kept.
has_genre = df['genres'].map(len) > 0
df = df[has_genre].copy()

In [None]:
# 4. Exploratory Data Analysis (EDA)
# Collect every genre occurrence across all movies into one flat list,
# then count how often each genre appears (most frequent first).
all_genres = []
for movie_genres in df['genres']:
    all_genres.extend(movie_genres)

genre_freq = pd.Series(all_genres).value_counts()

In [None]:
# Plot genre distribution
# One bar per genre, in the order of genre_freq.
# NOTE(review): recent seaborn releases may warn when `palette` is passed
# without `hue` — confirm against the installed version.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=genre_freq.index, y=genre_freq.values, palette="viridis", ax=ax)
ax.set_title('Genre Frequency Distribution')
ax.set_xlabel('Genre')
ax.set_ylabel('Number of Movies')
# Tilt the genre labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Count number of genres per movie
df['genre_count'] = df['genres'].map(len)

# Unit-width bins starting at 1; the +2 makes the largest count fall inside
# the last bin, since range() excludes its stop value.
max_genres = df['genre_count'].max()
count_bins = range(1, max_genres + 2)

fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['genre_count'], bins=count_bins, kde=True, color='teal', ax=ax)
ax.set_title("Number of Genres per Movie")
ax.set_xlabel("Genre Count")
ax.set_ylabel("Number of Movies")
plt.show()

In [None]:
# 5. Feature Engineering
# Collapse each movie's genre list into one space-separated string,
# the text form consumed by the vectorizer.
df['genre_str'] = df['genres'].map(" ".join)

In [None]:
# Vectorize genre string
# Bag-of-words counts over the genre strings: one row per movie.
# NOTE(review): with default settings the vectorizer presumably lowercases and
# splits on word boundaries, so multi-word genres become separate tokens — confirm.
genre_docs = df['genre_str']
vectorizer = CountVectorizer()
genre_matrix = vectorizer.fit_transform(genre_docs)

In [None]:
# 6. Model Building - KMeans Clustering
k = 10  # Number of clusters (adjust as needed)

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' across releases, so leaving it implicit makes the cluster labels
# depend on the installed version (and emits a FutureWarning on some releases).
# random_state fixes the centroid seeding so re-runs give the same clusters.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)

# Fit on the genre count matrix and store each movie's cluster label.
df['cluster'] = kmeans.fit_predict(genre_matrix)

In [None]:
# 7. Model Evaluation
# Show sample of movies from one cluster
# Select the rows labelled cluster 0 and print the first ten as a spot check.
cluster_zero = df.loc[df['cluster'] == 0, ['title', 'genre_str']]
print("📌 Sample movies from cluster 0:")
print(cluster_zero.head(10))

In [None]:
# 8. Deployment Preparation
# Combine clustering with genre-based filtering for smarter recommendations

class MovieGenreRecommender:
    """Recommend movie titles by genre keyword or by genre cluster.

    The object holds references to the movie table, the text vectorizer and the
    clustering model it is given; nothing is copied or refitted here.
    """

    def __init__(self, df, vectorizer, kmeans):
        """Store the movie table and the two fitted models.

        Args:
            df: DataFrame queried by the methods below. They read the
                'title', 'genre_str' and 'cluster' columns.
            vectorizer: Object exposing ``transform(list_of_str)``.
            kmeans: Object exposing ``predict(features)``; its labels are
                compared against the 'cluster' column of ``df``.
        """
        self.df = df
        self.vectorizer = vectorizer
        self.kmeans = kmeans

    def recommend_by_genre(self, genre, n=10):
        """Return up to ``n`` titles whose genre string contains ``genre``.

        Matching is a case-insensitive, literal substring test. The query is
        deliberately not treated as a regular expression, so characters such
        as ``(``, ``+`` or ``.`` in user input are matched as-is instead of
        raising ``re.error`` or matching unintended text.

        Args:
            genre: Genre text to look for, e.g. "Action".
            n: Maximum number of titles to return.

        Returns:
            list: The first ``n`` matching titles in table order (not ranked).
        """
        genre = genre.lower()
        # na=False treats rows with a missing genre string as non-matching.
        matches = self.df['genre_str'].str.lower().str.contains(
            genre, regex=False, na=False
        )
        return self.df.loc[matches, 'title'].head(n).tolist()

    def recommend_by_cluster(self, genre, n=10):
        """Return up to ``n`` titles from the cluster predicted for ``genre``.

        The query text is vectorized, assigned a cluster by the clustering
        model, and the first ``n`` titles carrying that cluster label are
        returned in table order (not ranked).

        Args:
            genre: Genre text; several genres may be given in one string.
            n: Maximum number of titles to return.

        Returns:
            list: Titles of movies in the predicted cluster.
        """
        test_vec = self.vectorizer.transform([genre])
        # predict() returns one label per input row; a single row was passed.
        cluster = self.kmeans.predict(test_vec)[0]
        in_cluster = self.df['cluster'] == cluster
        return self.df.loc[in_cluster, 'title'].head(n).tolist()

In [None]:
# Create and save model
# Bundle the movie table, the vectorizer and the clustering model in one object.
model = MovieGenreRecommender(df=df, vectorizer=vectorizer, kmeans=kmeans)

# Serialize the whole recommender to disk.
# NOTE(review): loading this pickle elsewhere presumably requires the
# MovieGenreRecommender class to be defined/importable there — confirm.
with open('movie_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

print("✅ Model saved as movie_model.pkl")