In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
from surprise import SVD, SVDpp, NMF as SurpriseNMF, Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
from collections import defaultdict
import pickle, joblib
from scipy.sparse import save_npz, csr_matrix
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load data
ratings = pd.read_csv("/kaggle/input/movielens-1m/ratings.csv")
movies = pd.read_csv("/kaggle/input/movielens-1m/movies.csv")

In [3]:
# Display first few rows
print("Ratings:")
print(ratings.head())

Ratings:
   userId  movieId  rating  timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291


In [4]:
print("\nMovies:")
print(movies.head())


Movies:
   movieId                               title                        genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


In [5]:
# ✅ Null value check
print("Null Value Check:\n")
print("Ratings:\n", ratings.isnull().sum())
print("\nMovies:\n", movies.isnull().sum())

Null Value Check:

Ratings:
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

Movies:
 movieId    0
title      0
genres     0
dtype: int64


In [6]:
# ✅ Duplicate check
print("Duplicate Check:\n")
print("Ratings:", ratings.duplicated().sum())
print("Movies:", movies.duplicated().sum())

Duplicate Check:

Ratings: 0
Movies: 0


In [7]:
# Top 10 most-rated movies
top_rated = ratings['movieId'].value_counts().head(10)
top_rated_movies = movies[movies['movieId'].isin(top_rated.index)]
top_rated_movies = top_rated_movies.merge(top_rated.rename('rating_count'), left_on='movieId', right_index=True)
print("\n🎬 Top 10 Most Rated Movies:", top_rated_movies[['title', 'rating_count']])


🎬 Top 10 Most Rated Movies:                                                   title  rating_count
257           Star Wars: Episode IV - A New Hope (1977)          2991
476                                Jurassic Park (1993)          2672
585                   Terminator 2: Judgment Day (1991)          2649
589                    Silence of the Lambs, The (1991)          2578
1178  Star Wars: Episode V - The Empire Strikes Back...          2990
1192  Star Wars: Episode VI - Return of the Jedi (1983)          2883
1250                          Back to the Future (1985)          2583
1959                         Saving Private Ryan (1998)          2653
2502                                 Matrix, The (1999)          2590
2789                             American Beauty (1999)          3428


In [8]:
# Drop 'timestamp' columns if they exist
if 'timestamp' in ratings.columns:
    ratings = ratings.drop(columns=['timestamp'])

# Show cleaned versions
print("Cleaned Ratings:\n", ratings.head())

Cleaned Ratings:
    userId  movieId  rating
0       1     1193       5
1       1      661       3
2       1      914       3
3       1     3408       4
4       1     2355       5


In [9]:
# Merge ratings with movie metadata
ratings_movies = ratings.merge(movies, on='movieId', how='left')
print("\nMerged Ratings and Movies:\n", ratings_movies.head())


Merged Ratings and Movies:
    userId  movieId  rating                                   title  \
0       1     1193       5  One Flew Over the Cuckoo's Nest (1975)   
1       1      661       3        James and the Giant Peach (1996)   
2       1      914       3                     My Fair Lady (1964)   
3       1     3408       4                  Erin Brockovich (2000)   
4       1     2355       5                    Bug's Life, A (1998)   

                         genres  
0                         Drama  
1  Animation|Children's|Musical  
2               Musical|Romance  
3                         Drama  
4   Animation|Children's|Comedy  


In [10]:
# Fill NaNs in genres (if any)
movies['genres'] = movies['genres'].fillna('')

# 🔥 NEW: Sample data for memory efficiency
print("\n📊 Dataset Info:")
print(f"Total ratings: {len(ratings):,}")
print(f"Total users: {ratings['userId'].nunique():,}")
print(f"Total movies: {ratings['movieId'].nunique():,}")

# For development, let's work with a sample
sample_users = ratings['userId'].unique()[:5000]
ratings_sample = ratings[ratings['userId'].isin(sample_users)]
print(f"\n🎯 Working with sample: {len(ratings_sample):,} ratings from {len(sample_users):,} users")


📊 Dataset Info:
Total ratings: 1,000,209
Total users: 6,040
Total movies: 3,706

🎯 Working with sample: 831,852 ratings from 5,000 users


In [11]:
# Full dataset
working_ratings = ratings

user_item_matrix = working_ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
print(f"User-Item Matrix Shape: {user_item_matrix.shape}")
print("Sample of User-Item Matrix:")
print(user_item_matrix.iloc[:5, :5])

# Convert to sparse matrix for memory efficiency
user_item_sparse = csr_matrix(user_item_matrix.values)
print(f"Sparse matrix density: {user_item_sparse.nnz / (user_item_sparse.shape[0] * user_item_sparse.shape[1]):.4f}")

User-Item Matrix Shape: (6040, 3706)
Sample of User-Item Matrix:
movieId    1    2    3    4    5
userId                          
1        5.0  0.0  0.0  0.0  0.0
2        0.0  0.0  0.0  0.0  0.0
3        0.0  0.0  0.0  0.0  0.0
4        0.0  0.0  0.0  0.0  0.0
5        0.0  0.0  0.0  0.0  0.0
Sparse matrix density: 0.0447


# **User-Based Collaborative Filtering**

In [12]:
user_similarity = cosine_similarity(user_item_sparse)
user_similarity_df = pd.DataFrame(user_similarity, 
                                 index=user_item_matrix.index, 
                                 columns=user_item_matrix.index)
print(f"User Similarity Matrix Shape: {user_similarity_df.shape}")
print("Sample User Similarities:")
print(user_similarity_df.iloc[:5, :5])

User Similarity Matrix Shape: (6040, 6040)
Sample User Similarities:
userId         1         2         3         4         5
userId                                                  
1       1.000000  0.096382  0.120610  0.132455  0.090158
2       0.096382  1.000000  0.151479  0.171176  0.114394
3       0.120610  0.151479  1.000000  0.151227  0.062907
4       0.132455  0.171176  0.151227  1.000000  0.045094
5       0.090158  0.114394  0.062907  0.045094  1.000000


In [13]:
def get_user_based_recommendations(user_id, n=10, min_similarity=0.1):
    """Get recommendations based on similar users"""
    if user_id not in user_similarity_df.index:
        print(f"❌ User {user_id} not found in similarity matrix")
        return pd.DataFrame()
    
    # Get similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:]  # Exclude self
    similar_users = similar_users[similar_users > min_similarity]
    
    if len(similar_users) == 0:
        print(f"❌ No similar users found for user {user_id}")
        return pd.DataFrame()
    
    # Get movies rated by user
    user_movies = set(user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index)
    
    # Calculate weighted ratings from similar users
    recommendations = {}
    
    for similar_user, similarity in similar_users.head(20).items():  # Top 20 similar users
        similar_user_movies = user_item_matrix.loc[similar_user]
        similar_user_movies = similar_user_movies[similar_user_movies > 0]
        
        # Only consider movies not seen by target user
        for movie_id, rating in similar_user_movies.items():
            if movie_id not in user_movies:
                if movie_id not in recommendations:
                    recommendations[movie_id] = []
                recommendations[movie_id].append(rating * similarity)
    
    # Calculate average weighted ratings
    final_recommendations = {}
    for movie_id, weighted_ratings in recommendations.items():
        final_recommendations[movie_id] = np.mean(weighted_ratings)
    
    # Sort and get top N
    sorted_recs = sorted(final_recommendations.items(), key=lambda x: x[1], reverse=True)[:n]
    
    # Get movie titles
    movie_ids = [rec[0] for rec in sorted_recs]
    scores = [rec[1] for rec in sorted_recs]
    
    result_df = movies[movies['movieId'].isin(movie_ids)].copy()
    result_df['predicted_rating'] = result_df['movieId'].map(dict(sorted_recs))
    
    return result_df[['title', 'genres', 'predicted_rating']].sort_values('predicted_rating', ascending=False)

In [14]:
# Test user-based recommendations
test_user = user_item_matrix.index[0]
print(f"\n🎯 User-Based Recommendations for User {test_user}:")
user_recs = get_user_based_recommendations(test_user)
print(user_recs)


🎯 User-Based Recommendations for User 1:
                                              title  \
3229                             Boiler Room (2000)   
1603            Fast, Cheap & Out of Control (1997)   
2855               Drunken Master (Zui quan) (1979)   
2931  Princess Mononoke, The (Mononoke Hime) (1997)   
1211                              Annie Hall (1977)   
982                                Big Night (1996)   
1046                                Swingers (1996)   
1502                        Addicted to Love (1997)   
731      Ghost in the Shell (Kokaku kidotai) (1995)   
56                     Home for the Holidays (1995)   

                          genres  predicted_rating  
3229                       Drama          2.060585  
1603                 Documentary          1.960552  
2855               Action|Comedy          1.960552  
2931  Action|Adventure|Animation          1.960552  
1211              Comedy|Romance          1.828631  
982                        Drama  

# **Item-Based Collaborative Filtering**

In [15]:
# Transpose for item-item similarity
item_item_matrix = user_item_matrix.T
item_similarity = cosine_similarity(csr_matrix(item_item_matrix.values))
item_similarity_df = pd.DataFrame(item_similarity,
                                 index=item_item_matrix.index,
                                 columns=item_item_matrix.index)
print(f"Item Similarity Matrix Shape: {item_similarity_df.shape}")

Item Similarity Matrix Shape: (3706, 3706)


In [16]:
def get_item_based_recommendations(user_id, n=10, min_similarity=0.1):
    """Get recommendations based on item similarity"""
    if user_id not in user_item_matrix.index:
        print(f"❌ User {user_id} not found")
        return pd.DataFrame()
    
    # Get movies rated by user
    user_ratings = user_item_matrix.loc[user_id]
    user_movies = user_ratings[user_ratings > 0]
    
    if len(user_movies) == 0:
        print(f"❌ User {user_id} has no ratings")
        return pd.DataFrame()
    
    # Calculate item-based recommendations
    recommendations = {}
    
    for movie_id, rating in user_movies.items():
        # Get similar movies
        similar_movies = item_similarity_df[movie_id].sort_values(ascending=False)[1:]  # Exclude self
        similar_movies = similar_movies[similar_movies > min_similarity]
        
        for similar_movie, similarity in similar_movies.head(10).items():
            if similar_movie not in user_movies.index:  # Not seen by user
                if similar_movie not in recommendations:
                    recommendations[similar_movie] = []
                recommendations[similar_movie].append(rating * similarity)
    
    # Calculate final scores
    final_recommendations = {}
    for movie_id, weighted_ratings in recommendations.items():
        final_recommendations[movie_id] = np.mean(weighted_ratings)
    
    # Sort and get top N
    sorted_recs = sorted(final_recommendations.items(), key=lambda x: x[1], reverse=True)[:n]
    
    # Get movie titles
    movie_ids = [rec[0] for rec in sorted_recs]
    result_df = movies[movies['movieId'].isin(movie_ids)].copy()
    result_df['predicted_rating'] = result_df['movieId'].map(dict(sorted_recs))
    
    return result_df[['title', 'genres', 'predicted_rating']].sort_values('predicted_rating', ascending=False)

In [17]:
# Test item-based recommendations
print(f"\n🎯 Item-Based Recommendations for User {test_user}:")
item_recs = get_item_based_recommendations(test_user)
print(item_recs)


🎯 Item-Based Recommendations for User 1:
                                                  title  \
108                                   Braveheart (1995)   
1942                  Back to the Future Part II (1989)   
585                   Terminator 2: Judgment Day (1991)   
592                                    Pinocchio (1940)   
589                    Silence of the Lambs, The (1991)   
1178  Star Wars: Episode V - The Empire Strikes Back...   
1196                                       Alien (1979)   
1019                         Alice in Wonderland (1951)   
2018                                   Peter Pan (1953)   
1195                                  GoodFellas (1990)   

                                    genres  predicted_rating  
108                       Action|Drama|War          3.270914  
1942                         Comedy|Sci-Fi          3.113642  
585                 Action|Sci-Fi|Thriller          2.848464  
592                   Animation|Children's          2.77

# **Content-Based Filtering**

In [18]:
# Use top-N popular movies as before
tfidf = TfidfVectorizer(token_pattern=r'[^|]+')
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# Fit nearest neighbors model
nn = NearestNeighbors(metric='cosine', algorithm='brute')
nn.fit(tfidf_matrix)

# Function to recommend
def get_content_recommendations(title, n=10):
    idx = movies[movies['title'] == title].index[0]
    distances, indices = nn.kneighbors(tfidf_matrix[idx], n_neighbors=n+1)
    rec_indices = indices.flatten()[1:]
    return movies[['title', 'genres']].iloc[rec_indices]

# Example
print("\n🎯 Content-Based Recommendations for 'Toy Story (1995)':")
print(get_content_recommendations('Toy Story (1995)'))


🎯 Content-Based Recommendations for 'Toy Story (1995)':
                                               title  \
2285                       Rugrats Movie, The (1998)   
3685  Adventures of Rocky and Bullwinkle, The (2000)   
3682                              Chicken Run (2000)   
3542                           Saludos Amigos (1943)   
2073      American Tail: Fievel Goes West, An (1991)   
2072                        American Tail, An (1986)   
1050          Aladdin and the King of Thieves (1996)   
0                                   Toy Story (1995)   
2286                            Bug's Life, A (1998)   
2807                               Thumbelina (1994)   

                           genres  
2285  Animation|Children's|Comedy  
3685  Animation|Children's|Comedy  
3682  Animation|Children's|Comedy  
3542  Animation|Children's|Comedy  
2073  Animation|Children's|Comedy  
2072  Animation|Children's|Comedy  
1050  Animation|Children's|Comedy  
0     Animation|Children's|Comedy  
22

# **Matrix Factorization Comparison**

In [19]:
# Prepare data for Surprise
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(working_ratings[['userId', 'movieId', 'rating']], reader)

# Split data for evaluation
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Define models to compare
models = {
    'SVD': SVD(random_state=42),
    'SVD++': SVDpp(random_state=42),
    'NMF': SurpriseNMF(random_state=42)
}

# Train and evaluate models
model_results = {}
trained_models = {}

print("\n📈 Model Performance Comparison:")
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(trainset)
    predictions = model.test(testset)
    
    rmse = accuracy.rmse(predictions, verbose=False)
    mae = accuracy.mae(predictions, verbose=False)
    
    model_results[name] = {'RMSE': rmse, 'MAE': mae}
    trained_models[name] = model
    
    print(f"{name} - RMSE: {rmse:.4f}, MAE: {mae:.4f}")


📈 Model Performance Comparison:

Training SVD...
SVD - RMSE: 0.8730, MAE: 0.6849

Training SVD++...
SVD++ - RMSE: 0.8630, MAE: 0.6733

Training NMF...
NMF - RMSE: 0.9169, MAE: 0.7239


# **Precision@K Evaluation Method**

In [20]:
def precision_at_k(predictions, k=10, threshold=3.5):
    """Calculate Precision@K"""
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    precisions = {}
    for uid, user_ratings in user_est_true.items():
        # Sort by estimated rating
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        
        # Get top K recommendations
        top_k = user_ratings[:k]
        
        # Count relevant items (rating >= threshold)
        relevant_recommended = sum(1 for (est, true_r) in top_k if true_r >= threshold)
        
        # Precision@K = relevant_recommended / k
        precisions[uid] = relevant_recommended / k if k > 0 else 0
    
    return precisions

In [21]:
def recall_at_k(predictions, k=10, threshold=3.5):
    """Calculate Recall@K"""
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    recalls = {}
    for uid, user_ratings in user_est_true.items():
        # Sort by estimated rating
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        
        # Get top K recommendations
        top_k = user_ratings[:k]
        
        # Count total relevant items for this user
        total_relevant = sum(1 for (est, true_r) in user_ratings if true_r >= threshold)
        
        if total_relevant == 0:
            recalls[uid] = 0
        else:
            # Count relevant items in top K
            relevant_recommended = sum(1 for (est, true_r) in top_k if true_r >= threshold)
            recalls[uid] = relevant_recommended / total_relevant
    
    return recalls

In [22]:
def ndcg_at_k(predictions, k=10):
    """Calculate NDCG@K"""
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    ndcgs = {}
    for uid, user_ratings in user_est_true.items():
        # Sort by estimated rating
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        
        # Get top K
        top_k = user_ratings[:k]
        
        # Calculate DCG
        dcg = sum((2**rel - 1) / np.log2(i + 2) for i, (est, rel) in enumerate(top_k))
        
        # Calculate IDCG (ideal DCG)
        ideal_ratings = sorted([true_r for (est, true_r) in user_ratings], reverse=True)[:k]
        idcg = sum((2**rel - 1) / np.log2(i + 2) for i, rel in enumerate(ideal_ratings))
        
        # NDCG
        ndcgs[uid] = dcg / idcg if idcg > 0 else 0
    
    return ndcgs

In [23]:
# Evaluate best model (SVD++) with Precision@K
best_model = trained_models['SVD++']
predictions = best_model.test(testset)

for k in [5, 10, 20]:
    precisions = precision_at_k(predictions, k=k)
    recalls = recall_at_k(predictions, k=k)
    ndcgs = ndcg_at_k(predictions, k=k)
    
    avg_precision = np.mean(list(precisions.values()))
    avg_recall = np.mean(list(recalls.values()))
    avg_ndcg = np.mean(list(ndcgs.values()))
    
    print(f"K={k:2d} | Precision: {avg_precision:.4f} | Recall: {avg_recall:.4f} | NDCG: {avg_ndcg:.4f}")

K= 5 | Precision: 0.7939 | Recall: 0.4397 | NDCG: 0.8065
K=10 | Precision: 0.6860 | Recall: 0.6378 | NDCG: 0.8356
K=20 | Precision: 0.5337 | Recall: 0.8061 | NDCG: 0.8678


# **Hybrid Recommendation**

In [24]:
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

def hybrid_recommendations(user_id, movie_title=None, n=10, weights={'collaborative': 0.6, 'content': 0.4}):
    """Enhanced hybrid recommendations combining multiple approaches"""
    recommendations = []
    
    # 1. User-based collaborative filtering
    try:
        user_collab = get_user_based_recommendations(user_id, n=n*2)
        if not user_collab.empty:
            user_collab['method'] = 'user_collaborative'
            user_collab['score'] = user_collab['predicted_rating']
            recommendations.append(user_collab)
    except:
        pass
    
    # 2. Item-based collaborative filtering
    try:
        item_collab = get_item_based_recommendations(user_id, n=n*2)
        if not item_collab.empty:
            item_collab['method'] = 'item_collaborative'
            item_collab['score'] = item_collab['predicted_rating']
            recommendations.append(item_collab)
    except:
        pass
    
    # 3. Matrix factorization (SVD++)
    try:
        user_rated = working_ratings[working_ratings['userId'] == user_id]['movieId'].tolist()
        all_movies = working_ratings['movieId'].unique()
        movies_to_predict = [m for m in all_movies if m not in user_rated][:n*3]  # Limit for efficiency
        
        svd_predictions = [best_model.predict(user_id, movie_id) for movie_id in movies_to_predict]
        svd_predictions.sort(key=lambda x: x.est, reverse=True)
        
        svd_movie_ids = [pred.iid for pred in svd_predictions[:n*2]]
        svd_recs = movies[movies['movieId'].isin(svd_movie_ids)].copy()
        svd_recs['method'] = 'matrix_factorization'
        svd_recs['score'] = svd_recs['movieId'].map({pred.iid: pred.est for pred in svd_predictions})
        recommendations.append(svd_recs[['title', 'genres', 'method', 'score']])
    except:
        pass
    
    # 4. Content-based (if movie title provided)
    if movie_title and movie_title in indices:
        try:
            content_recs = get_content_recommendations(movie_title, n=n)
            if isinstance(content_recs, pd.DataFrame) and not content_recs.empty:
                content_recs['method'] = 'content_based'
                content_recs['score'] = 4.0  # Default score for content-based
                recommendations.append(content_recs[['title', 'genres', 'method', 'score']])
        except:
            pass
    
    # Combine all recommendations
    if not recommendations:
        return "❌ No recommendations could be generated"
    
    combined_recs = pd.concat(recommendations, ignore_index=True)
    
    # Remove duplicates by keeping highest scoring version
    combined_recs = combined_recs.sort_values('score', ascending=False).drop_duplicates('title', keep='first')
    
    # Calculate final weighted score (simple average for now)
    final_recs = combined_recs.groupby('title').agg({
        'genres': 'first',
        'score': 'mean',
        'method': lambda x: ', '.join(x.unique())
    }).reset_index()
    
    return final_recs.sort_values('score', ascending=False).head(n)

In [25]:
# Test hybrid recommendations
print(f"\n🎬 Hybrid Recommendations for User {test_user}:")
hybrid_recs = hybrid_recommendations(user_id=test_user, movie_title='Toy Story (1995)')
print(hybrid_recs)


🎬 Hybrid Recommendations for User 1:
                                       title                genres     score  \
28                         GoodFellas (1990)           Crime|Drama  4.358106   
19     Day the Earth Stood Still, The (1951)          Drama|Sci-Fi  4.343721   
30          Hunt for Red October, The (1990)       Action|Thriller  4.333865   
67                            Yojimbo (1961)  Comedy|Drama|Western  4.330007   
23                    Few Good Men, A (1992)           Crime|Drama  4.305021   
27                          Gladiator (2000)          Action|Drama  4.290161   
34                     Mister Roberts (1955)      Comedy|Drama|War  4.262183   
53                  Stand and Deliver (1987)                 Drama  4.214750   
64                  Untouchables, The (1987)    Action|Crime|Drama  4.198777   
45  Shall We Dance? (Shall We Dansu?) (1996)                Comedy  4.164543   

                  method  
28  matrix_factorization  
19  matrix_factorization  


In [26]:
print(f"\n🎬 Hybrid Recommendations for User {test_user} (without seed movie):")
hybrid_recs_no_seed = hybrid_recommendations(user_id=test_user)
print(hybrid_recs_no_seed)


🎬 Hybrid Recommendations for User 1 (without seed movie):
                                       title                genres     score  \
22                         GoodFellas (1990)           Crime|Drama  4.358106   
13     Day the Earth Stood Still, The (1951)          Drama|Sci-Fi  4.343721   
24          Hunt for Red October, The (1990)       Action|Thriller  4.333865   
57                            Yojimbo (1961)  Comedy|Drama|Western  4.330007   
17                    Few Good Men, A (1992)           Crime|Drama  4.305021   
21                          Gladiator (2000)          Action|Drama  4.290161   
28                     Mister Roberts (1955)      Comedy|Drama|War  4.262183   
45                  Stand and Deliver (1987)                 Drama  4.214750   
54                  Untouchables, The (1987)    Action|Crime|Drama  4.198777   
37  Shall We Dance? (Shall We Dansu?) (1996)                Comedy  4.164543   

                  method  
22  matrix_factorization  
13  ma

# **Cold Start Handling**

In [27]:
def handle_cold_start_user(n=10):
    """Recommend popular movies for new users"""
    popular_movies = working_ratings.groupby('movieId').agg({
        'rating': ['count', 'mean']
    }).reset_index()
    popular_movies.columns = ['movieId', 'rating_count', 'avg_rating']
    
    # Filter movies with enough ratings and high average
    popular_movies = popular_movies[
        (popular_movies['rating_count'] >= 50) & 
        (popular_movies['avg_rating'] >= 4.0)
    ].sort_values(['avg_rating', 'rating_count'], ascending=False)
    
    result = movies[movies['movieId'].isin(popular_movies['movieId'].head(n))].copy()
    result = result.merge(popular_movies[['movieId', 'avg_rating', 'rating_count']], on='movieId')
    
    return result[['title', 'genres', 'avg_rating', 'rating_count']].head(n)

In [28]:
def handle_cold_start_movie(movie_title, n=10):
    """Recommend similar movies for new movies using content"""
    return get_content_recommendations(movie_title, n)

print("\n🔥 Cold Start - Popular Movies for New Users:")
cold_start_recs = handle_cold_start_user()
print(cold_start_recs)


🔥 Cold Start - Popular Movies for New Users:
                                               title  \
0                         Usual Suspects, The (1995)   
1                   Shawshank Redemption, The (1994)   
2                            Schindler's List (1993)   
3                              Close Shave, A (1995)   
4                              Godfather, The (1972)   
5      Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)   
6                         Wrong Trousers, The (1993)   
7                     Raiders of the Lost Ark (1981)   
8  Seven Samurai (The Magnificent Seven) (Shichin...   
9                                     Sanjuro (1962)   

                      genres  avg_rating  rating_count  
0             Crime|Thriller    4.517106          1783  
1                      Drama    4.554558          2227  
2                  Drama|War    4.510417          2304  
3  Animation|Comedy|Thriller    4.520548           657  
4         Action|Crime|Drama    4.524966          22

# **Saving Models**

In [29]:
# Save original models
pickle.dump(best_model, open("svdpp_model.pkl", "wb"))
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
save_npz("tfidf_matrix.npz", tfidf_matrix)
joblib.dump(nn, "nn_model.pkl")

# Save new models and matrices
joblib.dump(user_item_matrix, "user_item_matrix.pkl")
joblib.dump(user_similarity_df, "user_similarity_matrix.pkl")
joblib.dump(item_similarity_df, "item_similarity_matrix.pkl")
joblib.dump(trained_models, "all_trained_models.pkl")
joblib.dump(movies, "movies_df.pkl")
joblib.dump(indices, "title_indices.pkl")

# Save evaluation results
joblib.dump(model_results, "model_evaluation_results.pkl")

print("✅ All models and data saved successfully!")

✅ All models and data saved successfully!


# **Summary Report**

In [30]:
print("\n" + "="*60)
print("🎬 MOVIE RECOMMENDATION SYSTEM - SUMMARY REPORT")
print("="*60)

print(f"\n📊 Dataset Statistics:")
print(f"   • Total Ratings: {len(working_ratings):,}")
print(f"   • Total Users: {working_ratings['userId'].nunique():,}")
print(f"   • Total Movies: {working_ratings['movieId'].nunique():,}")
print(f"   • Sparsity: {(1 - working_ratings.shape[0]/(working_ratings['userId'].nunique() * working_ratings['movieId'].nunique()))*100:.2f}%")

print(f"\n🔧 Implemented Methods:")
print(f"   ✅ User-Item Matrix Construction")
print(f"   ✅ User-Based Collaborative Filtering")
print(f"   ✅ Item-Based Collaborative Filtering") 
print(f"   ✅ Content-Based Filtering")
print(f"   ✅ Matrix Factorization (SVD, SVD++, NMF)")
print(f"   ✅ Hybrid Recommendations")
print(f"   ✅ Cold Start Handling")

print(f"\n📈 Evaluation Metrics:")
print(f"   ✅ RMSE & MAE")
print(f"   ✅ Precision@K")
print(f"   ✅ Recall@K") 
print(f"   ✅ NDCG@K")

print(f"\n🏆 Best Model Performance:")
best_rmse = min([results['RMSE'] for results in model_results.values()])
best_model_name = [name for name, results in model_results.items() if results['RMSE'] == best_rmse][0]
print(f"   • {best_model_name}: RMSE = {best_rmse:.4f}")

print("\n" + "="*60)
print("🎉 ENHANCED RECOMMENDATION SYSTEM COMPLETE!")
print("="*60)


🎬 MOVIE RECOMMENDATION SYSTEM - SUMMARY REPORT

📊 Dataset Statistics:
   • Total Ratings: 1,000,209
   • Total Users: 6,040
   • Total Movies: 3,706
   • Sparsity: 95.53%

🔧 Implemented Methods:
   ✅ User-Item Matrix Construction
   ✅ User-Based Collaborative Filtering
   ✅ Item-Based Collaborative Filtering
   ✅ Content-Based Filtering
   ✅ Matrix Factorization (SVD, SVD++, NMF)
   ✅ Hybrid Recommendations
   ✅ Cold Start Handling

📈 Evaluation Metrics:
   ✅ RMSE & MAE
   ✅ Precision@K
   ✅ Recall@K
   ✅ NDCG@K

🏆 Best Model Performance:
   • SVD++: RMSE = 0.8630

🎉 ENHANCED RECOMMENDATION SYSTEM COMPLETE!
