### *Importing & Loading what we need* 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
import pickle

train = pd.read_csv('ml-100k/u1.base', sep='\t', names=columns)
test = pd.read_csv('ml-100k/u1.test', sep='\t', names=columns)

columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=columns)

movie_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

movies = pd.read_csv('ml-100k/u.item', sep='|', names=movie_cols, encoding='latin-1')

NameError: name 'columns' is not defined

In [None]:
ratings.head()

In [None]:
movies.head()

In [None]:
print("\nShape of the ratings:", ratings.shape)

In [None]:
print('Unique Users:', ratings.user_id.nunique())
print('Unique Movies:', ratings.movie_id.nunique())

In [None]:
print("\nShape of the ratings:", movies.shape)

In [None]:
print("\nStatistical Summary:")
ratings.describe()

### *EDA & Data Visualisation* 

In [None]:
plt.figure(figsize=(6,4))
sns.countplot(x='rating', data=ratings, palette='viridis')
plt.title('Distribution of Ratings')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()

In [None]:
movie_rating_count = ratings.groupby('movie_id').size()
plt.figure(figsize=(8,5))
sns.histplot(movie_rating_count, bins=40, kde=True)
plt.title("Number of Ratings per Movie")
plt.xlabel("Ratings Count")
plt.ylabel("Number of Movies")
plt.show()

In [None]:
most_rated = ratings.groupby('movie_id').size().sort_values(ascending=False).head(10)
most_rated = most_rated.reset_index().merge(movies[['movie_id', 'title']], on='movie_id')

In [None]:
plt.figure(figsize=(10,5))
sns.barplot(x='title', y=0, data=most_rated, palette='mako')
plt.xticks(rotation=75)
plt.title('Top 10 Most Rated Movies')
plt.ylabel('Number of Ratings')
plt.show()

### *Recommend movies based on similar users*

In [None]:
user_item_matrix = train.pivot_table(index='user_id', columns='movie_id', values='rating')

# Compute user similarity
user_similarity = pd.DataFrame(cosine_similarity(user_item_matrix.fillna(0)),
                               index=user_item_matrix.index,
                               columns=user_item_matrix.index)

In [None]:
def recommend_user_based(user_id, n_recommendations=5):
    similar_users = user_similarity[user_id].sort_values(ascending=False)[1:6].index
    similar_users_ratings = user_item_matrix.loc[similar_users]
    mean_ratings = similar_users_ratings.mean(axis=0)
    
    user_rated_movies = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id].notna()].index
    recommendations = mean_ratings.drop(user_rated_movies).sort_values(ascending=False).head(n_recommendations)
    recommended_movies = pd.merge(recommendations.reset_index(), movies, on='movie_id')
    return recommended_movies[['title', 0]].rename(columns={0: 'Predicted Rating'})

In [None]:
print("\n User-based recommendations for User 10")
print(recommend_user_based(10))

### *Recommend based on similar items*

In [None]:
item_similarity = pd.DataFrame(cosine_similarity(user_item_matrix.fillna(0).T),
                               index=user_item_matrix.columns,
                               columns=user_item_matrix.columns)

In [None]:
def recommend_item_based(user_id, n_recommendations=5):
    user_ratings = user_item_matrix.loc[user_id].dropna()
    scores = {}
    
    for movie, rating in user_ratings.items():
        similar_items = item_similarity[movie].drop(movie)
        for similar_movie, sim_score in similar_items.items():
            if similar_movie not in user_ratings.index:
                scores[similar_movie] = scores.get(similar_movie, 0) + sim_score * rating
                
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:n_recommendations]
    recommended_movies = pd.DataFrame(sorted_scores, columns=['movie_id', 'score'])
    recommended_movies = pd.merge(recommended_movies, movies, on='movie_id')
    return recommended_movies[['title', 'score']]

In [None]:
print("\nItem-based recommendations for User 10")
print(recommend_item_based(10))

### *EVALUATION (Precision@K)*

In [None]:
def precision_at_k(test_data, user_based=True, k=5):
    hits, total = 0, 0
    
    for user in test_data['user_id'].unique():
        user_test_movies = test_data[test_data['user_id'] == user]['movie_id'].tolist()
        if user_based:
            recs = recommend_user_based(user, n_recommendations=k)
        else:
            recs = recommend_item_based(user, n_recommendations=k)
        recommended_movies = recs['title'].tolist()
        actual_movies = pd.merge(test_data, movies, on='movie_id')
        actual_movies = actual_movies[actual_movies['user_id'] == user]['title'].tolist()
        hits += len(set(recommended_movies) & set(actual_movies))
        total += k
    
    return hits / total

print("\n📏 Evaluating precision@5 ... (this might take a few minutes)")
precision_user = precision_at_k(test, user_based=True, k=5)
precision_item = precision_at_k(test, user_based=False, k=5)

print(f" Precision@5 (User-based): {precision_user:.4f}")
print(f" Precision@5 (Item-based): {precision_item:.4f}")

In [None]:
with open('user_similarity.pkl', 'wb') as f:
    pickle.dump(user_similarity, f)

with open('item_similarity.pkl', 'wb') as f:
    pickle.dump(item_similarity, f)

print("\nModels saved: user_similarity.pkl & item_similarity.pkl")