In [1]:
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings('ignore')

## Exploring Data 

In [2]:
ratings = pd.read_csv('../data/ml-100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
movies= pd.read_csv('../data/ml-100k/u.item', sep='|', encoding='latin-1', names=['movie_id', 'title'], usecols=[0, 1])
movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
print(ratings.shape) 
print(movies.shape)

(100000, 4)
(1682, 2)


In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   movie_id   100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  1682 non-null   int64 
 1   title     1682 non-null   object
dtypes: int64(1), object(1)
memory usage: 26.4+ KB


# User Based Filtering

## BUILD USER-ITEM MATRIX

In [7]:
user_item_matrix= ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
user_item_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
user_item_matrix.shape

(943, 1682)

## COMPUTE USER SIMILARITY

In [9]:
similar_users= cosine_similarity(user_item_matrix)
print(similar_users.shape) 


(943, 943)


In [10]:
similar_users_df= pd.DataFrame(similar_users, 
                               index=user_item_matrix.index, 
                               columns=user_item_matrix.index
                              )

similar_users_df.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.166931,0.04746,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
2,0.166931,1.0,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.22679,0.161485,0.172268,0.105798
3,0.04746,0.110591,1.0,0.344151,0.021245,0.072415,0.066137,0.08306,0.06104,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.16189,0.101243,0.133416,0.026556
4,0.064358,0.178121,0.344151,1.0,0.031804,0.068044,0.09123,0.18806,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
5,0.378475,0.072979,0.021245,0.031804,1.0,0.237286,0.3736,0.24893,0.056847,0.201427,...,0.338794,0.08058,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941


In [11]:
# top_10_similar_users= similar_users_df.loc[1].sort_values(ascending= False)[1:11]
# top_10_similar_users

Here we look at what those top 10 similar users rated highly & 
Recommend movies that user 1 hasn’t seen but similar users loved

## GENERATING RECOMMENDATIONS

Generate movie recommendations for user 1 based on what similar users liked


In [12]:
# #Get Ratings from Similar Users
# similar_users_movie_ratings= ratings[ratings['user_id'].isin(top_10_similar_users.index)]
# similar_users_movie_ratings

In [13]:
# similar_ratings= similar_users_movie_ratings.copy()

# similar_ratings['similarity']= similar_ratings['user_id'].map(top_10_similar_users)

# similar_ratings.head()

In [14]:
# #Compute Weighted Average Rating for Each Movie


# # Create a new column: weighted_rating = rating * similarity
# similar_ratings['weighted_rating']= similar_ratings['rating']* similar_ratings['similarity']



# # Group by movie_id and sum weighted ratings and similarity weights
# grouped_movies= similar_ratings.groupby('movie_id').agg({
#                 'weighted_rating': 'sum', 
#                 'similarity': 'sum'
                
# })

# grouped_movies


In [15]:
# grouped_movies['recommendation_score']= (grouped_movies['weighted_rating']) / (grouped_movies['similarity'])
# grouped_movies

In [16]:
# # Remove movies already rated by user 1
# user1_movies= ratings[ratings['user_id']== 1]['movie_id']
# user1_movies

In [17]:
# recommended_movies= grouped_movies[~grouped_movies.index.isin(user1_movies)]
# recommended_movies

In [18]:
# top_10= recommended_movies['recommendation_score'].sort_values(ascending=False).head(10)
# top_10

In [19]:
# recommended_movies_titles= movies.set_index('movie_id').loc[top_10.index, 'title']
# recommended_movies_titles

In [20]:
# recommended_movies_titles.values

### Make a function to recommend movies (Better)


so far what I did " recommending for user 1" we need to generalize this for other users 

In [21]:
def recommend_movies(user_id, num_recommendations, users_df= similar_users_df, ratings_df= ratings, movies_df= movies):
   
    similar_users= similar_users_df.loc[user_id].sort_values(ascending=False)[1:11]

    similar_ratings= ratings_df[ratings_df['user_id'].isin(similar_users.index)].copy()

    similar_ratings['similarity']= similar_ratings['user_id'].map(similar_users)

    similar_ratings['weighted_rating']= similar_ratings['rating']* similar_ratings['similarity']

    movies_groups= similar_ratings.groupby('movie_id').agg({
        'weighted_rating':'sum',
        'similarity': 'sum'
    })

    movies_groups['predicted_rating']= (movies_groups['weighted_rating']) / (movies_groups['similarity']) 

    selected_user_rated_movies= ratings_df[ratings_df['user_id']== user_id]['movie_id']

    recommed= movies_groups[~movies_groups.index.isin(selected_user_rated_movies)]

    top_movies= recommed['predicted_rating'].sort_values(ascending=False).head(num_recommendations)
    top_movies_titles= movies_df.set_index('movie_id').loc[top_movies.index, 'title']

    return top_movies_titles.values

    

In [22]:
# Testing the function 

print("User 1:")
print(recommend_movies(1, 5))

print("\nUser 10:")
print(recommend_movies(42, 4))

print("\nUser 100:")
print(recommend_movies(900, 3))

User 1:
['Wings of Desire (1987)' 'American in Paris, An (1951)'
 'Waiting for Guffman (1996)' 'Stealing Beauty (1996)'
 'Walk in the Clouds, A (1995)']

User 10:
['Cinema Paradiso (1988)' 'Close Shave, A (1995)'
 'Immortal Beloved (1994)' 'Once Upon a Time in the West (1969)']

User 100:
['Groundhog Day (1993)' 'Apt Pupil (1998)' 'L.A. Confidential (1997)']


### Evaluation Function 

In [23]:
def evaluate_recommendations(user_id, ratings_df= ratings, movies_df= movies, n_recommed=5): 
    selected_user_ratings= ratings_df[ratings_df['user_id']== user_id]

    if len(selected_user_ratings)<5:
        return None 

    train_user_ratings, test_user_ratings = train_test_split(selected_user_ratings, test_size=0.2, random_state=42)

    test_indices= test_user_ratings.index
    train_ratings = ratings_df[~ratings_df.index.isin(test_indices)]
    
    train_matrix= train_ratings.pivot(index='user_id', columns= 'movie_id', values= 'rating').fillna(0)


    train_user_similarity= cosine_similarity(train_matrix)
    train_similar_users_df= pd.DataFrame(train_user_similarity, index=train_matrix.index, columns= train_matrix.index)

    if user_id not in train_similar_users_df.index:
        return None 

    top_similar= train_similar_users_df.loc[user_id].sort_values(ascending= False)[1:11]

    
    similar_ratings= train_ratings[train_ratings['user_id'].isin(top_similar.index)].copy()
    similar_ratings['similarity']= similar_ratings['user_id'].map(top_similar)
    similar_ratings['weighted_rating']= similar_ratings['rating'] * similar_ratings['similarity']


    movies_grp= similar_ratings.groupby('movie_id').agg({'weighted_rating':'sum', 'similarity':'sum'})
    movies_grp['pred_score']= movies_grp['weighted_rating']/ movies_grp['similarity']

   
    train_rated_movies= train_user_ratings['movie_id'].tolist()
    pred_recommedations= movies_grp[~movies_grp.index.isin(train_rated_movies)]

    if pred_recommedations.empty:
        return None 

    top_recommendations= pred_recommedations['pred_score'].sort_values(ascending= False).head(n_recommed)

    

    #Evaluation Part : How many matchs between top_recommendations and  user test split
    top_rated_test= test_user_ratings[test_user_ratings['rating'] >= 4]['movie_id'].tolist() 
    union_of_two= set(top_recommendations.index) & set(top_rated_test)
    precision_k= len(union_of_two) / n_recommed

    return {
        'user_id': user_id,
        'precision@k': precision_k,
        'recommended': list(top_recommendations.index),
        'liked_by_user': top_rated_test,
        'count': len(union_of_two)
    }


    

    

    

In [24]:
result = evaluate_recommendations(
    user_id=1,
    ratings_df=ratings,
    movies_df=movies,
    n_recommed=5
)

# print(result)

if result:
    print(f"User {result['user_id']}: Precision@5 = {result['precision@k']:.2f}")
    print("Recommended:", result['recommended'])
    print("Actually liked (test):", result['liked_by_user'])
    print("The number of relevant:", result['count'] )

User 1: Precision@5 = 0.20
Recommended: [343, 736, 1589, 1467, 114]
Actually liked (test): [39, 163, 169, 238, 157, 256, 223, 195, 146, 258, 28, 235, 80, 114, 107, 135, 25, 171, 23, 175, 230, 134, 190, 198, 228, 44, 87, 144, 216, 196]
The number of relevant: 1


# Item Based Filtering

## Compute Item Similarity

In [25]:
item_user_matrix= user_item_matrix.T

item_user_matrix

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,4.0,...,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
2,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
5,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
similar_items= cosine_similarity(item_user_matrix)

item_similarity_df= pd.DataFrame(
    similar_items, 
    index=item_user_matrix.index, 
    columns=item_user_matrix.index
)


item_similarity_df.head()


movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.402382,0.330245,0.454938,0.286714,0.116344,0.620979,0.481114,0.496288,0.273935,...,0.035387,0.0,0.0,0.0,0.035387,0.0,0.0,0.0,0.047183,0.047183
2,0.402382,1.0,0.273069,0.502571,0.318836,0.083563,0.383403,0.337002,0.255252,0.171082,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078299,0.078299
3,0.330245,0.273069,1.0,0.324866,0.212957,0.106722,0.372921,0.200794,0.273669,0.158104,...,0.0,0.0,0.0,0.0,0.032292,0.0,0.0,0.0,0.0,0.096875
4,0.454938,0.502571,0.324866,1.0,0.334239,0.090308,0.489283,0.490236,0.419044,0.252561,...,0.0,0.0,0.094022,0.094022,0.037609,0.0,0.0,0.0,0.056413,0.075218
5,0.286714,0.318836,0.212957,0.334239,1.0,0.037299,0.334769,0.259161,0.272448,0.055453,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094211


### item-based recommendation function

* For a given user, predict ratings for unrated movies using item similarity

* Recommend top-N

In [27]:
# selected_ratings= ratings[ratings['user_id']==1]
# selected_ratings

In [28]:
# selected_movies= dict(zip(selected_ratings['movie_id'], selected_ratings['rating']))
# selected_movies

In [29]:
def recommend_movies_for_user(user_id, ratings_df= ratings, similarity_df= item_similarity_df, n= 10): 
    selected_ratings= ratings_df[ratings_df['user_id']== user_id]

    if selected_ratings.empty:
        return None 

    #{movie_id: rating}
    selected_movies= dict(zip(selected_ratings['movie_id'], selected_ratings['rating']))

    predictions={}

    all_movies= similarity_df.index

    for movie in all_movies: 
        if movie in selected_movies:
            continue
        weighted_sum=0
        similarity_sum = 0 

        for movie_id, rating in selected_movies.items(): 
            if movie_id in similarity_df.columns:
                sim = similarity_df.loc[movie, movie_id]
                if sim > 0: 
                    weighted_sum+= sim * rating 
                    similarity_sum+= sim 
        if similarity_sum > 0: 
            predictions[movie] = weighted_sum / similarity_sum
            
    if not predictions:
        return []
        
    top_movies= sorted(predictions.items(), key=lambda x :x[1], reverse= True)[:n]
    return [ movie for movie, _ in top_movies]

In [30]:
user_id = 1
print(f"\n🎬 Getting recommendations for user {user_id}...")
recommended_ids = recommend_movies_for_user(user_id, n=5)

if recommended_ids:
    # Add movie titles
    movie_titles = movies.set_index('movie_id').loc[recommended_ids, 'title']
    print("\n🌟 Your Recommendations:")
    for i, (movie_id, title) in enumerate(movie_titles.items(), 1):
        print(f"{i}. {title} (ID: {movie_id})")
else:
    print("No recommendations available.")


🎬 Getting recommendations for user 1...

🌟 Your Recommendations:
1. Cyclo (1995) (ID: 1156)
2. Office Killer (1997) (ID: 1601)
3. Little City (1998) (ID: 1656)
4. Death in Brunswick (1991) (ID: 1593)
5. Mamma Roma (1962) (ID: 1674)
