In [97]:
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings('ignore')

## Exploring Data 

In [3]:
# Load the ml-100k ratings file. It is tab-separated and the column names are
# supplied here rather than read from the file.
ratings = pd.read_csv(
    '../data/ml-100k/u.data',
    sep='\t',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
# Load the movie catalogue. Only the first two pipe-separated fields of u.item
# are read and they are named movie_id and title.
movies = pd.read_csv(
    '../data/ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    usecols=[0, 1],
    names=['movie_id', 'title'],
)
movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [6]:
# (rows, columns) of each table, one per line.
print(ratings.shape, movies.shape, sep='\n')

(100000, 4)
(1682, 2)


In [7]:
# Column dtypes and non-null counts for the ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   movie_id   100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [8]:
# Column dtypes and non-null counts for the movies table.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  1682 non-null   int64 
 1   title     1682 non-null   object
dtypes: int64(1), object(1)
memory usage: 26.4+ KB


## BUILD USER-ITEM MATRIX

In [9]:
# One row per user, one column per movie; a cell holds that user's rating and is
# NaN where the user has not rated the movie.
user_item_matrix = ratings.pivot(
    index='user_id',
    columns='movie_id',
    values='rating',
)

# Missing ratings become 0 so the matrix is fully numeric for the similarity step.
filled_user_item_matrix = user_item_matrix.fillna(0)
filled_user_item_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# (number of users, number of movies) in the filled matrix.
filled_user_item_matrix.shape

(943, 1682)

## COMPUTE USER SIMILARITY

In [14]:
# Pairwise cosine similarity between the rows (users) of the filled matrix.
similar_users= cosine_similarity(filled_user_item_matrix)
print(similar_users.shape) 


(943, 943)


In [43]:
# Label both axes with the user ids so a pair of users can be looked up by id
# (e.g. similar_users_df.loc[a, b]) instead of by matrix position.
similar_users_df= pd.DataFrame(similar_users, 
                               index=filled_user_item_matrix.index, 
                               columns=filled_user_item_matrix.index
                              )

similar_users_df.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.166931,0.04746,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
2,0.166931,1.0,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.22679,0.161485,0.172268,0.105798
3,0.04746,0.110591,1.0,0.344151,0.021245,0.072415,0.066137,0.08306,0.06104,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.16189,0.101243,0.133416,0.026556
4,0.064358,0.178121,0.344151,1.0,0.031804,0.068044,0.09123,0.18806,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
5,0.378475,0.072979,0.021245,0.031804,1.0,0.237286,0.3736,0.24893,0.056847,0.201427,...,0.338794,0.08058,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941


In [48]:
# Ten users most similar to user 1, highest similarity first.
# User 1 is removed by label rather than by slicing off the first sorted row:
# the positional slice only works when the user's own entry is ranked first,
# which is not guaranteed when another user ties with the self-similarity.
top_10_similar_users = (
    similar_users_df.loc[1]
    .drop(labels=1)
    .sort_values(ascending=False)
    .head(10)
)
top_10_similar_users

user_id
916    0.569066
864    0.547548
268    0.542077
92     0.540534
435    0.538665
457    0.538476
738    0.527031
429    0.525950
303    0.525718
276    0.524523
Name: 1, dtype: float64

Next, we look at which movies those top 10 similar users rated highly,
and recommend the ones that user 1 hasn't seen yet but similar users loved.

## GENERATING RECOMMENDATIONS

Generate movie recommendations for user 1 based on what similar users liked


In [65]:
#Get Ratings from Similar Users
# Keep only the rows of `ratings` whose author is one of user 1's ten nearest neighbours.
similar_users_movie_ratings= ratings[ratings['user_id'].isin(top_10_similar_users.index)]
similar_users_movie_ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
15,303,785,3,879485318
38,276,796,1,874791932
43,276,564,3,874791805
72,92,1049,1,890251826
83,276,54,3,874791025
...,...,...,...,...
99839,303,363,1,879485134
99843,916,148,2,880843892
99963,429,199,5,882386006
99980,864,685,4,888891900


In [66]:
# Attach to every neighbour rating the similarity between its author and user 1.
# `assign` returns a new frame, so the filtered ratings from the previous cell
# are left unchanged.
similar_ratings = similar_users_movie_ratings.assign(
    similarity=similar_users_movie_ratings['user_id'].map(top_10_similar_users)
)

similar_ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,similarity
15,303,785,3,879485318,0.525718
38,276,796,1,874791932,0.524523
43,276,564,3,874791805,0.524523
72,92,1049,1,890251826,0.540534
83,276,54,3,874791025,0.524523


In [77]:
# Compute the pieces of a similarity-weighted average rating for each movie.

# Scale every rating by how similar its author is to user 1.
similar_ratings['weighted_rating'] = similar_ratings['rating'].mul(similar_ratings['similarity'])

# Per movie: total of the weighted ratings (numerator) and total of the
# similarity weights (denominator).
grouped_movies = similar_ratings.groupby('movie_id')[['weighted_rating', 'similarity']].sum()

grouped_movies


Unnamed: 0_level_0,weighted_rating,similarity
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,23.113803,5.379586
2,15.591989,4.841110
3,8.067888,3.225997
4,21.518345,5.379586
5,9.761857,3.246052
...,...,...
1531,1.642645,0.547548
1545,1.051900,0.525950
1552,1.615994,0.538665
1597,1.707197,0.569066


In [78]:
# Similarity-weighted mean rating per movie: weighted sum divided by the sum of weights.
grouped_movies['recommendation_score'] = grouped_movies['weighted_rating'].div(grouped_movies['similarity'])
grouped_movies

Unnamed: 0_level_0,weighted_rating,similarity,recommendation_score
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,23.113803,5.379586,4.296576
2,15.591989,4.841110,3.220747
3,8.067888,3.225997,2.500897
4,21.518345,5.379586,4.000000
5,9.761857,3.246052,3.007301
...,...,...,...
1531,1.642645,0.547548,3.000000
1545,1.051900,0.525950,2.000000
1552,1.615994,0.538665,3.000000
1597,1.707197,0.569066,3.000000


In [79]:
# Movie ids that user 1 has already rated; these are excluded from the candidates below.
user1_movies = ratings.loc[ratings['user_id'] == 1, 'movie_id']
user1_movies

202       61
305      189
333       33
334      160
478       20
        ... 
92049     28
92487    172
94019    122
96699    152
99073     94
Name: movie_id, Length: 272, dtype: int64

In [80]:
# Candidate movies: every scored movie whose id is not among user 1's rated movies.
recommended_movies = grouped_movies.loc[~grouped_movies.index.isin(user1_movies)]
recommended_movies

Unnamed: 0_level_0,weighted_rating,similarity,recommendation_score
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
273,16.705670,4.314079,3.872360
274,5.313552,1.591006,3.339743
275,6.986373,1.611974,4.334048
276,16.701287,3.771814,4.427919
277,3.680953,1.051668,3.500110
...,...,...,...
1531,1.642645,0.547548,3.000000
1545,1.051900,0.525950,2.000000
1552,1.615994,0.538665,3.000000
1597,1.707197,0.569066,3.000000


In [82]:
# Ten candidates with the highest recommendation score.
# NOTE(review): the displayed scores are all 5.0, so the order among them comes
# from how the sort breaks ties rather than from the data — confirm whether a
# tie-breaker (e.g. number of neighbour ratings) is wanted.
top_10= recommended_movies['recommendation_score'].sort_values(ascending=False).head(10)
top_10

movie_id
1168    5.0
313     5.0
1009    5.0
1019    5.0
1007    5.0
963     5.0
853     5.0
855     5.0
736     5.0
632     5.0
Name: recommendation_score, dtype: float64

In [83]:
# Look the selected ids up in a title series keyed by movie_id, keeping the ranking order.
recommended_movies_titles = movies.set_index('movie_id')['title'].loc[top_10.index]
recommended_movies_titles

movie_id
1168                         Little Buddha (1993)
313                                Titanic (1997)
1009                       Stealing Beauty (1996)
1019    Die xue shuang xiong (Killer, The) (1989)
1007                   Waiting for Guffman (1996)
963       Some Folks Call It a Sling Blade (1993)
853                              Braindead (1992)
855                                   Diva (1981)
736                            Shadowlands (1993)
632                        Sophie's Choice (1982)
Name: title, dtype: object

In [84]:
# Just the titles as a NumPy array, without the movie_id index.
recommended_movies_titles.values

array(['Little Buddha (1993)', 'Titanic (1997)', 'Stealing Beauty (1996)',
       'Die xue shuang xiong (Killer, The) (1989)',
       'Waiting for Guffman (1996)',
       'Some Folks Call It a Sling Blade (1993)', 'Braindead (1992)',
       'Diva (1981)', 'Shadowlands (1993)', "Sophie's Choice (1982)"],
      dtype=object)

### Make a function to recommend movies


So far I have only generated recommendations for user 1; we need to generalize this so it works for any user.

In [93]:
def recommend_movies(user_id, num_recommendations, users_df= similar_users_df, ratings_df= ratings, movies_df= movies, n_neighbors=10):
    """Recommend unseen movies to a user from the ratings of their most similar users.

    Each candidate movie is scored with the similarity-weighted mean of the
    ratings given to it by the user's ``n_neighbors`` nearest neighbours.
    Movies the user has already rated are excluded.

    Parameters
    ----------
    user_id : label
        Id of the user to recommend for; must be a row label of ``users_df``.
    num_recommendations : int
        Maximum number of titles to return.
    users_df : pandas.DataFrame
        User-by-user similarity table, indexed and columned by user id.
    ratings_df : pandas.DataFrame
        Ratings with ``user_id``, ``movie_id`` and ``rating`` columns.
    movies_df : pandas.DataFrame
        Movie catalogue with ``movie_id`` and ``title`` columns.
    n_neighbors : int, default 10
        Number of most-similar users whose ratings are used.

    Returns
    -------
    numpy.ndarray
        Titles of the recommended movies, best predicted rating first.

    Raises
    ------
    KeyError
        If ``user_id`` is not in ``users_df`` or a recommended movie id is
        missing from ``movies_df``.
    """
    # Use the frames passed in, not the notebook globals, so callers can supply
    # their own data (e.g. a training split).
    # The user is dropped by label: slicing off the first sorted row is only
    # correct when the user's own similarity happens to be ranked first.
    neighbour_similarity = (
        users_df.loc[user_id]
        .drop(labels=user_id, errors='ignore')
        .sort_values(ascending=False)
        .head(n_neighbors)
    )

    # Ratings written by the neighbours, each tagged with its author's similarity.
    neighbour_ratings = ratings_df[ratings_df['user_id'].isin(neighbour_similarity.index)].copy()
    neighbour_ratings['similarity'] = neighbour_ratings['user_id'].map(neighbour_similarity)
    neighbour_ratings['weighted_rating'] = neighbour_ratings['rating'] * neighbour_ratings['similarity']

    # Weighted mean per movie = sum(rating * similarity) / sum(similarity).
    movie_scores = neighbour_ratings.groupby('movie_id').agg({
        'weighted_rating': 'sum',
        'similarity': 'sum'
    })
    movie_scores['predicted_rating'] = movie_scores['weighted_rating'] / movie_scores['similarity']

    # Drop everything the target user has already rated.
    already_rated = ratings_df.loc[ratings_df['user_id'] == user_id, 'movie_id']
    candidates = movie_scores[~movie_scores.index.isin(already_rated)]

    top_movies = candidates['predicted_rating'].sort_values(ascending=False).head(num_recommendations)
    top_movies_titles = movies_df.set_index('movie_id').loc[top_movies.index, 'title']

    return top_movies_titles.values

    

In [96]:
# Testing the function
# Each printed label names the user id that is actually passed to the call below it.

print("User 1:")
print(recommend_movies(1, 5))

print("\nUser 42:")
print(recommend_movies(42, 4))

print("\nUser 900:")
print(recommend_movies(900, 3))

User 1:
['Little Buddha (1993)' 'Titanic (1997)' 'Stealing Beauty (1996)'
 'Die xue shuang xiong (Killer, The) (1989)' 'Waiting for Guffman (1996)']

User 10:
['Live Nude Girls (1995)' 'Charade (1963)'
 'Once Upon a Time in the West (1969)' 'Screamers (1995)']

User 100:
['Get Shorty (1995)' 'Bread and Chocolate (Pane e cioccolata) (1973)'
 'My Man Godfrey (1936)']


### Evaluation Function 

In [98]:
def evaluate_recommendations(user_id, ratings_df= ratings, movies_df= movies, n_recommed=5, n_neighbors=10, min_relevant_rating=4):
    """Score top-N recommendations for one user against a held-out part of their ratings.

    20% of the user's ratings are held out (fixed seed 42). User-user cosine
    similarity is rebuilt from all remaining ratings, the ``n_recommed``
    best-scoring movies the user has not rated in the training data are
    selected, and they are compared with the held-out movies the user rated at
    least ``min_relevant_rating``.

    Parameters
    ----------
    user_id : label
        User to evaluate.
    ratings_df : pandas.DataFrame
        Ratings with ``user_id``, ``movie_id`` and ``rating`` columns. Its index
        is used to identify the held-out rows, so it is assumed to be unique.
    movies_df : pandas.DataFrame
        Movie catalogue with ``movie_id`` and ``title`` columns.
    n_recommed : int, default 5
        Number of recommendations to evaluate.
    n_neighbors : int, default 10
        Number of most-similar users used for scoring.
    min_relevant_rating : number, default 4
        Held-out ratings at or above this value count as relevant.

    Returns
    -------
    dict or None
        ``None`` when the user has fewer than 5 ratings or does not appear in
        the training matrix. Otherwise a dict with keys ``user_id``,
        ``recommended_movie_ids``, ``recommended_titles``, ``hits``,
        ``precision`` (hits / number recommended) and ``recall``
        (hits / number of relevant held-out movies). A ratio whose denominator
        is zero is reported as 0.0.
    """
    selected_user_ratings = ratings_df[ratings_df['user_id'] == user_id]

    # Too few ratings to hold any out.
    if len(selected_user_ratings) < 5:
        return None

    # Only the held-out part is needed; the training part is rebuilt below from
    # the full ratings table so that other users' ratings are included.
    _, test_user_ratings = train_test_split(selected_user_ratings, test_size=0.2, random_state=42)

    # Training data = every rating except this user's held-out rows.
    test_indices = test_user_ratings.index
    train_ratings = ratings_df[~ratings_df.index.isin(test_indices)]

    train_matrix = train_ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

    train_user_similarity = cosine_similarity(train_matrix)
    train_similar_users_df = pd.DataFrame(train_user_similarity, index=train_matrix.index, columns=train_matrix.index)

    if user_id not in train_similar_users_df.index:
        return None

    # Exclude the user by label; relying on their own row being ranked first
    # breaks when another user ties with the self-similarity.
    top_similar = (
        train_similar_users_df.loc[user_id]
        .drop(labels=user_id)
        .sort_values(ascending=False)
        .head(n_neighbors)
    )

    similar_ratings = train_ratings[train_ratings['user_id'].isin(top_similar.index)].copy()
    similar_ratings['similarity'] = similar_ratings['user_id'].map(top_similar)
    similar_ratings['weighted_rating'] = similar_ratings['rating'] * similar_ratings['similarity']

    # Similarity-weighted mean rating per movie.
    movies_grp = similar_ratings.groupby('movie_id').agg({'weighted_rating': 'sum', 'similarity': 'sum'})
    movies_grp['pred_score'] = movies_grp['weighted_rating'] / movies_grp['similarity']

    # Held-out movies are not in train_ratings, so they remain eligible here.
    train_rated_movies = train_ratings[train_ratings['user_id'] == user_id]['movie_id']
    pred_recommedations = movies_grp[~movies_grp.index.isin(train_rated_movies)]

    top_predictions = pred_recommedations['pred_score'].sort_values(ascending=False).head(n_recommed)
    recommended_ids = top_predictions.index.tolist()

    # Relevant = held-out movies the user rated at or above the threshold.
    relevant_mask = test_user_ratings['rating'] >= min_relevant_rating
    relevant_ids = set(test_user_ratings.loc[relevant_mask, 'movie_id'])
    hit_count = sum(1 for movie_id in recommended_ids if movie_id in relevant_ids)

    # reindex keeps the ranking order and yields NaN for an id missing from movies_df.
    recommended_titles = movies_df.set_index('movie_id')['title'].reindex(recommended_ids)

    return {
        'user_id': user_id,
        'recommended_movie_ids': recommended_ids,
        'recommended_titles': recommended_titles.tolist(),
        'hits': hit_count,
        'precision': hit_count / len(recommended_ids) if recommended_ids else 0.0,
        'recall': hit_count / len(relevant_ids) if relevant_ids else 0.0,
    }

    

    

In [None]:
_sc