In [1]:
import pandas as pd

# Load the u.data file
file_path = '/Users/anirudhravipudi/Desktop/AI/Practice/ml-100k/u.data'
column_names = ['user_id', 'item_id', 'rating', 'timestamp']

# Load using tab delimiter
df = pd.read_csv(file_path, sep='\t', names=column_names)

# Preview
print(df.head())

   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


In [3]:
# Create user-item matrix (rows: users, columns: movies, values: ratings)
user_item_matrix = df.pivot_table(index='user_id', columns='item_id', values='rating')

# Preview
user_item_matrix.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

user_matrix_filled = user_item_matrix.fillna(0)

user_similarity = cosine_similarity(user_matrix_filled)

user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)


user_similarity_df.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.166931,0.04746,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
2,0.166931,1.0,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.22679,0.161485,0.172268,0.105798
3,0.04746,0.110591,1.0,0.344151,0.021245,0.072415,0.066137,0.08306,0.06104,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.16189,0.101243,0.133416,0.026556
4,0.064358,0.178121,0.344151,1.0,0.031804,0.068044,0.09123,0.18806,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
5,0.378475,0.072979,0.021245,0.031804,1.0,0.237286,0.3736,0.24893,0.056847,0.201427,...,0.338794,0.08058,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941


In [9]:
target_user = 10

# 1. Similarity scores for user 10
similar_users = user_similarity_df[target_user]

# 2. User 10's ratings
user_ratings = user_item_matrix.loc[target_user]

# 3. Predict ratings for unrated items
predicted_ratings = {}

for item in user_item_matrix.columns:
    if pd.isna(user_ratings[item]):  # Only predict for unrated items
        # Get ratings for this item by other users
        item_ratings = user_item_matrix[item]

        # Compute weighted average (only from users who rated the item)
        rated_by = ~item_ratings.isna()
        weights = similar_users[rated_by]
        ratings = item_ratings[rated_by]

        if weights.sum() > 0:
            predicted_score = np.dot(weights, ratings) / weights.sum()
            predicted_ratings[item] = predicted_score

# 4. Recommend top 5 items
top_recommendations = sorted(predicted_ratings.items(), key=lambda x: x[1], reverse=True)[:5]
print("Top 5 Recommendations for User", target_user)
print(top_recommendations)

Top 5 Recommendations for User 10
[(1500, 5.000000000000001), (814, 5.0), (1122, 5.0), (1189, 5.0), (1201, 5.0)]


In [11]:
# Load movie titles
item_path = '/Users/anirudhravipudi/Desktop/AI/Practice/ml-100k/u.item'
movie_titles = pd.read_csv(item_path, sep='|', encoding='latin-1', header=None, usecols=[0, 1], names=['item_id', 'title'])

# Convert top recommendations to DataFrame
recommended_ids = [item[0] for item in top_recommendations]
recommended_df = movie_titles[movie_titles['item_id'].isin(recommended_ids)]

print("🎬 Recommended Movies for User 10:")
print(recommended_df)

🎬 Recommended Movies for User 10:
      item_id                                       title
813       814               Great Day in Harlem, A (1994)
1121     1122              They Made Me a Criminal (1939)
1188     1189                          Prefontaine (1997)
1200     1201  Marlene Dietrich: Shadow and Light (1996) 
1499     1500                   Santa with Muscles (1996)


In [13]:
from sklearn.metrics.pairwise import cosine_similarity

# Fill NaNs with 0 for similarity calc
item_matrix_filled = user_item_matrix.T.fillna(0)  # Transpose so items are rows

# Compute cosine similarity between items
item_similarity = cosine_similarity(item_matrix_filled)

# Create DataFrame
item_similarity_df = pd.DataFrame(item_similarity, 
                                  index=user_item_matrix.columns, 
                                  columns=user_item_matrix.columns)

# Preview
item_similarity_df.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.402382,0.330245,0.454938,0.286714,0.116344,0.620979,0.481114,0.496288,0.273935,...,0.035387,0.0,0.0,0.0,0.035387,0.0,0.0,0.0,0.047183,0.047183
2,0.402382,1.0,0.273069,0.502571,0.318836,0.083563,0.383403,0.337002,0.255252,0.171082,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078299,0.078299
3,0.330245,0.273069,1.0,0.324866,0.212957,0.106722,0.372921,0.200794,0.273669,0.158104,...,0.0,0.0,0.0,0.0,0.032292,0.0,0.0,0.0,0.0,0.096875
4,0.454938,0.502571,0.324866,1.0,0.334239,0.090308,0.489283,0.490236,0.419044,0.252561,...,0.0,0.0,0.094022,0.094022,0.037609,0.0,0.0,0.0,0.056413,0.075218
5,0.286714,0.318836,0.212957,0.334239,1.0,0.037299,0.334769,0.259161,0.272448,0.055453,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094211


In [15]:
# Movie liked by the user (e.g., item_id = 50)
liked_item_id = 50

# Find top 5 most similar items (excluding the item itself)
similar_items = item_similarity_df[liked_item_id].sort_values(ascending=False)[1:6]

# Map to movie titles
similar_titles = movie_titles[movie_titles['item_id'].isin(similar_items.index)]

print(f"🎥 Because you liked item {liked_item_id}, you might also like:")
print(similar_titles)

🎥 Because you liked item 50, you might also like:
     item_id                            title
0          1                 Toy Story (1995)
126      127            Godfather, The (1972)
171      172  Empire Strikes Back, The (1980)
173      174   Raiders of the Lost Ark (1981)
180      181        Return of the Jedi (1983)
