In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from google.colab import files
files.upload()  # Upload kaggle.json from your Kaggle account


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"aasiasalahuddin","key":"ce88831f2f968804e69d620212e8e399"}'}

In [3]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d prajitdatta/movielens-100k-dataset
!unzip -o movielens-100k-dataset.zip -d movielens


Dataset URL: https://www.kaggle.com/datasets/prajitdatta/movielens-100k-dataset
License(s): CC0-1.0
Downloading movielens-100k-dataset.zip to /content
  0% 0.00/4.77M [00:00<?, ?B/s]
100% 4.77M/4.77M [00:00<00:00, 487MB/s]
Archive:  movielens-100k-dataset.zip
  inflating: movielens/ml-100k/README  
  inflating: movielens/ml-100k/allbut.pl  
  inflating: movielens/ml-100k/mku.sh  
  inflating: movielens/ml-100k/u.data  
  inflating: movielens/ml-100k/u.genre  
  inflating: movielens/ml-100k/u.info  
  inflating: movielens/ml-100k/u.item  
  inflating: movielens/ml-100k/u.occupation  
  inflating: movielens/ml-100k/u.user  
  inflating: movielens/ml-100k/u1.base  
  inflating: movielens/ml-100k/u1.test  
  inflating: movielens/ml-100k/u2.base  
  inflating: movielens/ml-100k/u2.test  
  inflating: movielens/ml-100k/u3.base  
  inflating: movielens/ml-100k/u3.test  
  inflating: movielens/ml-100k/u4.base  
  inflating: movielens/ml-100k/u4.test  
  inflating: movielens/ml-100k/u5.base  
 

In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score


In [8]:
# Load u.data
ratings = pd.read_csv(
    'movielens/ml-100k/u.data',
    sep='\t',
    names=['user_id', 'movie_id', 'rating', 'timestamp']
)

# Load u.item (note: encoding and delimiter)
movies = pd.read_csv(
    'movielens/ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    usecols=[0, 1],
    names=['movie_id', 'title']
)

print(ratings.head())
print(movies.head())


   user_id  movie_id  rating  timestamp
0      196       242       3  881250949
1      186       302       3  891717742
2       22       377       1  878887116
3      244        51       2  880606923
4      166       346       1  886397596
   movie_id              title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)


In [9]:
#merge rattings with movie titles
data = pd.merge(ratings, movies, on='movie_id')
print(data.head())


   user_id  movie_id  rating  timestamp                       title
0      196       242       3  881250949                Kolya (1996)
1      186       302       3  891717742    L.A. Confidential (1997)
2       22       377       1  878887116         Heavyweights (1994)
3      244        51       2  880606923  Legends of the Fall (1994)
4      166       346       1  886397596         Jackie Brown (1997)


In [10]:
#create user-item matrix
user_movie_matrix = data.pivot_table(index='user_id', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)


In [11]:
#user based colaborative filtering
# Cosine similarity between users
user_similarity = cosine_similarity(user_movie_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_movie_matrix.index, columns=user_movie_matrix.index)


In [12]:
#recomend movide to given user_id
def get_user_recommendations(user_id, top_n=5):
    sim_scores = user_similarity_df[user_id]
    weighted_sum = user_movie_matrix.T.dot(sim_scores) / sim_scores.sum()

    watched = user_movie_matrix.loc[user_id][user_movie_matrix.loc[user_id] > 0].index
    recommendations = weighted_sum.drop(watched)

    return recommendations.sort_values(ascending=False).head(top_n)

# Example: Recommend for user 10
get_user_recommendations(10)


Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
Return of the Jedi (1983),2.612765
"Empire Strikes Back, The (1980)",2.370932
Back to the Future (1985),2.125414
Schindler's List (1993),2.121701
"Fugitive, The (1993)",2.093849


In [13]:
#bonus item based collaborative filtering
item_similarity = cosine_similarity(user_movie_matrix.T)
item_similarity_df = pd.DataFrame(item_similarity, index=user_movie_matrix.columns, columns=user_movie_matrix.columns)

def get_similar_movies(movie_title, top_n=5):
    sim_scores = item_similarity_df[movie_title].sort_values(ascending=False)
    return sim_scores.drop(movie_title).head(top_n)

# Example
get_similar_movies('Star Wars (1977)')


Unnamed: 0_level_0,Star Wars (1977)
title,Unnamed: 1_level_1
Return of the Jedi (1983),0.884476
Raiders of the Lost Ark (1981),0.764885
"Empire Strikes Back, The (1980)",0.749819
Toy Story (1995),0.734572
"Godfather, The (1972)",0.697332


In [17]:
#matrix factorization (SVD)
from scipy.sparse.linalg import svds

R = user_movie_matrix.values
user_mean = R.mean(axis=1)
R_demeaned = R - user_mean.reshape(-1, 1)

U, sigma, Vt = svds(R_demeaned, k=20)
sigma = np.diag(sigma)

predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_mean.reshape(-1, 1)
pred_df = pd.DataFrame(predicted_ratings, index=user_movie_matrix.index, columns=user_movie_matrix.columns)

pred_df.head()  # shows the top 5 rows


title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.057184,0.053842,0.111068,0.662821,0.192686,1.199187,0.56162,3.841368,0.015409,-0.068526,...,0.013636,0.106801,0.055012,2.9779,1.670341,0.769473,0.678974,0.001204,0.343793,0.067502
2,0.09294,-0.011182,0.192804,0.151837,0.158019,0.526141,-0.258056,-0.588151,0.007454,-0.164739,...,-0.021153,0.022966,-0.023067,-0.026696,-0.095007,-0.030447,0.247541,0.007718,0.078965,-0.005733
3,0.053003,0.005646,-0.171288,0.102526,0.559745,0.200579,0.013494,0.152834,0.017592,-0.02167,...,0.005816,0.050959,0.008538,-0.098832,0.061691,0.041719,0.211886,-0.004864,0.056028,0.019733
4,-0.008396,0.006927,-0.092049,0.221989,0.305568,-0.157194,-0.020434,0.050642,0.017351,0.087369,...,0.023377,0.018197,0.018144,-0.05106,0.166685,0.108166,0.064486,0.004793,0.001875,0.022426
5,-0.047347,0.109362,0.854782,0.128758,-0.21294,-0.24785,0.995233,1.887015,0.052038,0.3417,...,0.055281,0.167215,0.051861,3.922505,0.874194,0.38244,0.246983,0.08266,0.281849,0.015072


In [18]:
print(pred_df)  # prints the entire predicted ratings matrix


title    'Til There Was You (1997)  1-900 (1994)  101 Dalmatians (1996)  \
user_id                                                                   
1                         0.057184      0.053842               0.111068   
2                         0.092940     -0.011182               0.192804   
3                         0.053003      0.005646              -0.171288   
4                        -0.008396      0.006927              -0.092049   
5                        -0.047347      0.109362               0.854782   
...                            ...           ...                    ...   
939                       0.068416      0.034204               1.001026   
940                      -0.087270     -0.010785               0.089723   
941                      -0.046141      0.020292               0.129680   
942                       0.130385      0.010420               0.101938   
943                       0.087220     -0.020097               0.116877   

title    12 Angry Men (1

In [19]:
#recomend with svd
def recommend_svd(user_id, top_n=5):
    user_row = pred_df.loc[user_id]
    watched = user_movie_matrix.loc[user_id][user_movie_matrix.loc[user_id] > 0].index
    recommendations = user_row.drop(watched)

    return recommendations.sort_values(ascending=False).head(top_n)

recommend_svd(10)


Unnamed: 0_level_0,10
title,Unnamed: 1_level_1
Annie Hall (1977),3.969957
"Godfather: Part II, The (1974)",3.861418
To Kill a Mockingbird (1962),3.826994
Schindler's List (1993),3.789543
Babe (1995),3.334892


In [20]:
#save models
import joblib

joblib.dump(user_similarity_df, 'user_similarity.pkl')
joblib.dump(pred_df, 'svd_prediction.pkl')


['svd_prediction.pkl']