In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

org_ratings = pd.read_csv('/content/drive/MyDrive/MIR-Phase3-2/ratings.csv', sep='\t', encoding='latin-1',
                      usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'])

org_movies = pd.read_csv('/content/drive/MyDrive/MIR-Phase3-2/movies.csv', sep='\t', encoding='latin-1',
                     usecols=['movie_id', 'title', 'genres'])

In [3]:
org_movies.head(5)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
org_ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id
0,1,1193,5,0,1192
1,1,661,3,0,660
2,1,914,3,0,913
3,1,3408,4,0,3407
4,1,2355,5,0,2354


### Content Based

**Objective**: Build a Content-Based Recommendation system that computes similarity between movies based on movie genres. It will suggest movies that are most similar to a particular movie based on its genre.

**Dataset**:
- **Movies Data**: DataFrame with columns `movie_id`, `title`, and `genres`.

**Steps**:

1. **Preprocess Data**:
   - Split the `genres` column into separate genre strings.
   - Fill any missing values in the `genres` column and convert it to string.

2. **Compute TF-IDF Matrix**:
   - Use `TfidfVectorizer` to transform the genres into a TF-IDF matrix.

3. **Calculate Cosine Similarity**:
   - Compute cosine similarity between the TF-IDF matrices of movies.

4. **Recommend Movies**:
   - Create a function that gets movie recommendations based on the cosine similarity score of movie genres.

In [5]:
movies = org_movies.copy(deep=True)
movies['genres'] = movies['genres'].str.split('|')
movies['genres'] = movies['genres'].fillna("").astype('str')

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])

In [7]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [8]:
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]

In [9]:
# Example
get_recommendations('GoldenEye (1995)')

345     Clear and Present Danger (1994)
543           Surviving the Game (1994)
724                    Rock, The (1996)
788                     Daylight (1996)
825               Chain Reaction (1996)
978                 Maximum Risk (1996)
1467                    Anaconda (1997)
1513                     Con Air (1997)
1693                   Firestorm (1998)
3686          Perfect Storm, The (2000)
Name: title, dtype: object


### Collaborative Filtering

**Objective**: Implement a collaborative filtering movie recommendation system to recommend movies to a user based on similar users' spreferences.

**Dataset**:

- **Ratings Data**: DataFrame with columns `user_id`, `movie_id`, and `rating`.
- **Movies Data**: DataFrame with columns `movie_id`, `title`, and `genres`.

**Steps**:

1. **Preprocess Data**:
   - Merge `ratings` and `movies` DataFrames on `movie_id`.
   - Split `genres` column into separate rows.

2. **Create User-Genre Matrix**:
   - Create a matrix where rows are users and columns are genres.
   - Each cell represents the ratio of movies watched by the user in that genre to the total movies watched by the user.

3. **Calculate User Similarity**:
   - Use cosine similarity to calculate the similarity between users.

4. **Find Top N Similar Users**:
   - Find the top N users with the most similar preferences to the given user.

5. **Recommend Movies**:
   - Recommend movies that similar users have watched but the target user has not.
   - Return titles and genres of the top 10 recommended movies.

In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
data = pd.merge(org_ratings, org_movies, on='movie_id')

In [12]:
data.head(5)

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id,title,genres
0,1,1193,5,0,1192,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,1,1192,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,11,1192,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,14,1192,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,16,1192,One Flew Over the Cuckoo's Nest (1975),Drama


In [13]:
data = data.assign(genres=data['genres'].str.split('|')).explode('genres')

In [14]:
data.head(5)

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id,title,genres
0,1,1193,5,0,1192,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,1,1192,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,11,1192,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,14,1192,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,16,1192,One Flew Over the Cuckoo's Nest (1975),Drama


In [15]:
grouped = data.groupby(['user_id', 'genres']).size().reset_index(name='count')

In [16]:
grouped.head()

Unnamed: 0,user_id,genres,count
0,1,Action,5
1,1,Adventure,5
2,1,Animation,18
3,1,Children's,20
4,1,Comedy,14


In [55]:
user_genres = grouped.pivot(index='user_id', columns='genres', values='count').fillna(0)
genre_columns = user_genres.columns.difference(['user_id'])
total = user_genres[genre_columns].sum(axis=1)
user_genres[genre_columns] = user_genres[genre_columns].div(total, axis=0)
user_genres.head()

genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0.043103,0.043103,0.155172,0.172414,0.120690,0.017241,0.000000,0.181034,0.025862,0.000000,0.000000,0.120690,0.000000,0.051724,0.025862,0.025862,0.017241,0.000000
2,0.194444,0.065972,0.000000,0.000000,0.086806,0.041667,0.000000,0.274306,0.003472,0.003472,0.006944,0.000000,0.010417,0.083333,0.059028,0.107639,0.052083,0.010417
3,0.186992,0.203252,0.024390,0.024390,0.243902,0.000000,0.000000,0.065041,0.016260,0.000000,0.024390,0.008130,0.008130,0.040650,0.048780,0.040650,0.016260,0.048780
4,0.327586,0.103448,0.000000,0.017241,0.000000,0.017241,0.000000,0.103448,0.034483,0.000000,0.051724,0.000000,0.000000,0.034483,0.155172,0.068966,0.051724,0.034483
5,0.088068,0.025568,0.011364,0.017045,0.159091,0.059659,0.017045,0.295455,0.000000,0.008523,0.028409,0.008523,0.022727,0.085227,0.042614,0.110795,0.017045,0.002841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.091070,0.049083,0.020106,0.031934,0.154347,0.031342,0.006505,0.219988,0.011236,0.010053,0.043761,0.018332,0.020106,0.072147,0.099941,0.086931,0.024837,0.008279
6037,0.068627,0.022059,0.002451,0.014706,0.144608,0.044118,0.002451,0.240196,0.009804,0.022059,0.022059,0.009804,0.031863,0.053922,0.095588,0.166667,0.039216,0.009804
6038,0.048780,0.024390,0.073171,0.024390,0.292683,0.000000,0.000000,0.219512,0.000000,0.000000,0.048780,0.000000,0.000000,0.146341,0.024390,0.000000,0.097561,0.000000
6039,0.028881,0.036101,0.046931,0.061372,0.234657,0.007220,0.000000,0.101083,0.018051,0.021661,0.003610,0.151625,0.061372,0.108303,0.028881,0.050542,0.032491,0.007220


In [57]:
user_similarity = cosine_similarity(user_genres)
sim_df = pd.DataFrame(user_similarity, index=user_genres.index.values, columns=user_genres.index.values)
sim_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
1,1.000000,0.593927,0.526427,0.345999,0.683741,0.804139,0.334192,0.634097,0.706181,0.789703,...,0.773599,0.819834,0.407116,0.187620,0.686592,0.721569,0.630877,0.698349,0.780685,0.689392
2,0.593927,1.000000,0.669034,0.770171,0.926928,0.644824,0.800366,0.951974,0.939869,0.763845,...,0.893203,0.772733,0.840031,0.309405,0.845356,0.916011,0.897834,0.707156,0.563432,0.906981
3,0.526427,0.669034,1.000000,0.665649,0.614157,0.672985,0.678774,0.484249,0.620163,0.862466,...,0.806012,0.724738,0.738403,0.176257,0.801695,0.724560,0.604351,0.687051,0.677912,0.621344
4,0.345999,0.770171,0.665649,1.000000,0.527271,0.376672,0.947039,0.584498,0.572512,0.579480,...,0.668976,0.454955,0.974503,0.165156,0.625503,0.636699,0.561714,0.337779,0.280647,0.510784
5,0.683741,0.926928,0.614157,0.527271,1.000000,0.723022,0.581497,0.949545,0.982931,0.817454,...,0.908946,0.888982,0.619386,0.369649,0.894214,0.963219,0.958866,0.828438,0.699646,0.985915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.721569,0.916011,0.724560,0.636699,0.963219,0.762135,0.655598,0.892141,0.950764,0.909171,...,0.968066,0.924787,0.699436,0.350342,0.965984,1.000000,0.960086,0.846474,0.756300,0.966636
6037,0.630877,0.897834,0.604351,0.561714,0.958866,0.651937,0.641761,0.892023,0.957598,0.801094,...,0.911222,0.845329,0.648755,0.456425,0.900900,0.960086,1.000000,0.757283,0.688530,0.940474
6038,0.698349,0.707156,0.687051,0.337779,0.828438,0.845347,0.349516,0.705967,0.799897,0.881496,...,0.853067,0.923806,0.421861,0.173897,0.859083,0.846474,0.757283,1.000000,0.819834,0.861957
6039,0.780685,0.563432,0.677912,0.280647,0.699646,0.907349,0.337471,0.533223,0.675536,0.874097,...,0.815025,0.866314,0.365294,0.288466,0.782290,0.756300,0.688530,0.819834,1.000000,0.710684


In [58]:
def get_top_n_similar_users(user_id, n=5):
    similar_users = sim_df.loc[user_id].sort_values(ascending=False).iloc[1:n+1].index.tolist()
    return similar_users

In [59]:
def recommend_movies(user_id, n_similar_users=5, n_recommendations=10):
    similar_user_ids = get_top_n_similar_users(user_id, n_similar_users)
    similar_users = data[data['user_id'].isin(similar_user_ids)]

    user_movies = data[data['user_id'] == user_id]['movie_id'].unique()

    recommendations = similar_users[~similar_users['movie_id'].isin(user_movies)]
    tops = recommendations['movie_id'].value_counts().head(n_recommendations).index
    recomms = movies[org_movies['movie_id'].isin(tops)]

    return recomms[['title', 'genres']]

In [60]:
# Example
user_id = 1
recommended_movies = recommend_movies(user_id, n_similar_users=5, n_recommendations=10)

In [61]:
recommended_movies

Unnamed: 0,title,genres
33,Babe (1995),"[""Children's"", 'Comedy', 'Drama']"
900,Casablanca (1942),"['Drama', 'Romance', 'War']"
1019,Alice in Wonderland (1951),"['Animation', ""Children's"", 'Musical']"
1262,Fantasia (1940),"['Animation', ""Children's"", 'Musical']"
1575,L.A. Confidential (1997),"['Crime', 'Film-Noir', 'Mystery', 'Thriller']"
2009,"Jungle Book, The (1967)","['Animation', ""Children's"", 'Comedy', 'Musical']"
2011,Lady and the Tramp (1955),"['Animation', ""Children's"", 'Comedy', 'Musical..."
2012,"Little Mermaid, The (1989)","['Animation', ""Children's"", 'Comedy', 'Musical..."
2018,Peter Pan (1953),"['Animation', ""Children's"", 'Fantasy', 'Musical']"
2027,Sleeping Beauty (1959),"['Animation', ""Children's"", 'Musical']"


### SVD (Singular Value Decomposition)




**Objective**: Implement an SVD-based recommendation system to recommend movies to users by decomposing the user-item interaction matrix into latent factors.

**Dataset**:
- **Ratings Data**: DataFrame with columns `user_id`, `movie_id`, and `rating`.
- **Movies Data**: DataFrame with columns `movie_id`, `title`, and `genres`.

**Steps**:

1. **Preprocess Data**:
   - Merge `ratings` and `movies` DataFrames on `movie_id`.
   - Create a user-item matrix where rows represent users, columns represent movies, and the values are the ratings.

2. **Decompose Matrix using SVD**:
   - Apply Singular Value Decomposition (SVD) to decompose the user-item matrix into three matrices: $U$, $\Sigma$, and $V^T$.

3. **Reconstruct Matrix**:
   - Reconstruct the user-item matrix using the top $k$ singular values to reduce dimensionality.

4. **Predict Ratings**:
   - Use the reconstructed matrix to predict ratings for all user-item pairs.

5. **Recommend Movies**:
   - Recommend the top 20 movies with the highest predicted ratings for a given user that the user hasn't rated yet.

In [62]:
ratings = org_ratings.copy(deep=True)
n_users = ratings.user_id.unique().shape[0]
n_movies = ratings.movie_id.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies))

Number of users = 6040 | Number of movies = 3706


Fill na elements.

In [63]:
Ratings = ratings.pivot(index = 'user_id', columns ='movie_id', values = 'rating').fillna(0)
Ratings.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
# calculate rating matrix

In [65]:
from scipy.sparse.linalg import svds
import numpy as np

U, sigma, Vt = svds(Ratings.to_numpy(), k = 50)
sigma = np.diag(sigma)

In [66]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)

In [67]:
preds = pd.DataFrame(all_user_predicted_ratings, columns = Ratings.columns, index=Ratings.index)
preds.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.292556,0.164772,-0.184504,-0.018528,0.021516,-0.181793,-0.102936,0.157111,-0.058523,-0.164231,...,0.032475,0.007048,0.03317,-0.007577,-0.075329,0.394578,0.124714,0.051545,0.057349,0.076743
2,0.754817,0.128981,0.341128,0.00954,0.00183,1.31458,0.077427,0.062218,0.163983,1.514572,...,-0.051376,-0.01578,-0.010918,0.055628,-0.016037,0.166941,-0.421144,-0.106675,-0.04965,-0.126564
3,1.844858,0.473855,0.098573,-0.039309,-0.019895,-0.154038,-0.141531,0.111551,0.035977,0.738427,...,0.049994,0.003596,0.022734,0.043377,0.034825,0.120205,0.086553,0.034783,0.029337,-0.121871
4,0.395484,-0.045487,0.033716,0.08389,0.051561,0.260676,-0.081541,0.023891,0.051932,-0.07717,...,0.010847,0.007518,0.004269,0.009527,-0.07963,0.077845,0.051369,-0.017046,0.01898,-0.04758
5,1.557609,-0.0067,-0.04486,0.249014,-0.04332,1.51814,-0.164306,-0.043626,-0.078771,0.424064,...,0.097708,0.013847,-0.024256,-0.043126,-0.065997,-0.025571,0.517685,0.008508,0.10663,0.221219


In [68]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):

    user_row_number = userID - 1 # User ID starts at 1, not 0
    sorted_user_predictions = predictions.iloc[user_row_number].sort_values(ascending=False)

    user_data = original_ratings[original_ratings['user_id'] == (userID)]

    # merge movies data with user_data
    user_full = user_data.merge(movies, how = 'inner', on = 'movie_id').sort_values(['rating'], ascending=False)


    print(f'User {userID} has already rated {user_full.shape[0]} movies.')
    print(f'Recommending highest {num_recommendations} predicted ratings movies not already rated.')

    # get all not watched movies
    not_watched_movies = movies[~movies['movie_id'].isin(user_full['movie_id'])]

    # merge the not watched movies and predictions
    preds_df = pd.DataFrame(sorted_user_predictions).reset_index()
    not_watched_merged = pd.merge(preds_df, not_watched_movies, how='inner', on = 'movie_id')

    # sort and return first n recommendations
    sorted_preds = not_watched_merged.sort_values(userID, ascending = False)
    recommendations = sorted_preds.iloc[:num_recommendations, :-1]


    return user_full, recommendations

In [69]:
already_rated, predictions = recommend_movies(preds, 4375, org_movies, ratings, 20)

User 4375 has already rated 325 movies.
Recommending highest 20 predicted ratings movies not already rated.


In [70]:
predictions.head(20)

Unnamed: 0,movie_id,4375,title
0,539,3.763169,Sleepless in Seattle (1993)
1,1036,3.475342,Die Hard (1988)
2,2302,3.104131,My Cousin Vinny (1992)
3,1682,2.990546,"Truman Show, The (1998)"
4,1219,2.98969,Psycho (1960)
5,1645,2.935043,"Devil's Advocate, The (1997)"
6,1704,2.851534,Good Will Hunting (1997)
7,3247,2.819256,Sister Act (1992)
8,1625,2.769953,"Game, The (1997)"
9,3107,2.681851,Backdraft (1991)


In [71]:
# Top 20 movies that User 4375 has rated
already_rated.head(20)

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id,title,genres
304,4375,2797,5,4374,2796,Big (1988),Comedy|Fantasy
127,4375,1961,5,4374,1960,Rain Man (1988),Drama
125,4375,349,5,4374,348,Clear and Present Danger (1994),Action|Adventure|Thriller
109,4375,2533,5,4374,2532,Escape from the Planet of the Apes (1971),Action|Sci-Fi
122,4375,3552,5,4374,3551,Caddyshack (1980),Comedy
121,4375,14,5,4374,13,Nixon (1995),Drama
307,4375,527,5,4374,526,Schindler's List (1993),Drama|War
119,4375,318,5,4374,317,"Shawshank Redemption, The (1994)",Drama
118,4375,1923,5,4374,1922,There's Something About Mary (1998),Comedy
309,4375,2004,5,4374,2003,Gremlins 2: The New Batch (1990),Comedy|Horror


#### Evaluation

In [72]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

train_data, test_data = train_test_split(org_ratings, test_size=0.30, random_state=31)

train_ratings = train_data.copy(deep=True).pivot(index = 'user_id', columns ='movie_id', values = 'rating').fillna(0)
R_train = train_ratings.values
user_ratings_mean_train = np.mean(R_train, axis = 1)
Ratings_demeaned_train = R_train - user_ratings_mean_train.reshape(-1, 1)

U_train, sigma_train, Vt_train = svds(R_train, k = 50)
sigma_train = np.diag(sigma_train)

all_user_predicted_ratings_train = np.dot(np.dot(U_train, sigma_train), Vt_train)
preds_train = pd.DataFrame(all_user_predicted_ratings_train, columns = train_ratings.columns, index=train_ratings.index)

def predict_rating(user_id, movie_id):
    try:
      pred = preds_train.loc[user_id][movie_id]
      return pred
    except:
      return 0
test_data['predicted_rating'] = test_data.apply(lambda x: predict_rating(x['user_id'], x['movie_id']), axis=1)

test_data.dropna(inplace=True)

rmse = sqrt(mean_squared_error(test_data['rating'], test_data['predicted_rating']))
print(f'Root Mean Squared Error: {rmse}')


Root Mean Squared Error: 2.890406184182697


### Nueral Network Model (Recommender Model)



**Objective**: Implement a Recommender model to recommend movies to a user based on similar users' preferences.

**Dataset**:
- **Ratings Data**: DataFrame with columns `user_id`, `movie_id`, and `rating`.
- **Movies Data**: DataFrame with columns `movie_id`, `title`, and `genres`.

**Steps**:

1. **Define Dataset and DataLoader**:
   - Create a custom PyTorch `Dataset` for ratings.
   - Create a DataLoader for batching and shuffling data.

2. **Define the Neural Network**:
   - Create a neural network with embedding layers for users and movies.

3. **Train the Model**:
   - Train the model using Mean Squared Error loss and Adam optimizer.
   - Save model checkpoints.

4. **Evaluate the Model**:
   - Calculate RMSE on the entire dataset.

5. **Predict Ratings for Unrated Movies**:
   - Predict and recommend top 10 unrated movies for a given user.

In [73]:
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from math import sqrt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [74]:
ratings = org_ratings.copy(deep=True)
unique_users = ratings['user_id'].unique()
unique_movies = ratings['movie_id'].unique()
user_to_index = {user: idx for idx, user in enumerate(unique_users)}
movie_to_index = {movie: idx for idx, movie in enumerate(unique_movies)}
ratings['user_id'] = ratings['user_id'].map(user_to_index)
ratings['movie_id'] = ratings['movie_id'].map(movie_to_index)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id
0,0,0,5,0,1192
1,0,1,3,0,660
2,0,2,3,0,913
3,0,3,4,0,3407
4,0,4,5,0,2354


In [75]:
# Define the dataset
class RatingsDataset(Dataset):
    def __init__(self, ratings):
        self.ratings = ratings

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.ratings.iloc[idx]['user_id'], self.ratings.iloc[idx]['movie_id'], self.ratings.iloc[idx]['rating']

# Define the neural network
class RecommenderNet(nn.Module):
    def __init__(self, num_users, num_movies, embedding_size=10):
        super(RecommenderNet, self).__init__()
        # Embeddings for users and movies
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.movie_embedding = nn.Embedding(num_movies, embedding_size)

        self.fc = nn.Linear(embedding_size * 2, 1)

    def forward(self, user, movie):
        # Get embeddings for users and movies
        user_emb = self.user_embedding(user)
        movie_emb = self.movie_embedding(movie)
        x = torch.cat([user_emb, movie_emb], dim=1)

        x = self.fc(x)

        return x.squeeze()

# Create the dataset and dataloader
train_data, val_data = train_test_split(ratings, test_size=0.2, random_state=42)
train_dataset = RatingsDataset(train_data)
val_dataset = RatingsDataset(val_data)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Initialize the model, loss function and optimizer
model = RecommenderNet(len(unique_users), len(unique_movies), embedding_size=50)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Training loop
num_epochs = 3

for epoch in range(num_epochs):

# Train the model
  model.train()
  train_loss = 0.0

  for user, movie, rating in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
      optimizer.zero_grad()
      prediction = model(user.long().to(device), movie.long().to(device))
      loss = criterion(prediction, rating.float().to(device))
      loss.backward()
      optimizer.step()
      train_loss += loss.item()

  train_loss /= len(train_loader)

# Evaluation
  model.eval()
  val_loss = 0.0
  val_acc = 0.0
  num_users = 0

  with torch.no_grad():
      for user, movie, rating in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
        prediction = model(user.long().to(device), movie.long().to(device))
        loss = criterion(prediction, rating.float().to(device))
        val_loss += loss.item()

        actual_ratings = rating.cpu().numpy()
        predicted_ratings = prediction.cpu().numpy()

        val_acc += sum(1 if round(p) == a else 0 for (a, p) in zip(rating, prediction.tolist()))
        num_users += batch_size

  val_loss /= len(val_loader)
  val_acc /= num_users
  val_rmse = sqrt(val_loss)

  print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
  print(f"Validation RMSE: {val_rmse:.4f}, Validation Accuracy: {val_acc:.4f}")


Epoch 1/3 - Training: 100%|██████████| 12503/12503 [02:14<00:00, 92.83it/s] 
Epoch 1/3 - Validation: 100%|██████████| 3126/3126 [00:30<00:00, 103.95it/s]


Epoch 1/3, Train Loss: 1.0600, Validation Loss: 0.8922
Validation RMSE: 0.9446, Validation Accuracy: 0.3986


Epoch 2/3 - Training: 100%|██████████| 12503/12503 [02:12<00:00, 94.03it/s] 
Epoch 2/3 - Validation: 100%|██████████| 3126/3126 [00:31<00:00, 99.14it/s] 


Epoch 2/3, Train Loss: 0.8645, Validation Loss: 0.8679
Validation RMSE: 0.9316, Validation Accuracy: 0.4037


Epoch 3/3 - Training: 100%|██████████| 12503/12503 [02:13<00:00, 93.90it/s] 
Epoch 3/3 - Validation: 100%|██████████| 3126/3126 [00:30<00:00, 102.96it/s]

Epoch 3/3, Train Loss: 0.8574, Validation Loss: 0.8598
Validation RMSE: 0.9272, Validation Accuracy: 0.4138





In [76]:
# Example

sample_user_id = 1
sample_user_data = ratings[ratings['user_id'] == user_to_index[sample_user_id]]
sample_dataset = RatingsDataset(sample_user_data)
sample_dataloader = DataLoader(sample_dataset, batch_size=1, shuffle=False)

print(f'Sample predictions for user ID {sample_user_id}:')
with torch.no_grad():
    for user, movie, rating in sample_dataloader:
        output = model(user.to(device), movie.to(device)).squeeze()
        print(f'Movie ID: {movie.tolist()[0]}, Predicted Rating: {output.item()}, Actual Rating: {rating.item()}')

Sample predictions for user ID 1:
Movie ID: 0, Predicted Rating: 4.646676063537598, Actual Rating: 5
Movie ID: 1, Predicted Rating: 3.732640504837036, Actual Rating: 3
Movie ID: 2, Predicted Rating: 4.447811603546143, Actual Rating: 3
Movie ID: 3, Predicted Rating: 4.253232479095459, Actual Rating: 4
Movie ID: 4, Predicted Rating: 4.137252330780029, Actual Rating: 5
Movie ID: 5, Predicted Rating: 4.519640922546387, Actual Rating: 3
Movie ID: 6, Predicted Rating: 4.264586925506592, Actual Rating: 5
Movie ID: 7, Predicted Rating: 4.600958347320557, Actual Rating: 5
Movie ID: 8, Predicted Rating: 4.372879505157471, Actual Rating: 4
Movie ID: 9, Predicted Rating: 4.725480556488037, Actual Rating: 4
Movie ID: 10, Predicted Rating: 4.240516185760498, Actual Rating: 5
Movie ID: 11, Predicted Rating: 3.9511642456054688, Actual Rating: 4
Movie ID: 12, Predicted Rating: 4.1490583419799805, Actual Rating: 4
Movie ID: 13, Predicted Rating: 4.374333381652832, Actual Rating: 4
Movie ID: 14, Predicte

In [77]:
def predict_unrated_movies(user_id, model, ratings, movies):
    # merge movies data with user_data
    user_data = ratings[ratings['user_id'] == (user_id)]
    user_full = user_data.merge(movies, how = 'inner', on = 'movie_id').sort_values(['rating'], ascending=False)

    # get all not watched movies
    not_watched_movies = movies[~movies['movie_id'].isin(user_full['movie_id'])]

    not_watched_movies['user_id'] = [user_to_index[user_id] for _ in range(len(not_watched_movies['movie_id']))]
    not_watched_movies['rating'] = [0 for _ in range(len(not_watched_movies['movie_id']))]
    not_watched_movies['movie_id'] = not_watched_movies['movie_id'].map(movie_to_index)

    not_watched_movies = not_watched_movies.reset_index().dropna(how='any',axis=0)
    not_watched_movies['movie_id'] = not_watched_movies['movie_id'].astype(int)

    sample_dataset = RatingsDataset(not_watched_movies)
    sample_dataloader = DataLoader(sample_dataset, batch_size=1, shuffle=False)

    res = {}
    with torch.no_grad():
      for user, movie, rating in sample_dataloader:
          output = model(user.to(device), movie.to(device)).squeeze()
          res[movie.tolist()[0]] = output.item()

    not_watched_movies['predicted_rating'] = not_watched_movies['movie_id'].map(res)

    return not_watched_movies.reset_index()

# Example
user_id = 1
predictions = predict_unrated_movies(user_id, model, ratings, org_movies)
print(predictions.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_watched_movies['user_id'] = [user_to_index[user_id] for _ in range(len(not_watched_movies['movie_id']))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_watched_movies['rating'] = [0 for _ in range(len(not_watched_movies['movie_id']))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_watch

   level_0  index  movie_id                               title  \
0        0      0        40                    Toy Story (1995)   
1        1      1       573                      Jumanji (1995)   
2        2      2      1333             Grumpier Old Men (1995)   
3        3      3       450            Waiting to Exhale (1995)   
4        4      4      1334  Father of the Bride Part II (1995)   

                         genres  user_id  rating  predicted_rating  
0   Animation|Children's|Comedy        0       0          4.227804  
1  Adventure|Children's|Fantasy        0       0          3.433202  
2                Comedy|Romance        0       0          3.146544  
3                  Comedy|Drama        0       0          3.064422  
4                        Comedy        0       0          3.503864  


### GMM (Gaussian Mixture Model)

**Objective**: Use a Gaussian Mixture Model to analyze and cluster the click data based on the number of clicks from different locations, aiming to identify distinct patterns of user behavior across 10 countries.

**Dataset**:
- **Click Data**: DataFrame with columns `link_id`, `location`, and `number_of_clicks`.

**Steps**:

1. **Preprocess Data**:
   - Ensure the dataset contains 10 distinct countries.
   - Create a matrix where rows represent different links and columns represent the number of clicks from each country.
   - Normalize the number of clicks to account for different scales.

2. **Fit GMM**:
   - Apply a Gaussian Mixture Model (GMM) to the click data matrix to identify clusters of links with similar click patterns across different countries.

3. **Evaluate Model**:
   - Analyze the resulting model by calculating the log lokelihood, BIC and AIC metrics.


In [None]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt

file_path = '/content/drive/MyDrive/MIR-Phase3-2/gmm-dataset.csv'
df = pd.read_csv(file_path)

X = df.drop(columns=['URL'])

gmm = GaussianMixture(n_components=3, random_state=0)
gmm.fit(X)
df['Cluster'] = gmm.predict(X)
df.reset_index()
print(df.head())

log_likelihood = gmm.score(X) * X.shape[0]
bic = gmm.bic(X)
aic = gmm.aic(X)

print(f"Log-Likelihood: {log_likelihood}")
print(f"BIC: {bic}")
print(f"AIC: {aic}")