# DTSA 5510, Week 4 assignment, part 2

In [39]:
from collections import namedtuple

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from scipy.sparse import coo_matrix, csr_matrix
from scipy.spatial.distance import jaccard, cosine 

In [16]:
# i downloaded the data from the kaggle link on the week 3 assignment
MV_movies = pd.read_csv('data/movies.dat', 
                         delimiter='::', 
                         names=['MovieID', 'Title', 'Genres'], 
                         engine='python',
                         encoding='ISO-8859-1')

ratings = pd.read_csv('data/ratings.dat', 
                      delimiter='::', 
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'], 
                      engine='python', 
                      encoding='ISO-8859-1')


MV_users = pd.read_csv('data/users.dat', 
                       delimiter='::', 
                       names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], 
                       engine='python', 
                       encoding='ISO-8859-1')

In [18]:
MV_movies.to_csv('data/movies.csv', index = False)
MV_users.to_csv('data/users.csv', index = False)
ratings.to_csv('data/ratings.csv', index = False)

In [25]:
print(MV_movies.info(), '\n\n')
print(MV_users.info(), '\n\n')
print(ratings.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MovieID  3883 non-null   int64 
 1   Title    3883 non-null   object
 2   Genres   3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB
None 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID      6040 non-null   int64 
 1   Gender      6040 non-null   object
 2   Age         6040 non-null   int64 
 3   Occupation  6040 non-null   int64 
 4   Zip-code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB
None 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   UserID 

In [74]:
MV_users = MV_users.dropna()
MV_users.rename(columns = {'UserID': 'uID'}, inplace = True)

MV_movies = MV_movies.dropna()
MV_movies.rename(columns = {'MovieID': 'mID'}, inplace = True)

ratings = (ratings.rename(columns = {
    'UserID': 'uID',
    'MovieID': 'mID',
    'Rating': 'rating'
})
          .fillna(0))



In [75]:
Data = namedtuple('Data', ['users','movies'])
data = Data(MV_users, MV_movies)
data

Data(users=       uID Gender  Age  Occupation Zip-code
0        1      F    1          10    48067
1        2      M   56          16    70072
2        3      M   25          15    55117
3        4      M   45           7    02460
4        5      M   25          20    55455
...    ...    ...  ...         ...      ...
6035  6036      F   25          15    32603
6036  6037      F   45           1    76006
6037  6038      F   56           1    14706
6038  6039      F   45           0    01060
6039  6040      M   25           6    11106

[6040 rows x 5 columns], movies=       mID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (19

In [76]:
MV_users_train, MV_users_test = (train_test_split(MV_users, 
                                                  test_size = 0.3, 
                                                  random_state = 5510))

MV_movies_train, MV_movies_test = (train_test_split(MV_movies, 
                                                    test_size = 0.3, 
                                                    random_state = 5510))

train = (pd.concat([MV_users_train.reset_index(drop = True), 
                    MV_movies_train.reset_index(drop = True)], 
                   axis=1))

test = (pd.concat([MV_users_test.reset_index(drop = True), 
                   MV_movies_test.reset_index(drop = True)], 
                   axis=1))

In [77]:
Data = namedtuple('Data', ['users','movies','train','test'])
data = Data(MV_users, MV_movies, train, test)

In [82]:
# Create the rating matrix
from scipy.sparse import coo_matrix

# Create a mapping of user IDs and movie IDs to indices
mid2idx = dict(zip(MV_movies.mID, range(len(MV_movies))))
uid2idx = dict(zip(MV_users.uID, range(len(MV_users))))

# Create the rating matrix
ind_movie = [mid2idx[mID] for mID in ratings.mID]
ind_user = [uid2idx[uID] for uID in ratings.uID]
rating_values = ratings.rating.values

# Create a sparse matrix from the rating data
rating_matrix = coo_matrix((rating_values, (ind_user, ind_movie)), shape=(len(MV_users), len(MV_movies))).toarray()


from sklearn.decomposition import NMF

# Initialize and fit the NMF model
n_components = 18  # You can adjust this number based on your needs
model = NMF(n_components=n_components, random_state=42, init='nndsvda', solver='mu', beta_loss='kullback-leibler', max_iter=500)
W = model.fit_transform(rating_matrix)  # User features
H = model.components_  # Movie features

# Predict ratings
predicted_ratings = np.dot(W, H)

test_user_indices = [uid2idx[uid] for uid in data.test.uID]
test_movie_indices = [mid2idx[mid] for mid in data.test.mID]

# Get predicted ratings for the test set
predicted_test_ratings = predicted_ratings[test_user_indices, test_movie_indices]

# Get actual test ratings
actual_test_ratings = data.test.rating.values

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(actual_test_ratings, predicted_test_ratings))

# Output the RMSE
print(f'RMSE for the NMF predictions: {rmse}')


KeyError: nan

In [80]:
# copied from week 3 homework
class RecSys():
    def __init__(self, data):
        self.data = data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        self.mid2idx = dict(zip(self.data.movies.mID, range(len(self.data.movies))))
        self.uid2idx = dict(zip(self.data.users.uID, range(len(self.data.users))))
        self.Mr = self.rating_matrix()

    def rating_matrix(self):
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID]
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        rating_train = list(self.data.train.rating)
        return np.array(coo_matrix((rating_train, (ind_user, ind_movie)), 
                                    shape=(len(self.allusers), len(self.allmovies))).toarray())

    def predict_from_nmf(self, X):
        """
        Predict ratings using the reconstructed user data from NMF.
        """
        yhat = []
        for i in range(len(self.data.test)):
            x = self.data.test.iloc[i]
            mid = x.mID
            uid = x.uID
            yhat.append(X[self.uid2idx[uid], self.mid2idx[mid]])
        return np.array(yhat)

    def rmse(self, yhat):
        yhat[np.isnan(yhat)] = 3  # Filling NaNs
        labs = np.array(self.data.test.rating)
        return np.sqrt(((labs - yhat) ** 2).mean())

# Perform NMF on ratings matrix
rs = RecSys(data)
rating_matrix = rs.Mr
model = (NMF(n_components=18, random_state=42, init="nndsvda", 
              solver="mu", beta_loss="kullback-leibler", max_iter=1000)
         .fit(ratings_matrix))
W = model.transform(rating_matrix)
H = model.components_

# Reconstruct user data as predictions from NMF
X = model.inverse_transform(W)

# Get predictions for the test data
yhat = rs.predict_from_nmf(X)

# Calculate RMSE
RMSE = rs.rmse(yhat)
print("The RMSE of the predictions made using NMF was:", RMSE)


KeyError: nan

In [57]:
# Initialize the recommender system
rs = RecSys(data)

# Extract the rating matrix
ratings_matrix = rs.Mr

# Fit the NMF model
model = (NMF(n_components=18, 
              random_state=5510, 
              init='nndsvda', 
              solver='mu', 
              beta_loss='kullback-leibler', 
              max_iter=500)
         .fit(ratings_matrix))

# Transform the ratings matrix to get W and H
W = model.transform(ratings_matrix)
H = model.components_

# Predict ratings for all users and movies
predicted_ratings = np.dot(W, H)

# Create a DataFrame for predicted ratings
predicted_df = pd.DataFrame(predicted_ratings, columns=data.movies.mID, index=data.users.uID)

# Melt the DataFrame to get it in long format
predicted_long = predicted_df.reset_index().melt(id_vars='uID', value_name='predicted_rating', var_name='mID')

# Merge with the original training data to fill in the missing ratings
merged_df = data.train.merge(predicted_long, on=['uID', 'mID'], how='left', suffixes=('', '_pred'))

# Fill missing ratings with predicted ratings
merged_df['rating_filled'] = merged_df['rating'].fillna(merged_df['predicted_rating'])

# Now, to evaluate the predictions on the test set
# Get indices for the test set
test_user_indices = [rs.uid2idx[uid] for uid in data.test.uID]
test_movie_indices = [rs.mid2idx[mid] for mid in data.test.mID]

# Extract predicted ratings for the test set
predicted_test_ratings = predicted_ratings[test_user_indices, test_movie_indices]

# Actual test set ratings
actual_test_ratings = data.test.rating.values

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(actual_test_ratings, predicted_test_ratings))
print(f'RMSE for the NMF predictions: {rmse}')

KeyError: nan