In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
import os
import warnings
warnings.filterwarnings("ignore")

from sklearn.impute import KNNImputer
from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the raw ratings and movie tables from CSV.
# The paths are relative, so they are resolved against the kernel's current
# working directory.
df_rat = pd.read_csv('../data/ratings.csv')
df_mov = pd.read_csv('../data/movies.csv')

In [3]:
# Keep a single row per movie title (the first occurrence wins).
# NOTE(review): the discarded rows may carry their own movieId values; if so,
# the inner merge on movieId later in the notebook drops the ratings attached
# to them -- confirm that this is intended.
df_mov = df_mov.drop_duplicates(subset=['title'], keep='first')

In [4]:
# Attach the movie columns to every rating row. The inner join keeps only
# ratings whose movieId is present in the movie table.
df = df_rat.merge(df_mov, how='inner', on='movieId')

In [5]:
# Build the user x movie rating matrix used for Non-Negative Matrix
# Factorization: rows are userId, columns are movieId, values are ratings.
# Entries left empty by the pivot are replaced with that movie's (column) mean
# rating so the factorization receives a matrix without gaps.
R = (
    df.pivot_table(index='userId', columns='movieId', values='rating')
    .pipe(lambda ratings: ratings.fillna(ratings.mean()))
)

In [6]:
%%time
# Factorize R with 1000 latent components. init="random" selects a random
# starting point and random_state=10 pins it so the fit is repeatable.
# NOTE(review): the recorded run of this cell took over an hour; consider
# loading a cached model instead of refitting on every run.
model = NMF(n_components=1000, init="random", random_state=10)
model.fit(R)

Wall time: 1h 6min 9s


NMF(init='random', n_components=1000, random_state=10)

In [7]:
# Inspect the learned component matrix (NumPy abbreviates the printed array).
model.components_

array([[6.84785535e-02, 1.05975102e-02, 2.09585342e-02, ...,
        8.19053403e-02, 8.57659072e-02, 9.61214695e-02],
       [1.14275413e-02, 1.11969493e-01, 8.43206062e-02, ...,
        2.79486271e-02, 3.70072970e-04, 4.62506381e-02],
       [1.34921437e-02, 6.00985997e-02, 0.00000000e+00, ...,
        7.59772696e-03, 4.97053459e-03, 4.44125266e-02],
       ...,
       [0.00000000e+00, 0.00000000e+00, 8.66672988e-01, ...,
        1.98338363e-01, 2.65001619e-01, 4.65928346e-04],
       [3.92990526e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.94885037e-01, 3.16534843e-01, 2.06310434e-02],
       [5.36011845e-01, 0.00000000e+00, 1.20623740e+00, ...,
        2.13387602e-01, 1.21276037e-01, 0.00000000e+00]])

In [8]:
# P: per-user weights, obtained by projecting R onto the fitted components.
P = model.transform(R)
# Q: the fitted component matrix (one row per latent component).
Q = model.components_

In [9]:
# Multiply the two factors back together; Rhat approximates R.
Rhat = P @ Q

In [10]:
# Reconstruction error recorded on the fitted model (lower means R is
# approximated more closely).
model.reconstruction_err_

11.988878995363423

In [12]:
import pickle

# Persist the fitted NMF model so later sessions can reuse it without refitting.
# The context manager guarantees the file is flushed and closed even if the
# write fails; the bare open(...).write(...) form left the handle open until
# garbage collection.
# SECURITY: pickle files can execute arbitrary code when loaded -- only
# unpickle 'nmf_model.bin' if it comes from a trusted source.
with open('nmf_model.bin', 'wb') as model_file:
    bytes_written = model_file.write(pickle.dumps(model))

# Display the size of the serialized model in bytes.
bytes_written

77752557