In this project we will applies the Non-negative matrix factorisation, to create a recomendation system working on a subset of Book-Crossing Dataset which http://www2.informatik.uni-freiburg.de/~cziegler/BX/


In [6]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')

In [7]:

Books = pd.read_csv('Books.csv',sep=';')
Users = pd.read_csv('Users.csv',sep=";")
Ratings = pd.read_csv('BX_Ratings.csv',sep=";",encoding="latin1")
Books=Books.drop(["Image-URL-S","Image-URL-M","Image-URL-L","Publisher","Year-Of-Publication"],axis=1)
Ratings=Ratings.drop(Ratings[Ratings["ISBN"].isin(["0373761619","0735201994",
                            "0330482750","0413326608","0440500702","0373166982",
                            "0894805959","8423920143","034050823X","039482492X",
                            "0553570722","096401811X","085409878X","1874100055",
                            "0006479839","0807735132","0394720784","0723245827",
                            "1581801653","006263545X"])].index)
Books_rate = Ratings.merge(Books,how="left",on="ISBN")
Books_rate = Books_rate.dropna()


In [8]:
U=Books_rate.groupby("User-ID")["Book-Rating"].count()
U=U.loc[U>300].index.values
B=Books_rate.groupby("ISBN")["Book-Rating"].count()
B=B.loc[B>100].index.values
F=Books_rate.loc[Books_rate["User-ID"].isin(U)&Books_rate["ISBN"].isin(B)]

In [326]:
Xnan = pd.pivot_table(F,index="User-ID",columns="ISBN",values="Book-Rating")
X=Xnan.fillna(0)
X.shape

(497, 717)

In [327]:
def norm_frob(A) :
    A=np.asmatrix(A)
    return float(np.sqrt((A.T@A).trace()))
    
def NMF_gen(X,r,alpha):
    X_fact=NMF(n_components=r,solver='mu',beta_loss='frobenius',alpha=alpha)
    W=X_fact.fit_transform(X)
    H=X_fact.components_
    X_hat=pd.DataFrame(W@H)
    X_hat[X_hat>10]=10
    X_hat[X_hat<1]=1
    X_hat.columns=X.columns
    X_hat.index=X.index
    return X_hat

def n_remove(X,n_obs):
    Y=X
    X_1=X
    X_1=(X_1!=0)
    N_zero=pd.DataFrame(np.asmatrix(np.where(X_1)).T,columns=["i","j"])
    obs=N_zero.sample(n_obs)
    for i in range(len(obs)):
        Y.iloc[(obs.iloc[i]["i"],obs.iloc[i]["j"])]=0
    return Y,obs

def rass(X,Y,obs):
    err=0
    for i in range(len(obs)):
        err=err+(X.iloc[(obs.iloc[i]["i"],obs.iloc[i]["j"])]-Y.iloc[(obs.iloc[i]["i"],obs.iloc[i]["j"])])**2
    return err

def range_err(X,vect_r,vect_alpha,n_obs):
    n_rem=n_remove(X,n_obs)
    X_1,obs=n_rem[0],n_rem[1]
    ERR=np.zeros((len(vect_r),len(vect_alpha)))
    for i in range(len(vect_r)):
        for j in range(len(vect_alpha)):
            X_hat=NMF_gen(X_1,vect_r[i],vect_alpha[j])
            ERR[i,j]=rass(X,X_hat,obs)
    return ERR

def RMSE(X,vect_alpha,vect_r,n):
    ERR=np.zeros((len(vect_r),len(vect_alpha),n))
    for i in range(n):
        ERR[:,:,i]=range_err(X,vect_r,vect_alpha,100)
    RMSE=np.zeros((len(vect_r),len(vect_alpha)))
    for i in range(len(vect_r)):
        for j in range(len(vect_alpha)):
            RMSE[i,j]=np.sqrt(np.mean(ERR[i,j,:]))
    RMSE=pd.DataFrame(RMSE,index=vect_r,columns=vect_alpha)
    arg=np.unravel_index(np.argmin(RMSE),RMSE.shape)
    print("the argmin of RMSE in this range are: r =",RMSE.index[arg[0]],"alpha =",RMSE.columns[arg[1]])
    return RMSE,RMSE.index[arg[0]],RMSE.columns[arg[1]]

In [328]:
vect_r=range(4,12,1)
vect_alpha=np.linspace(0.1,2,10)
R=RMSE(X,vect_alpha,vect_r,20)

the argmin of RMSE in this range are: r = 4 alpha = 2.0


In [332]:
fig = go.Figure(go.Surface(colorscale="reds",
    x = np.asarray(vect_alpha),
    y = np.asarray(vect_r),
    z = np.asarray(R[0])))
fig.update_traces(contours_z=dict(show=True, usecolormap=True, project_z=True))
fig.update_layout(scene = dict(
                    xaxis_title='alpha',
                    yaxis_title='r',
                    zaxis_title='RMSE'),
                    width =400,
                    height=400,
                    margin=dict(r=0, b=0, l=0, t=0))
fig.show()


In [349]:
X_hat=round(NMF_gen(X,R[1],R[2]))
X_hat


ISBN,002542730X,0060008032,0060096195,006016848X,0060173289,0060175400,006019491X,0060199652,0060391626,0060392452,...,1558744630,1558745157,1559029838,1573225517,1573225789,1573227331,1573229326,1573229571,1592400876,1878424319
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
189835,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
104636,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
248718,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
225810,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
146348,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
274308,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
187145,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
85526,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
254465,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
167349,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [357]:
X_hat_b=X_hat>5
pd.DataFrame(np.asmatrix(np.where(X_hat_b)).T,columns=["i","j"])

Unnamed: 0,i,j
0,12,0
1,12,1
2,12,3
3,12,5
4,12,6
...,...,...
439,465,531
440,465,532
441,465,534
442,465,568
