In [1]:
import os
import numpy as np
import numpy.random as rd

# read train set and qulifying set 
Xt = np.genfromtxt("train.csv", delimiter=",", dtype=np.int)
Xq = np.genfromtxt("qualifying.csv", delimiter=",", dtype=np.int)
print(Xt)

[[   2  566    1]
 [   2  750    2]
 [   2  391    1]
 ...
 [5498  565    2]
 [5498  648    2]
 [5498  651    2]]


In [2]:
# data description
import matplotlib.pyplot as plt
List = [Xt[:,2][Xt[:,2]==i].shape[0] for i in range (np.max(Xt[:,2]+1))]
print('There are so much number from 0 to 4 respectively:')
print(List)
plt.bar(range(len(List)),List)
plt.xlabel('Rating'); plt.ylabel('number of data')
plt.show()

There are so much number from 0 to 4 respectively:
[335499, 50169, 29623, 15138, 4212]


<matplotlib.figure.Figure at 0x10866a4a8>

In [3]:
# data preparation
from sklearn import model_selection as ms
from scipy.sparse import coo_matrix
import pandas as pd   

Xt= Xt + np.array([0,0,1])
train_data,test_data=ms.train_test_split(Xt,test_size=0.1,random_state = 1)

In [4]:
# define unconstrained matrix factorization class
import random
class matix_factorization:
    def __init__(self,mat,K=20):
        self.mat=np.array(mat)
        self.K=K
        self.bi={}
        self.bu={}
        self.qi={}
        self.pu={}
        self.avg=np.mean(self.mat[:,2])
        for i in range(self.mat.shape[0]):
            uid=self.mat[i,0]
            iid=self.mat[i,1]
            self.bi.setdefault(iid,0)
            self.bu.setdefault(uid,0)
            self.qi.setdefault(iid,np.random.random((self.K,1))/10*np.sqrt(self.K))
            self.pu.setdefault(uid,np.random.random((self.K,1))/10*np.sqrt(self.K))
            
    # define functin about the prediction of rating
    def predict(self,uid,iid): 
        # if the uid or iid didn't appear before, then set the mapped value of relevant uid and iid
        # in bi,bu,qi,pu as 0.
        self.bi.setdefault(iid,0)
        self.bu.setdefault(uid,0)
        self.qi.setdefault(iid,np.zeros((self.K,1)))
        self.pu.setdefault(uid,np.zeros((self.K,1)))
        # the formula of prediction
        rating=self.avg+self.bi[iid]+self.bu[uid]+np.sum(self.qi[iid]*self.pu[uid]) 
        # after plus 1 for every known rating, the rating range will be between 1 to 5 (including 1 and 5)
        # So if the result over than 5, return 5; or lower than 1, return 1.
        if rating>5:
            rating=5
        if rating<1:
            rating=1
        return rating
    
    def train(self,steps=35,gamma=0.04,Lambda=0.05):    # train the model, step is the number of loops
        preRmse = 1000000000.0
        print('train data size',self.mat.shape)
        for step in range(steps):
            print('step',step+1,'is running')           
            KK=np.random.permutation(self.mat.shape[0]) #kk is random shuffling of the matrix
            rmse=0.0
            for i in range(self.mat.shape[0]):          #Stochastic Gradient Descent Aglo
                j=KK[i]
                uid=self.mat[j,0]
                iid=self.mat[j,1]
                rating=self.mat[j,2]
                eui=rating-self.predict(uid, iid)
                rmse+=eui**2
                self.bu[uid]+=gamma*(eui-Lambda*self.bu[uid])  
                self.bi[iid]+=gamma*(eui-Lambda*self.bi[iid])
                tmp=self.qi[iid]
                self.qi[iid]+=gamma*(eui*self.pu[uid]-Lambda*self.qi[iid])
                self.pu[uid]+=gamma*(eui*tmp-Lambda*self.pu[uid])
            nowRmse=np.sqrt(rmse*1.0/self.mat.shape[0])
            print('rmse is', nowRmse)
            if(round(nowRmse,3)<preRmse):              
                preRmse=round(nowRmse,3)
            else:                                      # when Rmse is no longer reduced, 
                break                                  # the minimal RMSE is founded and stop the loop.
            gamma *= 0.93                              # the learning rate of gamma is 0.93

    # compute rating for qualifying.csv (the rating range is between 0 to 4.)
    def r_hat(self,test_data):                         
        pred=[0 for i in range(test_data.shape[0])]
        for i in range (test_data.shape[0]):
            uid=test_data[i,0]
            iid=test_data[i,1]
            pred[i]=a.predict(uid,iid)-1
        return pred
    
    # compute rating for test_data (rating range: 1 to 5) and corresponding RMSE
    def test(self,test_data):          
        test_data=np.array(test_data)
        print('test data size',test_data.shape)
        rmse=0.0
        for i in range(test_data.shape[0]):
            uid=test_data[i,0]
            iid=test_data[i,1]
            rating=test_data[i,2]
            eui=rating-self.predict(uid, iid)
            rmse+=eui**2
        print('rmse of test data is',np.sqrt(rmse/test_data.shape[0]))


In [5]:
# for train data and test data 
a=matix_factorization(train_data,50)  
a.train()
print('')
a.test(test_data)

train data size (391176, 3)
step 1 is running
rmse is 0.7477200152187443
step 2 is running
rmse is 0.5568192323509193
step 3 is running
rmse is 0.5197352942850387
step 4 is running
rmse is 0.5010838777801487
step 5 is running
rmse is 0.489802586178501
step 6 is running
rmse is 0.4824768484599918
step 7 is running
rmse is 0.47693803153933506
step 8 is running
rmse is 0.4730336355232131
step 9 is running
rmse is 0.4694341326792978
step 10 is running
rmse is 0.46729818775024395
step 11 is running
rmse is 0.46507294057530013
step 12 is running
rmse is 0.46312362689229053
step 13 is running
rmse is 0.46155652285202486
step 14 is running
rmse is 0.4602051645413848
step 15 is running
rmse is 0.45910736955135595
step 16 is running
rmse is 0.45812971290424176
step 17 is running
rmse is 0.4570567189140238
step 18 is running
rmse is 0.45605677319635596
step 19 is running
rmse is 0.45538905677141384
step 20 is running
rmse is 0.4545511814693519

test data size (43465, 3)
rmse of test data is 0.507

In [6]:
#train the whole train.csv
t=matix_factorization(Xt,50)  
t.train()

#for qualifying
print('')
pred=t.r_hat(Xq)
pred=np.array(pred)
pred=pred[:,np.newaxis]
print(pred)
print(pred.shape)

res=np.append(Xq,pred,axis=1)
print(res[6])

# save in right format
np.savetxt("model_based.csv", res, delimiter=",", newline="\n", encoding="utf-8") 

train data size (434641, 3)
step 1 is running
rmse is 0.7333669485948618
step 2 is running
rmse is 0.5520374591488721
step 3 is running
rmse is 0.5174092459822195
step 4 is running
rmse is 0.5004862469374596
step 5 is running
rmse is 0.49008735823538646
step 6 is running
rmse is 0.4836747825745926
step 7 is running
rmse is 0.4794222770760471
step 8 is running
rmse is 0.4752971645787696
step 9 is running
rmse is 0.4726829664210143
step 10 is running
rmse is 0.47045329162357646
step 11 is running
rmse is 0.46845727813025184
step 12 is running
rmse is 0.46692472153300435
step 13 is running
rmse is 0.46525925815724384
step 14 is running
rmse is 0.46448483907241295
step 15 is running
rmse is 0.46342109593653447
step 16 is running
rmse is 0.4621408174639017
step 17 is running
rmse is 0.46137857197907695
step 18 is running
rmse is 0.46063430613636575

[[0.        ]
 [0.17166415]
 [0.        ]
 ...
 [1.72524818]
 [0.28680735]
 [0.12489671]]
(108660, 1)
[1898.  742.    0.]
