In [None]:

"""
Colby Wise
Matrix Factorization

Factorizes an N x M user:item matrix into
U: N x r and V: r x M matrices where:

- U - User : feature matrix
- V - Movie: feature matrix

Calculates mean squared error (MSE) and
mean reciprocal rank (MRR) on test data
after the training data decomposition
"""

In [198]:
import pandas as pd
import numpy as np
import time
import pickle
import math

from preprocessing import *
from model_evaluation import *

In [199]:
"""
Class MatrixFactorize takes train and test data, then
factors the training data into a U,V matrix decomposition.
Using U,V it then predicts movie ratings on the test
data, returning the MSE and MRR on the test data.
"""
class MatrixFactorize(object):
    """Factor a user:item rating matrix into U (users x r) and V (r x items).

    Training is stochastic gradient descent over the rows of ``train``;
    ``test`` is sampled periodically to track the test MSE, and is scored
    once more with ``calc_MRR`` after the last epoch.

    Both frames are read positionally with ``itertuples()``: field 2 is used
    as the user index, field 3 as the movie index and field 4 as the rating.
    The user/movie values index directly into ``U``/``V``, so they must be
    integer positions in ``[0, n_users)`` / ``[0, n_items)``.
    """

    def __init__(self, train, test, lr, r, iters, lambd, seed=None):
        """Store hyperparameters and randomly initialise U and V.

        @param:
            train - pandas dataframe of training data
            test - pandas dataframe of test data
            lr  - learning rate
            r - number of latent features to learn
            iters - number of epochs to run training
            lambd - regularization rate (lambda)
            seed - optional int; when given, U and V are drawn from a
                   dedicated ``np.random.RandomState(seed)`` so runs are
                   reproducible. ``None`` (default) keeps using the global
                   numpy random state.
        """
        self.lr = lr
        self.features = r
        self.iters = iters
        self.lambd = lambd
        self.train = train
        self.test = test

        # Matrix sizes come from the distinct ids seen in train and test together.
        all_data = pd.concat([train, test])
        self.n_items = len(all_data['movieId'].unique())
        self.n_users = len(all_data['userId'].unique())
        print("Number of users:", self.n_users)
        print("Number of movies:", self.n_items)

        rng = np.random if seed is None else np.random.RandomState(seed)
        self.U = rng.randn(self.n_users, self.features)
        self.V = rng.randn(self.features, self.n_items)

        # "lr" records the initial learning rate; self.lr itself is annealed
        # during training.
        self.loss_record = {"train": [], "test": [], "epoch": [],
                            "r": self.features, "lr": self.lr}

    def record_loss(self, test_mse, train_mse, epoch):
        """Append one train/test loss observation to ``self.loss_record``.

        @param:
            test_mse - MSE from test data
            train_mse - loss value from train data
            epoch - current epoch of training
        @Return:
            None
        """
        self.loss_record["train"].append(train_mse)
        self.loss_record["test"].append(test_mse)
        self.loss_record["epoch"].append(epoch)

    def predict(self, test_sample, U, V, show=False):
        """Score ``test_sample`` with the given U, V and return its MSE.

        @param:
            test_sample - dataframe of rows to score (same layout as train/test)
            U - User matrix
            V - Movie matrix
            show - default(False); when True prints every 100,000th prediction
        @Return:
            mse - mean squared error over the sample, or ``nan`` when the
                  sample has no rows (previously a ZeroDivisionError)
        """
        print("Running test validation...")
        n_rows = 0
        loss = 0
        for row in test_sample.itertuples():
            user, movie, rating = row[2], row[3], row[4]
            pred = np.dot(U[user, :], V[:, movie])
            n_rows += 1

            if show and not (n_rows % 10**5):
                print("u: {} \t m: {} \t r: {} \t r_hat: {}".format(user, movie, rating, pred))

            loss += (rating - pred) ** 2

        if n_rows == 0:
            return float("nan")
        return loss / n_rows

    def god_save_the_queen(self, out, fname):
        """Pickle ``out`` to the file ``fname``.

        @param:
            out - data structure (dict, etc) to save
            fname - filename
        @Return:
            None
        """
        with open(fname, "wb") as f:
            pickle.dump(out, f)
        return None

    def factorizeMatrix(self):
        """Learn U and V from the training data with SGD.

        Every 100,000 training rows a 5% sample of the test data is scored
        with ``calc_MSE``, the losses are recorded, and U, V and the loss
        record are pickled whenever the sampled test MSE is the best seen so
        far in the current epoch. After the last epoch the MRR over the full
        test set is computed with ``calc_MRR`` and written to ``MRR.txt``.

        @Return:
            (U, V, loss_record)
        """
        print("Factorizing...")
        print("=> learning rate: {}, epochs: {}, r: {}".format(self.lr, self.iters, self.features))

        for epoch in range(1, self.iters + 1):
            print("\nStarting iteration {}...".format(epoch))
            # Timer restarts every epoch so the runtime printed below is per
            # epoch rather than cumulative since training began.
            epoch_start = time.time()
            self.lr = self.lr * .995  # Hacky annealing

            # Reset per epoch: each epoch checkpoints its own best matrices
            # (the epoch number is part of the file names). A sampled test MSE
            # above this initial threshold is never checkpointed.
            best_test_MSE = 50

            # n_rows is the number of rows actually processed so far.
            for n_rows, row in enumerate(self.train.itertuples(), start=1):
                user, movie, rating = row[2], row[3], row[4]
                err = rating - np.dot(self.U[user, :], self.V[:, movie])
                # Squared error of this single row only (not averaged).
                train_MSE = err ** 2
                # Both steps are computed from the pre-update U and V.
                dV = self.lr * (err * 2 * self.U[user, :] - self.lambd * self.V[:, movie])
                dU = self.lr * (err * 2 * self.V[:, movie] - self.lambd * self.U[user, :])
                self.V[:, movie] = self.V[:, movie] + dV
                self.U[user, :] = self.U[user, :] + dU

                if n_rows % 10**5:
                    continue

                # Periodically Print Progress
                elapsed_min = int((time.time() - epoch_start) // 60)
                print("\n {} min runtime to process {:,} rows...\n".format(elapsed_min, n_rows))
                test_sample = self.test.sample(frac=0.05)
                test_MSE = calc_MSE(test_sample, self.U, self.V, show=False)
                self.record_loss(test_MSE, train_MSE, epoch)
                print("Train MSE: {0:0.4f}, Test MSE: {1:0.4f}".format(train_MSE, test_MSE))

                # Periodically Check Test MSE
                if test_MSE <= best_test_MSE:
                    best_test_MSE = test_MSE
                    u_outfile = "U_mat:_r={}_lambda={}_epoch={}.pkl".format(self.features, self.lambd, epoch)
                    v_outfile = "V_mat:_r={}_lambda={}_epoch={}.pkl".format(self.features, self.lambd, epoch)
                    loss_file = "loss:{:.3f}_r={}_lambda={}_epoch={}.pkl".format(test_MSE, self.features, self.lambd, epoch)
                    self.god_save_the_queen(self.U, u_outfile)
                    self.god_save_the_queen(self.V, v_outfile)
                    self.god_save_the_queen(self.loss_record, loss_file)

            # Track epoch runtime
            epoch_end = time.time()
            print("\n Epoch {} runtime: {} min".format(epoch, int((epoch_end - epoch_start) // 60)))

        MRR = calc_MRR(self.test, self.U, self.V)
        with open('MRR.txt', 'w') as f:
            f.write("log:_MRR={:.3f}:_r={}_lambda={}".format(MRR, self.features, self.lambd))

        return self.U, self.V, self.loss_record

            

In [202]:
def update_movieId(movie):
    """Return the entry stored for ``movie`` in the global ``item_toKey`` mapping.

    Raises KeyError when ``movie`` is not present in the mapping.
    """
    movie_key = item_toKey[movie]
    return movie_key

def update_userId(user):
    """Return the entry stored for ``user`` in the global ``user_toKey`` mapping.

    Raises KeyError when ``user`` is not present in the mapping.
    """
    user_key = user_toKey[user]
    return user_key

In [None]:
# Locations of the train / test splits.
train_file = 'ml-20m/train.csv'
test_file = 'ml-20m/test.csv'

train, test = (get_data(path) for path in (train_file, test_file))

# Build the id <-> key lookups from both splits combined, so every id that
# appears in either split has an entry.
all_data = pd.concat([train, test])
key_toUser, user_toKey = get_user_dicts(all_data)
key_toItem, item_toKey = get_item_dicts(all_data)

# Replace the raw ids with their keys, in place, train first and then test.
for frame in (train, test):
    frame['userId'] = frame['userId'].apply(update_userId)
    frame['movieId'] = frame['movieId'].apply(update_movieId)

Number of users:  138493
Number of items:  26744


In [None]:
# Hyperparameters for this run: learning rate, latent features, epochs, lambda.
lr, r, epoch, lamda = .01, 40, 2, .2

# Build the model and train it; factorizeMatrix returns the learned matrices
# together with the recorded losses.
MF = MatrixFactorize(train, test, lr=lr, r=r, iters=epoch, lambd=lamda)
U, V, loss_record = MF.factorizeMatrix()


Number of users: 138493
Number of movies: 26744
Factorizing...
=> learning rate: 0.01, epochs: 1, r: 40

Starting iteration 1...

 0 min runtime to process 100,000 rows...

Running test validation...
Train MSE: 297.3694, Test MSE: 38.7706

 0 min runtime to process 200,000 rows...

Running test validation...
Train MSE: 0.0607, Test MSE: 33.1755

 0 min runtime to process 300,000 rows...

Running test validation...
Train MSE: 0.0216, Test MSE: 30.1558

 0 min runtime to process 400,000 rows...

Running test validation...
Train MSE: 6.5000, Test MSE: 28.1908

 0 min runtime to process 500,000 rows...

Running test validation...
Train MSE: 3.3417, Test MSE: 26.7850

 0 min runtime to process 600,000 rows...

Running test validation...
Train MSE: 0.5040, Test MSE: 25.8193

 0 min runtime to process 700,000 rows...

Running test validation...
Train MSE: 0.0391, Test MSE: 24.9449

 0 min runtime to process 800,000 rows...

Running test validation...
Train MSE: 0.1751, Test MSE: 24.2598

 0 m

Train MSE: 0.2315, Test MSE: 7.6243

 6 min runtime to process 7,500,000 rows...

Running test validation...
Train MSE: 4.9540, Test MSE: 7.4033

 6 min runtime to process 7,600,000 rows...

Running test validation...
Train MSE: 0.3626, Test MSE: 7.2722

 7 min runtime to process 7,700,000 rows...

Running test validation...
Train MSE: 4.2855, Test MSE: 7.0410

 7 min runtime to process 7,800,000 rows...

Running test validation...
Train MSE: 9.2242, Test MSE: 6.7756

 7 min runtime to process 7,900,000 rows...

Running test validation...
Train MSE: 0.2739, Test MSE: 6.5953

 7 min runtime to process 8,000,000 rows...

Running test validation...
Train MSE: 1.2328, Test MSE: 6.4042

 7 min runtime to process 8,100,000 rows...

Running test validation...
Train MSE: 0.9429, Test MSE: 6.2110

 7 min runtime to process 8,200,000 rows...

Running test validation...
Train MSE: 0.1640, Test MSE: 5.9877

 7 min runtime to process 8,300,000 rows...

Running test validation...
Train MSE: 3.0421, 

In [None]:
def calc_MRR(df, U, V):
    """Mean reciprocal rank of the model's predictions over the users in ``df``.

    For every row a rating is predicted with ``predict_rating(..., rnd=True)``.
    Rows predicted below 3.0 are dropped. Each remaining user's rows are
    ranked by predicted rating (highest first); the user's reciprocal rank is
    1/position of the first row whose prediction equals the true rating, or 0
    when no row matches. The result is the mean of those values over the
    users that still have rows after filtering.

    @param:
        df - dataframe with 'userId' and 'rating' columns; read positionally
             with itertuples(), field 2 is the user index and field 3 the
             movie index. It is not modified.
        U - User matrix
        V - Movie matrix
    @Return:
        MRR - float in [0, 1]; 0.0 when ``df`` is empty or no row survives
              the 3.0 filter
    """
    start = time.time()
    preds = []
    for row in df.itertuples():
        user, movie = row[2], row[3]
        preds.append(predict_rating(U[user, :], V[:, movie], rnd=True))

    n_total = len(preds)
    if n_total == 0:
        print("Model MRR: ", 0.0)
        return 0.0

    # Score a copy so the caller's frame is left alone, and assign the list
    # directly: a list is matched to rows by position, whereas a fresh Series
    # would be aligned on index labels.
    scored = df.copy()
    scored['pred'] = preds
    scored = scored.loc[scored['pred'] >= 3.0]
    pct_removed = 100.0 * (n_total - len(scored)) / n_total
    print("Percent of rows removed given predictions < 3.0: {0:0.2f}%".format(pct_removed))

    # sort_values returns a new frame; within each user, best prediction first.
    scored = scored.sort_values(by=['userId', 'pred'], ascending=[True, False])

    reciprocal_ranks = []
    for _, user_data in scored.groupby('userId', sort=False):
        is_hit = (user_data['rating'] == user_data['pred']).tolist()
        if True in is_hit:
            # Position in the ranked list, 1-based.
            reciprocal_ranks.append(1.0 / (is_hit.index(True) + 1))
        else:
            reciprocal_ranks.append(0.0)

    MRR = sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0.0
    print("MRR Calculation Runtime: {} min".format(int((time.time() - start) // 60)))
    print("Model MRR: ", MRR)
    return MRR

In [None]:
# Score the full test split with the trained U, V and print the result.
# NOTE(review): calc_MSE is not defined in this notebook; presumably it comes
# from the `model_evaluation` star import -- confirm.
print( calc_MSE(MF.test,U,V) )

In [None]:
def properties(cls):
    """List the names in ``cls.__dict__`` that do not start with an underscore."""
    public_names = []
    for name in cls.__dict__.keys():
        if name[:1] == '_':
            continue
        public_names.append(name)
    return public_names

# `props` is not defined anywhere in this notebook; the helper is `properties`.
# The result is stored under its own name so the `properties` function is not
# overwritten and the cell can be re-run.
mf_properties = properties(MF)
print(mf_properties)

In [183]:
# Reload a saved pair of factor matrices. The files are opened in `with`
# blocks so the handles are closed as soon as each matrix has been read.
# NOTE: pickle.load can execute arbitrary code -- only load files that this
# notebook wrote itself.
with open("V_mat:_r=40_lambda=0.01_epoch=10.pkl", "rb") as f:
    V = pickle.load(f)
with open("U_mat:_r=40_lambda=0.01_epoch=10.pkl", "rb") as f:
    U = pickle.load(f)
print("U Shape:", U.shape)
print("V Shape:", V.shape)

(138493, 40)
(40, 26744)


In [146]:
# Pickled loss records from earlier training runs, one per model configuration.
# NOTE(review): the names look like loss:<test MSE>_r=<features>_lambda=<lambda>_epoch=<epoch>;
# the MSE is at full precision here, unlike the 3-decimal names the class writes -- confirm
# these files came from an older version of the training code.
loss_file1 = "loss:1.1454883390569282_r=40_lambda=0.2_epoch=10.pkl"
loss_file2 = "loss:1.1104220640763836_r=40_lambda=0.01_epoch=10.pkl"
loss_file3 = "loss:0.8906265173171293_r=100_lambda=0.5_epoch=7.pkl" #epoch 7 ugh
loss_file4 = "loss:0.9162400916571781_r=10_lambda=0.001_epoch=10.pkl"
loss_file5 = "loss:1.2031739446700522_r=50_lambda=0.01_epoch=10.pkl"

In [167]:
def print_loss_data(fname, comma=True):
    """Print the paired train/test losses stored in a pickled loss record.

    @param:
        fname - path of a pickle holding a dict with 'train' and 'test' lists
        comma - default(True); True prints "train , test" pairs, False prints
                labelled "train: ... | test: ..." lines
    @Return:
        None
    """
    # The with-block closes the file once the record is loaded.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(fname, "rb") as f:
        losses = pickle.load(f)

    row_format = "{0:0.4f} , {1:0.4f}" if comma else "train: {0:0.4f} | test: {1:0.4f}"
    print("Train MSE    |   Test MSE")
    for tr, te in zip(losses['train'], losses['test']):
        print(row_format.format(tr, te))
        
def save_losses_to_csv(fname1, fname2):
    """Export the train/test losses of a pickled loss record to a CSV file.

    @param:
        fname1 - path of a pickle holding a dict with 'train' and 'test' lists
        fname2 - path of the CSV file to write; it gets the columns
                 'train_MSE' and 'test_MSE'
    @Return:
        None
    """
    # The with-block closes the file once the record is loaded.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(fname1, "rb") as f:
        losses = pickle.load(f)

    s_tr = pd.Series(losses['train'], name='train_MSE')
    s_te = pd.Series(losses['test'], name='test_MSE')
    L = pd.concat([s_tr, s_te], axis=1)
    L.to_csv(fname2)
    print("file saved as:", str(fname2))

In [None]:
# Load one saved loss record; the with-block closes the file after reading.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("loss:1.1454883390569282_r=40_lambda=0.2_epoch=10.pkl", "rb") as f:
    losses = pickle.load(f)

In [166]:
# Print the recorded train/test loss pairs of the loss_file5 run as "train , test" lines.
print_loss_data(loss_file5,comma=True)

106.8566 , 45.4117
18.1185 , 38.2210
7.8139 , 34.4534
4.8712 , 31.9924
3.9118 , 30.3133
26.8501 , 28.8412
21.8917 , 27.7958
2.1857 , 26.9774
0.5036 , 26.1042
10.5526 , 25.5807
2.2719 , 24.8030
2.6374 , 24.3189
2.1423 , 23.6883
11.9064 , 23.3008
5.9790 , 22.8418
8.7703 , 22.5621
0.0007 , 22.1266
13.6362 , 21.8406
1.2341 , 21.4222
6.4090 , 21.2024
10.7787 , 20.8501
0.4129 , 20.5569
1.7994 , 20.3474
0.3126 , 19.9337
0.3098 , 19.5916
3.3441 , 19.2762
1.3273 , 19.1348
6.7526 , 18.8508
11.9996 , 18.6238
3.5682 , 18.3730
3.4326 , 18.0634
61.4089 , 17.6856
4.0440 , 17.4580
4.9226 , 17.3143
0.2257 , 17.0361
0.0360 , 16.8749
0.0112 , 16.5683
5.7435 , 16.2779
1.0782 , 16.1272
3.0475 , 15.8084
0.4188 , 15.5735
0.0283 , 15.3730
0.6643 , 15.0023
9.0499 , 14.9351
0.4491 , 14.6156
1.4674 , 14.3587
24.0611 , 14.0975
3.3744 , 13.9227
3.0680 , 13.6174
202.3333 , 13.4917
0.9525 , 13.2682
0.3375 , 12.9185
0.3490 , 12.8385
0.4524 , 12.5776
12.2314 , 12.3229
0.0232 , 12.0692
12.9851 , 11.8151
7.0735 , 11.650

In [152]:
# Export the loss record of the loss_file5 run to a CSV file.
csv_file = 'Losses_Model5_r=50_lambda=0.01_epoch10.csv'
save_losses_to_csv(loss_file5, csv_file)

file saved as:  Losses_Model5_r=50_lambda=0.01_epoch10.csv
