In [3]:
import pandas as pd
import numpy as np
from random import sample
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
import pickle

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
df= pd.read_csv('/content/drive/MyDrive/Colab Notebooks/trainwithzerostopredict.csv')
topredict= pd.read_csv('/content/drive/MyDrive/Colab Notebooks/topredict.csv')
df_withoutzero= pd.read_csv('/content/drive/MyDrive/Colab Notebooks/trainwithoutzeros.csv')

***df has all the topredict values entered as 0 for the Book-Rating column.***</br>
***topredict has all the values that we want to predict.***</br>
***df_withoutzero has all the topredict all the values including the values in the test dataset.***



In [6]:
df['User-ID'].nunique()

77805

In [7]:
topredict['User-ID'].nunique()

19935

##**1000 users that rate the most**

In [8]:
df_num_rating= df.groupby('User-ID').agg(Number_of_ratings=('Book-Rating','count')).reset_index()
# Num ratings count

In [9]:
df_num_rating.sort_values(by=['Number_of_ratings'], ascending=False, inplace=True)
df_num_rating

Unnamed: 0,User-ID,Number_of_ratings
3160,11676,8524
27626,98391,5802
43027,153662,1969
52924,189835,1906
6510,23902,1395
...,...,...
23601,84129,1
46266,165812,1
9606,34231,1
46268,165826,1


In [10]:
top_users= df_num_rating['User-ID'][:1000]
# top 1500 users

##**To Predict User List and Batch Designation**

In [11]:
topredict_users= topredict['User-ID'].unique()
topredict_users

array([    17,     56,    114, ..., 278844, 278851, 278854])

In [12]:
len(topredict_users)

19935

**We divide the test into 20 folds because of memory limitations. We will predict the test values by loading in batches**

In [13]:
dict_batch= {}
for i in range(0,len(topredict_users),500):
  dict_batch['fold_{}'.format(i//500)]= topredict_users[i:i+500]

In [14]:
dict_batch #fold_0 to fold_19#

{'fold_0': array([  17,   56,  114,  160,  183,  242,  243,  254,  272,  289,  300,
         362,  383,  388,  392,  408,  424,  440,  441,  444,  446,  472,
         476,  487,  503,  505,  507,  566,  619,  625,  626,  638,  640,
         643,  651,  657,  695,  709,  726,  741,  744,  746,  753,  776,
         786,  805,  819,  850,  853,  882,  885,  896,  899,  900,  901,
         914,  929, 1009, 1021, 1025, 1031, 1063, 1075, 1083, 1096, 1113,
        1116, 1129, 1131, 1155, 1161, 1167, 1178, 1184, 1211, 1235, 1248,
        1249, 1254, 1261, 1297, 1343, 1376, 1409, 1412, 1421, 1424, 1435,
        1436, 1467, 1485, 1517, 1548, 1558, 1570, 1585, 1596, 1597, 1619,
        1652, 1660, 1667, 1674, 1688, 1696, 1733, 1790, 1791, 1797, 1805,
        1830, 1848, 1903, 1990, 2009, 2010, 2024, 2030, 2033, 2041, 2046,
        2084, 2090, 2103, 2106, 2110, 2132, 2134, 2179, 2189, 2197, 2203,
        2240, 2255, 2276, 2287, 2288, 2313, 2333, 2337, 2354, 2358, 2363,
        2385, 2399, 2404, 24

In [15]:
# this is the matrix factorization function from the MF notebook. 
def MF(M,k,max_it,lambd,mu):
    n=M.size()[0]
    m= M.size()[1]
    nonzero=len(M.nonzero())
    index= M.nonzero().split(1, dim=1)
    #param=torch.rand(n*k+k*m,dtype=float,requires_grad=True)
    param1=torch.rand((n,k),dtype=torch.float,requires_grad=True)
    param2=torch.rand((k,m),dtype=torch.float,requires_grad=True)

    opt1= torch.optim.Adam([param1],lr=0.1)
    opt2= torch.optim.Adam([param2],lr=0.1)
    
    #scheduler1= ReduceLROnPlateau(opt1, 'min') 
    #scheduler2 = ReduceLROnPlateau(opt2, 'min')
    

    #def get_loss(params,params_hat):
        #return torch.sum(torch.square(params- params_hat))

    def run_iterations(max_it):
        loss_record=[]
        converged=False
        for it in tqdm(range(max_it)):
            if it%2==0:
                opt1.zero_grad(set_to_none=True)
                #torch.matmul(param[:n*k].reshape(n,k), pam[n*k:].reshape(k,m))
                loss=torch.sum(torch.square(torch.matmul(param1, param2)[index]- M[index])) + lambd*torch.sum(torch.square(param1))+mu*torch.sum(torch.square(param2))
                loss_record.append(loss.item())
                loss.backward()
                opt1.step()
                #scheduler1.step(loss)
            else:
                opt2.zero_grad(set_to_none=True)
                #torch.matmul(param[:n*k].reshape(n,k), pam[n*k:].reshape(k,m))
                loss=torch.sum(torch.square(torch.matmul(param1, param2)[index]- M[index])) + lambd*torch.sum(torch.square(param1))+mu*torch.sum(torch.square(param2))
                loss_record.append(loss.item())
                loss.backward()
                opt2.step()
                #scheduler2.step(loss)
        display("loss record: {}".format(loss_record[-1]))
        return torch.matmul(param1,param2)
    return run_iterations(max_it)

**The following loop loads the ith batch of the train dataset and constructs a matrix using the ith batch and the subset of the train dataset containing the 1000 top users. Then it computes the matrix factorization prediction on the ith batch. It calculates the MSE for each batch in each iteration.**

In [None]:
mse_batch=[] # this will collect the batch MSE
indices_batch=[] # this will collect the indices of the test dataset in each batch
for i in range(len(dict_batch)): #length of dict_batch is 20 because we have 20 folds
  df_batch= topredict[topredict['User-ID'].isin(dict_batch['fold_{}'.format(i)])].reset_index(drop=True)
  df_matrix= df[df['User-ID'].isin(list(dict_batch['fold_{}'.format(i)])+list(top_users))].reset_index(drop=True) #combines data from 500 top users and the test batch
  mat= df_matrix.pivot(index='User-ID',columns='ISBN',values='Book-Rating') .fillna(0) # matrix using pivot
  matrix= torch.tensor(mat.values)/10 #converts to tensor and scales the values by 1/10
  dict_user= dict(zip(sorted(set(df_matrix['User-ID'])),range(len(sorted(set(df_matrix['User-ID']))))))
  dict_book= dict(zip(sorted(set(df_matrix['ISBN'])),range(len(sorted(set(df_matrix['ISBN']))))))
  index1=[] #this will collect the row index
  index2=[] #this will collect the column index
  for j in range(len(df_batch)):
    index1.append([dict_user[df_batch['User-ID'][j]]])
    index2.append([dict_book[df_batch['ISBN'][j]]])
  indices_topred = (torch.tensor(index1),torch.tensor(index1)) #this contains the indices from the matrix whose values we want to predict
  # this is a bit peculiar but torch tensors work this way
  df_actual= df_withoutzero[df_withoutzero['User-ID'].isin(list(dict_batch['fold_{}'.format(i)])+list(top_users))]
  # df_actual is the matrix with the ratings of the test dataset included
  # we need this to calculate MSE
  actual_matrix= df_actual.pivot(index='User-ID',columns='ISBN',values='Book-Rating') .fillna(0)
  actual_matrix= torch.tensor(actual_matrix.values)/10 # convert to tensor and scale like above

  mse_batch.append(100*torch.sum(torch.square(MF(matrix,10,1000,0.1,0.1)[indices_topred]- actual_matrix[indices_topred])))
  # we multiply by 10^2 because we scaled by 1/10 and squared
  # the values 25,0.1,0.1 come from the MF validation notebook where we performed validation for parameter tuning 
  indices_batch.append(len(indices_topred[0]))
  display("iteration: {}".format(i))
  display("rmse_batch: {}".format(torch.sqrt(mse_batch[-1]/indices_batch[-1])))
 






100%|██████████| 1000/1000 [08:52<00:00,  1.88it/s]


'loss record: 1610.5978465752178'

'iteration: 0'

'rmse_batch: 2.1254843011057507'

100%|██████████| 1000/1000 [09:24<00:00,  1.77it/s]


'loss record: 1606.9487527105289'

'iteration: 1'

'rmse_batch: 4.525947692023039'

100%|██████████| 1000/1000 [13:38<00:00,  1.22it/s]


'loss record: 1607.7146702535724'

'iteration: 2'

'rmse_batch: 2.2738697133313224'

100%|██████████| 1000/1000 [10:14<00:00,  1.63it/s]


'loss record: 1611.1625331235978'

'iteration: 3'

'rmse_batch: 2.5769078021088547'

100%|██████████| 1000/1000 [07:50<00:00,  2.13it/s]


'loss record: 1675.9527783808678'

'iteration: 4'

'rmse_batch: 2.798251983606105'

100%|██████████| 1000/1000 [09:29<00:00,  1.76it/s]


'loss record: 1604.9418272115804'

'iteration: 5'

'rmse_batch: 2.691210717162478'

100%|██████████| 1000/1000 [09:13<00:00,  1.81it/s]


'loss record: 5128.1798360404855'

'iteration: 6'

'rmse_batch: 2.251708220080328'

100%|██████████| 1000/1000 [08:59<00:00,  1.85it/s]


'loss record: 1603.6672188912353'

'iteration: 7'

'rmse_batch: 2.753602476073295'

100%|██████████| 1000/1000 [07:36<00:00,  2.19it/s]


'loss record: 1610.8497777876555'

'iteration: 8'

'rmse_batch: 2.5953651427834377'

100%|██████████| 1000/1000 [09:21<00:00,  1.78it/s]


'loss record: 1607.5339277536982'

'iteration: 9'

'rmse_batch: 2.7597517144412897'

100%|██████████| 1000/1000 [10:42<00:00,  1.56it/s]


'loss record: 1617.207006332279'

'iteration: 10'

'rmse_batch: 2.6539170494195456'

100%|██████████| 1000/1000 [11:03<00:00,  1.51it/s]


'loss record: 1624.41764786528'

'iteration: 11'

'rmse_batch: 3.1453464811352956'

100%|██████████| 1000/1000 [11:26<00:00,  1.46it/s]


'loss record: 1609.4743158704446'

'iteration: 12'

'rmse_batch: 2.3291899434309755'

100%|██████████| 1000/1000 [10:44<00:00,  1.55it/s]


'loss record: 1609.5477186836445'

'iteration: 13'

'rmse_batch: 2.197592074628841'

100%|██████████| 1000/1000 [11:20<00:00,  1.47it/s]


'loss record: 1705.2948380831149'

'iteration: 14'

'rmse_batch: 2.479030890103321'

100%|██████████| 1000/1000 [08:24<00:00,  1.98it/s]


'loss record: 1612.402821025319'

'iteration: 15'

'rmse_batch: 2.780036179923561'

100%|██████████| 1000/1000 [11:37<00:00,  1.43it/s]


'loss record: 1600.5886982504203'

'iteration: 16'

'rmse_batch: 2.995295811321224'

100%|██████████| 1000/1000 [10:51<00:00,  1.54it/s]


'loss record: 1599.3376715864183'

'iteration: 17'

'rmse_batch: 2.5895854805582856'

100%|██████████| 1000/1000 [10:38<00:00,  1.57it/s]


'loss record: 1605.5175210849925'

'iteration: 18'

'rmse_batch: 2.5474045188947554'

100%|██████████| 1000/1000 [10:39<00:00,  1.56it/s]


'loss record: 1601.0770619274078'

'iteration: 19'

'rmse_batch: 2.7878996670137384'

100%|██████████| 1000/1000 [07:28<00:00,  2.23it/s]


'loss record: 1614.1357931739458'

'iteration: 20'

'rmse_batch: 3.18674415495354'

100%|██████████| 1000/1000 [08:30<00:00,  1.96it/s]


'loss record: 1605.8902356752074'

'iteration: 21'

'rmse_batch: 2.4485921643575796'

100%|██████████| 1000/1000 [08:04<00:00,  2.06it/s]


'loss record: 1676.6681115369038'

'iteration: 22'

'rmse_batch: 3.1668472153008764'

100%|██████████| 1000/1000 [08:36<00:00,  1.94it/s]


'loss record: 1602.0135636955888'

'iteration: 23'

'rmse_batch: 2.428267488267286'

100%|██████████| 1000/1000 [09:23<00:00,  1.77it/s]


'loss record: 1640.004357332699'

'iteration: 24'

'rmse_batch: 2.454380826487022'

100%|██████████| 1000/1000 [11:18<00:00,  1.47it/s]


'loss record: 1613.361774647818'

'iteration: 25'

'rmse_batch: 2.6477776595152993'

100%|██████████| 1000/1000 [09:16<00:00,  1.80it/s]


'loss record: 1615.2236169431485'

'iteration: 26'

'rmse_batch: 2.611982194409302'

100%|██████████| 1000/1000 [09:16<00:00,  1.80it/s]


'loss record: 1604.9265616873315'

'iteration: 27'

'rmse_batch: 2.710613468508012'

100%|██████████| 1000/1000 [10:23<00:00,  1.60it/s]


'loss record: 1611.2282789340961'

'iteration: 28'

'rmse_batch: 2.729546650665154'

100%|██████████| 1000/1000 [10:57<00:00,  1.52it/s]


'loss record: 1607.2066332898899'

'iteration: 29'

'rmse_batch: 2.94725554846423'

100%|██████████| 1000/1000 [09:26<00:00,  1.76it/s]


'loss record: 1612.023383884756'

'iteration: 30'

'rmse_batch: 2.750856674361855'

100%|██████████| 1000/1000 [08:45<00:00,  1.90it/s]


'loss record: 1602.0696251775569'

'iteration: 31'

'rmse_batch: 2.420726671154348'

100%|██████████| 1000/1000 [07:35<00:00,  2.19it/s]


'loss record: 1600.1680276469897'

'iteration: 32'

'rmse_batch: 2.8793388449947876'

100%|██████████| 1000/1000 [07:54<00:00,  2.11it/s]


'loss record: 1611.0118737843231'

'iteration: 33'

'rmse_batch: 2.18368908319945'

100%|██████████| 1000/1000 [09:47<00:00,  1.70it/s]


'loss record: 1617.6640245336912'

'iteration: 34'

'rmse_batch: 2.4756169360381386'

100%|██████████| 1000/1000 [09:36<00:00,  1.74it/s]


'loss record: 1605.8110297148803'

'iteration: 35'

'rmse_batch: 2.837778644983536'

100%|██████████| 1000/1000 [09:23<00:00,  1.78it/s]


'loss record: 1589.1649763898017'

'iteration: 36'

'rmse_batch: 2.522839723983094'

  4%|▎         | 37/1000 [00:21<09:12,  1.74it/s]

In [None]:
with open("/content/drive/MyDrive/Colab Notebooks/mse_batch", "wb") as fp:   #Pickling
  pickle.dump(mse_batch, fp)

with open("/content/drive/MyDrive/Colab Notebooks/indices_num_batch", "wb") as fp:   #Pickling
  pickle.dump(indices_batch, fp)

In [None]:
mse= sum(mse_batch)/sum(indices_batch)

In [None]:
mse

In [None]:
rmse= np.sqrt(mse.detach())
rmse