<a href="https://colab.research.google.com/github/BharathSShankar/DSA4212_Assignments/blob/bharath-exp/XGBoostingOnResid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install scikit-surprise
!pip install implicit

In [None]:
# mount the Google Drive
from google.colab import drive
drive.mount("/content/drive")

import torch
import pandas as pd
import numpy as np
import xgboost
from torch import nn 
from tqdm.auto import tqdm
from scipy.sparse import csr_matrix
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

np.random.seed(42)

# goto to data folder -- you may need to change this location
%cd /content/drive/MyDrive/DSA4212/Assignment\ 2/assignment_2_data

Mounted at /content/drive
/content/drive/MyDrive/DSA4212/Assignment 2/assignment_2_data


In [None]:
anime_df = pd.read_csv("assignment_2_anime.csv")
train_df = pd.read_csv("assignment_2_ratings_train.csv")
test_df = pd.read_csv("assignment_2_ratings_test.csv")

In [None]:
# We take an average over the repeated values, this is because we are unsure if keeping the last one is really the best
print("Before before handling duplicates: " + str(train_df[train_df.duplicated(["user_id","anime_id"])].empty))
train_df = train_df.groupby(["user_id","anime_id"]).mean().reset_index()
print("After before handling duplicates: " + str(train_df[train_df.duplicated(["user_id","anime_id"])].empty)) # check duplicate

Before before handling duplicates: False
After before handling duplicates: True


In [None]:
train_df = train_df.sort_values(by = ["user_id", "anime_id"])

In [None]:
# Fill in unknown values
anime_df["type"] = anime_df["type"].fillna("Unknown")

In [None]:
# Create a Reader object for parsing the ratings dataframes
reader = Reader(rating_scale=(1, 10))

# Load trainset and testset from your pre-split rtrain and rtest dataframes
train_data = Dataset.load_from_df(train_df, reader)
trainset = train_data.build_full_trainset()

test_data = Dataset.load_from_df(test_df, reader)
testset = test_data.construct_testset(raw_testset=test_data.raw_ratings)

In [None]:
# Train the SVD algorithm on the trainset
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f76de870a00>

In [None]:
svd.qi.shape

(9632, 100)

In [None]:
anime_df["genre"] = anime_df["genre"].fillna("Other")
genre_set = set()
for i in range(len(anime_df)):
    genre_set.update(anime_df.loc[i, "genre"].split(", "))
genre_dict = {genre: i for i , genre in enumerate(genre_set)}
idx_dict = {i:genre for genre, i in genre_dict.items()}
cols_list = [idx_dict[i] for i in range(len(genre_dict))]
genre_mat = np.zeros((len(anime_df), len(genre_set)))
for i in range(len(anime_df)):
    for j in anime_df.loc[i, "genre"].split(", "):
        genre_mat[i, genre_dict[j]] += 1
anime_df[cols_list] = genre_mat

In [None]:
anime_data_sel = anime_df.iloc[:,  4:]
anime_data_sel["anime_id"] = anime_df["anime_id"]

In [None]:
anime_data_sel = pd.concat([pd.get_dummies(anime_df.type), anime_data_sel], axis = 1).sort_values("anime_id")

In [None]:
class MatrixFactorization(torch.nn.Module):
  def __init__(self, n_users, n_items, n_factors=100):
    super().__init__()
    self.user_factors = nn.Embedding(n_users, n_factors)
    self.item_factors = nn.Embedding(n_items, n_factors)
    self.user_biases = nn.Embedding(n_users, 1)
    self.item_biases = nn.Embedding(n_items, 1)

  def load(self, algo, mu):
    self.mu = mu

    # Convert the NumPy array to a PyTorch tensor
    pu = torch.from_numpy(algo.pu)
    qi = torch.from_numpy(algo.qi)
    bu = torch.from_numpy(algo.bu[...,np.newaxis])
    bi = torch.from_numpy(algo.bi[...,np.newaxis])

    # Initialize the Linear layer's weight using the torch_weights tensor
    self.user_factors.weight.data = pu
    self.item_factors.weight.data = qi
    self.user_biases.weight.data = bu
    self.item_biases.weight.data = bi

  def forward(self, user, item):
    pred = torch.clip(torch.diagonal(self.mu + self.user_biases(user) + self.item_biases(item) + ((self.user_factors(user) * self.item_factors(item)).sum(dim=1))), 1, 10)
    return pred.squeeze()

In [None]:
baseModel = MatrixFactorization(trainset.n_users,
                    trainset.n_items)
baseModel.load(svd, trainset.global_mean)

In [None]:
user_train = torch.tensor([trainset._raw2inner_id_users[i] for i in train_df["user_id"]])
anime_train = torch.tensor([trainset._raw2inner_id_items[i] for i in train_df["anime_id"]])
rating_train = torch.tensor(np.array(train_df["rating"]))

def process_in_batches(batch_size, user_train, anime_train, rating_train, model):
    num_data = len(user_train)
    num_batches = num_data // batch_size + (num_data % batch_size > 0)
    
    all_results = []
    pred_ = []
    for batch_idx in tqdm(range(num_batches)):
        batch_start = batch_idx * batch_size
        batch_end = min((batch_idx + 1) * batch_size, num_data)
        
        batch_user_train = user_train[batch_start:batch_end]
        batch_anime_train = anime_train[batch_start:batch_end]
        batch_rating_train = rating_train[batch_start:batch_end]
        
        pred = model(batch_user_train, batch_anime_train).detach().numpy()
        batch_result = batch_rating_train - pred
        all_results.append(batch_result)
        pred_.append(pred)
    
    return  np.concatenate(pred_), np.concatenate(all_results)

# Example usage
batch_size = 1028
pred, res_train = process_in_batches(batch_size, user_train, anime_train, rating_train, baseModel)
print(np.mean(res_train**2))

  0%|          | 0/4316 [00:00<?, ?it/s]

0.56176362771981


In [36]:
svd.test(testset)

[Prediction(uid=44017, iid=13161, r_ui=4.0, est=7.153942313722943, details={'was_impossible': False}),
 Prediction(uid=14307, iid=14993, r_ui=7.0, est=7.171747106129959, details={'was_impossible': False}),
 Prediction(uid=55155, iid=268, r_ui=9.0, est=9.191621546036489, details={'was_impossible': False}),
 Prediction(uid=63515, iid=2889, r_ui=9.0, est=8.400810698931489, details={'was_impossible': False}),
 Prediction(uid=54059, iid=2581, r_ui=7.0, est=8.355483823787806, details={'was_impossible': False}),
 Prediction(uid=7042, iid=4654, r_ui=7.0, est=7.52615969000341, details={'was_impossible': False}),
 Prediction(uid=29732, iid=513, r_ui=7.0, est=7.9493155003408065, details={'was_impossible': False}),
 Prediction(uid=31633, iid=22199, r_ui=6.0, est=7.247420331706366, details={'was_impossible': False}),
 Prediction(uid=46380, iid=934, r_ui=10.0, est=8.108651634300948, details={'was_impossible': False}),
 Prediction(uid=8092, iid=1535, r_ui=7.0, est=8.951783840133809, details={'was_imp

In [None]:
train_df["pred"] = pred
train_df["resid"] = res_train

In [None]:
train_df[:48].merge(anime_data_sel, how = "left", on = ["anime_id"])

Unnamed: 0,user_id,anime_id,rating_x,pred,resid,Movie,Music,ONA,OVA,Special,...,Kids,Police,Comedy,Josei,Space,Seinen,Dementia,Psychological,Slice of Life,Shoujo
0,1,11617,10.0,9.493171,0.506829,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,11757,10.0,9.997052,0.002948,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,15451,10.0,9.744576,0.255424,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,20,8.0,8.137201,-0.137201,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,154,6.0,6.990557,-0.990557,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,3,199,10.0,9.283291,0.716709,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,3,225,9.0,8.446375,0.553625,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,3,341,6.0,7.022913,-1.022913,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,3,430,7.0,7.525649,-0.525649,1,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,3,527,7.0,7.416495,-0.416495,0,0,0,0,0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
class Iterator(xgboost.DataIter):
  def __init__(self, train_dat, anime_data):
    self.train_dat = train_dat
    self.anime_data = anime_data
    self.batch_size = 2048
    self._it = 0
    super().__init__()

  def next(self, input_data):
    """Advance the iterator by 1 step and pass the data to XGBoost.  This function is
    called by XGBoost during the construction of ``DMatrix``

    """
    if self._it == len(self.train_dat)//self.batch_size:
      # return 0 to let XGBoost know this is the end of iteration
      return 0

    # input_data is a function passed in by XGBoost who has the exact same signature of
    # ``DMatrix``
    new_data = self.train_dat[self._it * self.batch_size: (self._it + 1) * self.batch_size].merge(self.anime_data, how = "left", on = ["anime_id"])
    X = new_data.iloc[:, 5:]
    y = new_data.loc[:, "resid"]
    self._it += 1
    # Return 1 to let XGBoost know we haven't seen all the files yet.
    return 1

  def reset(self):
    """Reset the iterator to its beginning"""
    self._it = 0

In [None]:
batch_size=2048
number_of_steps=train_df.shape[0]//batch_size

clf = xgboost.XGBRegressor(max_depth=200, n_estimators=400, subsample=1, learning_rate=0.07, reg_lambda=0.1, reg_alpha=0.1,\
                       gamma=1)
 
for step in tqdm(range(number_of_steps)):
    start=step*batch_size
    end=(step + 1)*(batch_size)
    X = train_df[start: end].merge(anime_data_sel, how = "left", on = ["anime_id"])
    current_x=X.iloc[:, 5:].to_numpy()
    current_y=X.loc[:, "resid"].to_numpy() 
    clf.fit(current_x, current_y)

  0%|          | 0/2166 [00:00<?, ?it/s]