In [25]:
# loading packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from itertools import product
from collections import namedtuple
from sklearn.model_selection import train_test_split
from typing import Iterable, Any, Dict

import seaborn as sb

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size, given to matplotlib as (width, height).
rcParams['figure.figsize'] = 8, 6
# Apply seaborn's 'whitegrid' style to subsequent plots.
sb.set_style('whitegrid')

# Data Preparation

In [3]:
# Locations of the input datasets.

# Path handed to pd.read_csv in the loading cell below.
address2 = '/cdnlogs/batch3/1-1.bz2.ano.bz2'
# Alternative CSV path; in the visible notebook it is only used by commented-out code.
address = 'CSV/videos.csv'

In [4]:
# Full column layout of the log file, in file order. Positions 0, 14 and 16
# (timestamp, uid, livechannel) are the ones actually loaded below.
allcols = ['timestamp','statuscode','contentlength','host','timefirstbyte','timetoserv','hit','contenttype',
           'cachecontrol','cachename','popname','method','protocol','path','uid','sid','livechannel',
           'contentpackage','assetnumber','maxage','coordinates','devicebrand','devicefamily','devicemodel',
           'osfamily','uafamily','uamajor','manifest','fragment']
somecols = ['timestamp','hit','contenttype','uid','livechannel',
            'contentpackage']

# Tokens that should be parsed as missing values.
missing_values = ['n/a','na','--','NaN','NA','-']

# Load the first 10M rows, keeping three columns selected by position and
# renamed through `names`; column 0 is parsed as a datetime.
cdnset2 = pd.read_csv(address2,
                     header=0,
                     nrows=10000000,
                     parse_dates=[0],
                     comment='#',
                     usecols=[0,14,16],
                     names=['timestamp','uid','livechannel'],
                     na_values=missing_values
                    )
# DataFrame.info() prints its own summary and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
cdnset2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 3 columns):
 #   Column       Dtype         
---  ------       -----         
 0   timestamp    datetime64[ns]
 1   uid          float64       
 2   livechannel  float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 228.9 MB
None


In [5]:
# Select the three working columns from the loaded frame.
cdnset = cdnset2.loc[:, ['timestamp', 'uid', 'livechannel']]

In [None]:
# cdnset = pd.read_csv(address,  na_values=missing_values)[['timestamp', 'uid', 'livechannel']]
# cdnset = pd.read_csv(address,  na_values=missing_values)

In [6]:
# Print column dtypes and memory usage of the selected frame.
cdnset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 3 columns):
 #   Column       Dtype         
---  ------       -----         
 0   timestamp    datetime64[ns]
 1   uid          float64       
 2   livechannel  float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 228.9 MB


In [7]:
# Preview the first two rows.
cdnset.head(2)

Unnamed: 0,timestamp,uid,livechannel
0,2088-05-13 06:59:46,,
1,2088-05-13 06:59:46,,


### Removing missing values

In [8]:
# Drop every row that has a missing value in any of the selected columns.
cdnset = cdnset.dropna(axis=0)

In [9]:
# (rows, columns) remaining after dropping rows with missing values.
cdnset.shape

(4982769, 3)

In [10]:
# Reorder the columns so the two grouping keys come first.
df_cdn_subset = cdnset.loc[:, ['uid', 'livechannel', 'timestamp']]

In [11]:
# Count rows per (uid, livechannel) pair; the count lands in the 'timestamp' column.
df_cdn_frequency = df_cdn_subset.groupby(by=['uid', 'livechannel'], as_index=False).count()

In [12]:
# Cast every column to int and give the count column a descriptive name,
# in a single chained expression instead of an in-place rename.
df_cdn_frequency = (
    df_cdn_frequency
    .astype(int)
    .rename(columns={'timestamp': 'requests'})
)

In [13]:
# Display the per-(uid, livechannel) request counts.
df_cdn_frequency

Unnamed: 0,uid,livechannel,requests
0,0,0,16634
1,1,1,18038
2,2,2,3019
3,3,3,18034
4,4,3,406
...,...,...,...
1975,1064,8,5
1976,1065,79,100
1977,1067,116,29
1978,1068,2,8


In [15]:
# Largest request count of any (uid, livechannel) pair; the rating cell below
# uses it as the reference value for its thresholds.
kubwa = df_cdn_frequency['requests'].max()
print(kubwa)

24531


In [17]:
# Bucket the request counts into ratings relative to the maximum (kubwa).
# Conditions are evaluated in order and the first match wins:
#   == max -> 5, >= 75% -> 4, >= 50% -> 3, >= 25% -> 2, > 1 -> 1,
# and anything else keeps its own request count as the rating.
df_cdn_frequency['rating'] = np.select(
    condlist=[
        df_cdn_frequency['requests'] == kubwa,
        df_cdn_frequency['requests'] >= 0.75 * kubwa,
        df_cdn_frequency['requests'] >= 0.5 * kubwa,
        df_cdn_frequency['requests'] >= 0.25 * kubwa,
        df_cdn_frequency['requests'] > 1,
    ],
    choicelist=[5, 4, 3, 2, 1],
    default=df_cdn_frequency['requests'],
)

In [18]:
# Display the frame with the new 'rating' column.
df_cdn_frequency

Unnamed: 0,uid,livechannel,requests,rating
0,0,0,16634,3
1,1,1,18038,3
2,2,2,3019,1
3,3,3,18034,3
4,4,3,406,1
...,...,...,...,...
1975,1064,8,5,1
1976,1065,79,100,1
1977,1067,116,29,1
1978,1068,2,8,1


In [30]:
# Bind a second name to the same DataFrame object (no copy is made, so later
# column additions to df_cdn_frequency are visible through df as well).
df = df_cdn_frequency

# Log Scale

In [19]:
# Base-10 logarithm of the request count.
df_cdn_frequency['log_freq'] = np.log10(df_cdn_frequency['requests'])

In [20]:
# Display the frame with the new 'log_freq' column.
df_cdn_frequency

Unnamed: 0,uid,livechannel,requests,rating,log_freq
0,0,0,16634,3,4.220997
1,1,1,18038,3,4.256188
2,2,2,3019,1,3.479863
3,3,3,18034,3,4.256092
4,4,3,406,1,2.608526
...,...,...,...,...,...
1975,1064,8,5,1,0.698970
1976,1065,79,100,1,2.000000
1977,1067,116,29,1,1.462398
1978,1068,2,8,1,0.903090


In [21]:
# Round the log-frequency to the nearest integer to obtain a discrete rating.
df_cdn_frequency['log_rating'] = np.rint(df_cdn_frequency['log_freq']).astype(int)

In [22]:
# Display the frame with the new 'log_rating' column.
df_cdn_frequency

Unnamed: 0,uid,livechannel,requests,rating,log_freq,log_rating
0,0,0,16634,3,4.220997,4
1,1,1,18038,3,4.256188,4
2,2,2,3019,1,3.479863,3
3,3,3,18034,3,4.256092,4
4,4,3,406,1,2.608526,3
...,...,...,...,...,...,...
1975,1064,8,5,1,0.698970,1
1976,1065,79,100,1,2.000000,2
1977,1067,116,29,1,1.462398,1
1978,1068,2,8,1,0.903090,1


In [27]:
# Train-valid-test split

def split_data(data, valid_ratio=0.2, test_ratio=0.2, random_state=42):
    """
    Split `data` into train, validation and test parts.

    The split is done in two steps with sklearn's train_test_split: first the
    training part is separated from the rest, then the rest is divided into
    validation and test according to their relative sizes.

    Arguments
    - data               : the samples to split (anything train_test_split accepts)
    - valid_ratio (float): fraction of `data` used for validation
    - test_ratio (float) : fraction of `data` used for testing
    - random_state       : seed forwarded to both train_test_split calls

    Returns
    - (train, valid, test)

    Raises
    - ValueError: if either ratio is not strictly positive or the two ratios
      do not leave room for a training part.
    """
    # Fail fast with a clear message. Without this check the same inputs
    # surface as an error deep inside train_test_split or a division by zero.
    if valid_ratio <= 0 or test_ratio <= 0 or valid_ratio + test_ratio >= 1:
        raise ValueError(
            'valid_ratio and test_ratio must both be > 0 and sum to less than 1, '
            'got valid_ratio={!r}, test_ratio={!r}'.format(valid_ratio, test_ratio)
        )

    train_ratio = 1 - valid_ratio - test_ratio
    train, remain = train_test_split(data, test_size=1-train_ratio, random_state=random_state)
    # Within the held-out remainder, the test share is relative to valid + test.
    valid, test = train_test_split(remain, test_size=test_ratio / (test_ratio + valid_ratio), random_state=random_state)

    return train, valid, test

## Matrix Factorization

### Building user-item matrix and understanding the data

### Matrix Factorization Class

In [23]:
# One record per training epoch: zero-based epoch index plus the two scores.
epoch_score = namedtuple('epoch_score', ['epoch', 'train_score', 'valid_score'])


class MatrixFactorization:
    """
    Biased matrix factorization trained with stochastic gradient descent.

    A rating is modelled as ``b + b_u[i] + b_i[j] + P[i] . Q[j]``. Entries that
    are zero in a rating matrix are treated as "not observed": training uses
    the strictly positive entries of R_train and scoring uses the non-zero
    entries of the matrix being scored.
    """

    def __init__(self, R_train, R_valid, R_test, K=None, alpha=None, beta=None, n_iter=10, verbose=True):
        """
        Store the data and hyper-parameters; no training happens here.

        Arguments
        - R_train (ndarray): user-item rating matrix used for training
        - R_valid (ndarray): user-item rating matrix used for validation
        - R_test (ndarray) : user-item rating matrix used for testing
        - K (int)          : number of latent dimensions
        - alpha (float)    : learning rate
        - beta (float)     : regularization parameter
        - n_iter (int)     : number of passes over the training samples
        - verbose (bool)   : print the scores after every 10th iteration

        K, alpha and beta may be left as None here and supplied later through
        set_params(), but they must be set before train() is called.
        The number of users and items is taken from R_train, so R_valid and
        R_test must have the same shape as R_train.
        """

        self.R_train = R_train
        self.R_valid = R_valid
        self.R_test = R_test
        self.num_users, self.num_items = R_train.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.n_iter = n_iter
        self.verbose = verbose

    def __initialize_biases(self):
        """
        Initialize the biases: zero per-user and per-item biases, and a global
        bias equal to the mean of the non-zero training entries.
        """
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R_train[np.where(self.R_train != 0)])

    def __initialize_matrices(self):
        """
        Initialize the user (P) and item (Q) latent feature matrices with
        normally distributed values of scale 1/K (drawn from numpy's global
        random state).
        """
        self.P = np.random.normal(scale=1. / self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1. / self.K, size=(self.num_items, self.K))

    def __create_training_samples(self):
        """
        Build the list of (user index, item index, rating) triples for every
        strictly positive entry of the training matrix.
        """
        self.samples = [
            (i, j, self.R_train[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R_train[i, j] > 0
        ]

    def __sgd(self):
        """
        Perform one pass of stochastic gradient descent over self.samples.
        """
        for i, j, r in self.samples:
            # Compute prediction and error
            prediction = self.predict(i, j)
            e = (r - prediction)

            # Update biases
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Update user and item latent feature matrices. Both gradients are
            # evaluated at the same point, so the item update must use the user
            # row as it was *before* this step, not the freshly updated one.
            p_i = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * p_i - self.beta * self.Q[j, :])

    def set_params(self, **params):
        """
        Set the parameters of the model.

        Every keyword is stored as an attribute of the same name (e.g. K,
        alpha, beta, n_iter, verbose). Names are not validated.
        """
        self.__dict__.update(params)

    def score_function(self, matrix):
        """
        Compute the score of the model on `matrix`: the root mean squared
        error (RMSE) over the non-zero entries of `matrix`.

        The squared errors are averaged over the number of observed entries,
        so scores of matrices with different numbers of observations (e.g.
        train vs. validation) are comparable. Returns 0.0 when `matrix` has
        no non-zero entry.

        Raises
        - ValueError: if `matrix` does not have the shape of the training
          matrix, because predictions are looked up by position.
        """
        matrix = np.asarray(matrix)
        expected_shape = (self.num_users, self.num_items)
        if matrix.shape != expected_shape:
            raise ValueError(
                'matrix has shape {} but the model was built for shape {}'.format(
                    matrix.shape, expected_shape))

        observed = matrix != 0
        if not observed.any():
            return 0.0
        residuals = matrix[observed] - self.full_matrix()[observed]
        return np.sqrt(np.mean(np.square(residuals)))

    def train_score(self):
        """
        Compute the score (RMSE) of the model on the training matrix
        """
        return self.score_function(self.R_train)

    def test_score(self):
        """
        Compute the score (RMSE) of the model on the test matrix
        """
        return self.score_function(self.R_test)

    def valid_score(self):
        """
        Compute the score (RMSE) of the model on the validation matrix
        """
        return self.score_function(self.R_valid)

    def train(self):
        """
        Train the model from scratch and return the training history.

        Latent matrices, biases and the sample list are re-initialized on
        every call. Each of the n_iter iterations shuffles the samples (using
        numpy's global random state), runs one SGD pass and records the train
        and validation scores.

        Returns
        - list of epoch_score, one per iteration (epoch index is zero-based)

        Raises
        - ValueError: if K, alpha or beta has not been set.
        """
        missing = [name for name in ('K', 'alpha', 'beta') if getattr(self, name) is None]
        if missing:
            raise ValueError(
                'Hyper-parameters not set: {}. Pass them to the constructor '
                'or call set_params() before train().'.format(', '.join(missing)))

        # Initialize user and item latent feature matrices
        self.__initialize_matrices()

        # Initialize biases
        self.__initialize_biases()

        # Create a list of training samples
        self.__create_training_samples()

        # Perform stochastic gradient descent for number of iterations
        training_process = []
        for i in range(self.n_iter):
            np.random.shuffle(self.samples)
            self.__sgd()
            train_score = self.train_score()
            valid_score = self.valid_score()
            training_process.append(epoch_score(epoch=i, train_score=train_score, valid_score=valid_score))
            if (i + 1) % 10 == 0 and self.verbose:
                print('Iteration: {} ; Training error {}; Validation error {}'.format(i + 1, train_score, valid_score))

        return training_process

    def predict(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    def full_matrix(self):
        """
        Compute the full (num_users x num_items) prediction matrix using the
        resultant biases, P and Q
        """
        # b_u is broadcast down the rows and b_i across the columns.
        return self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :] + self.P.dot(self.Q.T)


## Custom GridSearch Class

In [26]:
# One record per parameter combination: the parameters and the full training history.
result_tuple = namedtuple('result_tuple', ['parameters', 'scores'])


class CustomGridSearch:
    """
    Exhaustive search over a parameter grid for an estimator that exposes
    ``set_params(**params)`` and ``train()``, where ``train()`` returns a
    sequence of per-epoch records with a ``valid_score`` attribute.

    The best combination is the one with the lowest validation score in the
    last epoch.
    """

    def __init__(self, estimator, param_grid: Dict[str, Iterable[Any]]):
        """
        Arguments
        - estimator : object with set_params(**params) and train()
        - param_grid: mapping of parameter name to the values to try
        """
        self.estimator = estimator
        self.param_grid = param_grid
        self.cv_results = list()
        self.best_params = None
        self.best_score = None

    def __parameters_grid_generator(self) -> Iterable[Dict[str, Any]]:
        """
        Yields one dictionary per combination of parameter values
        (the cartesian product of the grid's value lists)
        """
        for params in product(*self.param_grid.values()):
            yield dict(zip(self.param_grid.keys(), params))

    def fit(self):
        """
        Trains the estimator with all possible combinations of parameters.

        Results from a previous call are discarded first, so re-running fit()
        (e.g. re-executing the notebook cell) does not duplicate entries in
        cv_results or keep a stale best score. After the call, cv_results
        holds one result_tuple per combination, and best_params / best_score
        describe the combination with the lowest last-epoch validation score
        (both stay None if the grid is empty).
        """
        self.cv_results = list()
        self.best_params = None
        self.best_score = None

        for params in self.__parameters_grid_generator():
            self.estimator.set_params(**params)
            print(f'Training with params: {params}')
            training_process = self.estimator.train()
            last_epoch_valid_score = training_process[-1].valid_score
            if self.best_score is None or last_epoch_valid_score < self.best_score:
                self.best_score = last_epoch_valid_score
                self.best_params = params
            self.cv_results.append(result_tuple(parameters=params, scores=training_process))


### Doing the GridSearch

In [None]:
# Split the per-(uid, livechannel) rows into train, validation and test sets
# using split_data's default ratios and seed.
train, valid, test = split_data(df_cdn_frequency)

In [None]:
# Display the training split.
train

In [None]:
# print(freq_tab_arr)
# print('num users:',len(freq_tab_arr))
# print('num items:',len(freq_tab_arr[0]))

In [None]:
# Build one rating matrix per split.
# NOTE(review): construct_frequency_table and scale_frequencies are not defined
# in the visible part of this notebook — confirm they exist before this cell runs.
# NOTE(review): each split is converted separately; confirm the three results
# share the same shape and row/column alignment, since MatrixFactorization
# takes num_users/num_items from R_train only.
R_train = scale_frequencies(construct_frequency_table(train))
R_valid = scale_frequencies(construct_frequency_table(valid))
R_test = scale_frequencies(construct_frequency_table(test))

In [None]:
# Show the training matrix and its dimensions (rows = users, columns = items).
print(R_train)
print('num users:',len(R_train))
print('num items:',len(R_train[0]))

In [None]:
# Defining k_max: the smaller of the two matrix dimensions, used as the
# exclusive upper bound for K in the parameter grids below.
k_max = min(len(R_train), len(R_train[0]))
k_max

### Round 1

In [None]:
# Round 1 grid: K from 120 up to (but excluding) k_max,
# alpha in 0.001..0.009 and beta in 0.01..0.09.
params_grid = dict(
    K=list(range(120, k_max)),
    alpha=[step / 1000 for step in range(1, 10)],
    beta=[step / 100 for step in range(1, 10)],
)

In [None]:
# Initializing the model; K, alpha and beta are left unset here and are
# supplied by the grid search through set_params().
model = MatrixFactorization(R_train, R_valid, R_test, n_iter=100)

In [None]:
# Wrap the model and the round 1 grid in a grid search.
gridsearch = CustomGridSearch(model, params_grid)

In [None]:
# Train the model once per parameter combination in the grid.
gridsearch.fit()

In [None]:
# Lowest last-epoch validation score found in round 1.
gridsearch.best_score

In [None]:
# Parameter combination that produced the best round 1 score.
gridsearch.best_params

### Round 2

In [None]:
# Round 2 grid: every K from 1 up to (but excluding) k_max,
# three larger learning rates and a single regularization value.
params_grid = dict(
    K=list(range(1, k_max)),
    alpha=[0.1, 0.2, 0.3],
    beta=[0.02],
)

In [None]:
# Initializing a fresh model for round 2; K, alpha and beta are supplied by
# the grid search through set_params().
model = MatrixFactorization(R_train, R_valid, R_test, n_iter=100)

In [None]:
# Wrap the model and the round 2 grid in a grid search.
gridsearch = CustomGridSearch(model, params_grid)

In [None]:
# Train the model once per parameter combination in the grid.
gridsearch.fit()

In [None]:
# Lowest last-epoch validation score found in round 2.
gridsearch.best_score

In [None]:
# Parameter combination that produced the best round 2 score.
gridsearch.best_params

## Train a model with the recommended parameters

In [29]:
# Hyper-parameters for the final model. They are hard-coded here rather than
# read from gridsearch.best_params.
# NOTE(review): beta=1e-05 is not a value either grid above tries — confirm where it comes from.
params = {'K': 50, 'alpha': 0.2, 'beta': 1e-05}

In [None]:
# MatrixFactorization takes the three split matrices (R_train, R_valid, R_test);
# it has no `R` keyword, so the previous call failed with a TypeError.
best_model = MatrixFactorization(R_train, R_valid, R_test, n_iter=100, **params)

In [None]:
# Train the final model; the result is a list of per-epoch
# (epoch, train_score, valid_score) records.
training_process = best_model.train()

In [None]:
# Plot the training process. Each record has three fields
# (epoch, train_score, valid_score), so unpack all three and draw one curve
# per score.
epochs, train_scores, valid_scores = zip(*training_process)
plt.figure(figsize=(15,7))
plt.plot(epochs, train_scores, label='train')
plt.plot(epochs, valid_scores, label='validation')
plt.xlabel('epoch')
plt.ylabel('score')
# One tick every 5 epochs, covering however many epochs were actually run.
plt.xticks(np.arange(0, len(epochs) + 1, 5))
plt.legend()
plt.show()