In [1]:
# preprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [12]:
address = '/cdnlogs/batch3/1-1.bz2.ano.bz2'
allcols = ['timestamp','statuscode','contentlength','host','timefirstbyte','timetoserv','hit','contenttype',
           'cachecontrol','cachename','popname','method','protocol','path','uid','sid','livechannel',
           'contentpackage','assetnumber','maxage','coordinates','devicebrand','devicefamily','devicemodel',
           'osfamily','uafamily','uamajor','manifest','fragment']
missing_values = ['n/a','na','--','NaN','NA','-']
df = pd.read_csv(address,
                     header=0,
                     nrows=10000000,
                     parse_dates=[0],
                     comment='#',
                     names=allcols,
                     na_values=missing_values
                    )
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 29 columns):
 #   Column          Dtype         
---  ------          -----         
 0   timestamp       datetime64[ns]
 1   statuscode      int64         
 2   contentlength   float64       
 3   host            int64         
 4   timefirstbyte   float64       
 5   timetoserv      float64       
 6   hit             object        
 7   contenttype     object        
 8   cachecontrol    object        
 9   cachename       int64         
 10  popname         int64         
 11  method          object        
 12  protocol        object        
 13  path            int64         
 14  uid             float64       
 15  sid             float64       
 16  livechannel     float64       
 17  contentpackage  float64       
 18  assetnumber     float64       
 19  maxage          float64       
 20  coordinates     float64       
 21  devicebrand     float64       
 22  devicefamily    f

In [13]:
# selecting features pertaining to matrix factorization for rating prediction
df = df[['timestamp','uid','livechannel']]
# dropping null values
df_clean = df.dropna(subset=['uid'])
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4982835 entries, 2 to 9999998
Data columns (total 3 columns):
 #   Column       Dtype         
---  ------       -----         
 0   timestamp    datetime64[ns]
 1   uid          float64       
 2   livechannel  float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 152.1 MB


In [14]:
df_cdn = df_clean.groupby(['uid', 'livechannel'], as_index=False).count()
df_cdn = df_cdn.astype(int)
df_cdn.rename(columns={'timestamp':'requests'}, inplace=True)
df_cdn.head(2)

Unnamed: 0,uid,livechannel,requests
0,0,0,16634
1,1,1,18038


In [15]:
df_cdn['log_requests'] = np.rint(np.log(df_cdn['requests'])).astype(int)
df_cdn.head(3)

Unnamed: 0,uid,livechannel,requests,log_requests
0,0,0,16634,10
1,1,1,18038,10
2,2,2,3019,8


In [16]:
# this will be reviewed, idea is request = 1 then log_scale = 1 as well
df_cdn['log_requests'] = np.where(df_cdn['requests']==1, 1, df_cdn['log_requests'])

In [17]:
df_cdn['log_requests'] = df_cdn['log_requests'].astype(int)
df_cdn

Unnamed: 0,uid,livechannel,requests,log_requests
0,0,0,16634,10
1,1,1,18038,10
2,2,2,3019,8
3,3,3,18034,10
4,4,3,406,6
...,...,...,...,...
1975,1064,8,5,2
1976,1065,79,100,5
1977,1067,116,29,3
1978,1068,2,8,2


In [18]:
df_matrix = df_cdn[['uid','livechannel','log_requests']]

In [19]:
df_matrix

Unnamed: 0,uid,livechannel,log_requests
0,0,0,10
1,1,1,10
2,2,2,8
3,3,3,10
4,4,3,6
...,...,...,...
1975,1064,8,2
1976,1065,79,5
1977,1067,116,3
1978,1068,2,2


In [22]:
print('unique uid: ',len(df_matrix.uid.unique()))
print('unique livetv: ',len(df_matrix.livechannel.unique()))

unique uid:  1038
unique livetv:  149


In [23]:
# splitting the datasets into train and test sets
X_train, X_test = train_test_split(df_matrix, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)

(1386, 3)
(594, 3)


In [24]:
print('total records',len(X_train)+len(X_test))
print(len(df_matrix))

total records 1980
1980


In [25]:
X_train.to_csv('../CSV/LTV_train.csv', index = False, header = False)
X_test.to_csv('../CSV/LTV_test.csv', index = False, header = False)

In [73]:
# import sys
# sys.path.append("/home/jovyan/work/MyMediaLite-3.11/lib/mymedialite")

import clr
clr.AddReference('/home/jovyan/work/MyMediaLite-3.11/lib/mymedialite/MyMediaLite.dll')
# from MyMediaLite import *
from MyMediaLite import IO
from MyMediaLite import RatingPrediction
from MyMediaLite import Eval


# load the data
train_data = IO.RatingData.Read('../CSV/VoD_train.csv')
test_data  = IO.RatingData.Read('../CSV/VoD_test.csv')

# set up the recommender
# recommender = RatingPrediction.UserItemBaseline() # don't forget ()
recommender = RatingPrediction.BiasedMatrixFactorization()

# recommender.RegU = 0.05
# recommender.RegI = 0.05
recommender.Regularization = 0.05
recommender.LearnRate = 0.07
recommender.NumFactors = 52
recommender.NumIter = 100

recommender.Ratings = train_data
recommender.Train()
# num_factors=60 bias_reg=0.001 regularization=0.060 learn_rate=0.07 num_iter=100 bold_driver=true
# measure the accuracy on the test data set
evaluations1 = Eval.Ratings.Evaluate(recommender, test_data)
print('evaluation on test data:',evaluations1)

# measure the accuracy on the train data set
evaluations2 = Eval.Ratings.Evaluate(recommender, train_data)
print('evaluation on train data:',evaluations2)

# make a prediction for a certain user and item
print(recommender.Predict(2, 2))

evaluation on test data: RMSE 0.9997743 MAE 0.6346317 CBD 0.1202117
evaluation on train data: RMSE 0.1628978 MAE 0.1394366 CBD 0.07370228
1.673049


In [71]:
print('latent factors:',recommender.NumFactors,'learn rate:',recommender.LearnRate,
      'regi:',recommender.RegI,'regu:',recommender.RegU,'iterations:',recommender.NumIter)
#       'regularization',recommender.Regularization)

latent factors: 52 learn rate: 0.07 regi: 0.05 regu: 0.05 iterations: 100
