In [1]:
# preprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [46]:
# Raw VoD request log CSV (path is relative to the notebook's working directory).
address = '../CSV/VoD_ELK_Parallel.csv'
# Extra tokens to treat as missing values when reading the CSV.
missing_values = ['n/a','na','--','NaN','NA','-']
# Parse the timestamp by column NAME instead of by position (it was index 18):
# if the CSV column order ever changes, a positional index would try to parse
# the wrong column, whereas a missing name fails loudly.
df = pd.read_csv(address, na_values=missing_values, parse_dates=['@timestamp'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3984430 entries, 0 to 3984429
Data columns (total 30 columns):
 #   Column          Dtype              
---  ------          -----              
 0   devicemodel     float64            
 1   popname         int64              
 2   uid             float64            
 3   timetoserv      float64            
 4   maxage          float64            
 5   devicefamily    float64            
 6   fragment        bool               
 7   coordinates     float64            
 8   hit             object             
 9   uamajor         float64            
 10  cachecontrol    object             
 11  timefirstbyte   float64            
 12  livechannel     float64            
 13  contentpackage  float64            
 14  statuscode      int64              
 15  uafamily        float64            
 16  osfamily        float64            
 17  contentlength   float64            
 18  @timestamp      datetime64[ns, UTC]
 19  @version        int64

In [48]:
# selecting features pertaining to matrix factorization for rating prediction
df = df[['@timestamp','sid','contentpackage']]
# Drop rows with a null in EITHER grouping key. Previously only 'sid' was
# checked; rows with a null 'contentpackage' were then discarded implicitly by
# the later groupby (which skips null keys by default), so df_clean.info()
# over-reported the number of usable rows. Making the drop explicit keeps the
# aggregated result the same while showing the true row count here.
df_clean = df.dropna(subset=['sid', 'contentpackage'])
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3984430 entries, 0 to 3984429
Data columns (total 3 columns):
 #   Column          Dtype              
---  ------          -----              
 0   @timestamp      datetime64[ns, UTC]
 1   sid             int64              
 2   contentpackage  float64            
dtypes: datetime64[ns, UTC](1), float64(1), int64(1)
memory usage: 91.2 MB


In [49]:
# Count requests per (sid, contentpackage) pair: after grouping, the only
# remaining column is '@timestamp', whose non-null count is the number of
# requests. Everything is cast to int and the count column is renamed.
df_cdn = (
    df_clean
    .groupby(['sid', 'contentpackage'], as_index=False)
    .count()
    .astype(int)
    .rename(columns={'@timestamp': 'requests'})
)
df_cdn.head(2)

Unnamed: 0,sid,contentpackage,requests
0,37,3528,78
1,412,0,1


In [50]:
# Compress the request counts onto a small integer scale: natural log of the
# count, rounded to the nearest integer. A count of 1 maps to 0 here.
natural_log_of_requests = np.log(df_cdn['requests'])
df_cdn['log_requests'] = np.rint(natural_log_of_requests).astype(int)
df_cdn.head(3)

Unnamed: 0,sid,contentpackage,requests,log_requests
0,37,3528,78,4
1,412,0,1,0
2,414,0,13,3


In [51]:
# To be reviewed: a single request has log(1) == 0, so it is lifted to a
# rating of 1; all other rows keep their rounded log value.
single_request = df_cdn['requests'] == 1
df_cdn.loc[single_request, 'log_requests'] = 1

In [52]:
# Make sure the rating column is stored as integers, then show the frame.
rating_as_int = df_cdn['log_requests'].astype(int)
df_cdn['log_requests'] = rating_as_int
df_cdn

Unnamed: 0,sid,contentpackage,requests,log_requests
0,37,3528,78,4
1,412,0,1,1
2,414,0,13,3
3,415,0,13,3
4,464,1,1,1
...,...,...,...,...
206329,1281499,2753,1,1
206330,1281500,2753,1,1
206331,1281501,2753,1,1
206332,1281502,2753,1,1


In [53]:
# Keep only the three columns written to the rating files:
# user (sid), item (contentpackage) and rating (log_requests).
rating_columns = ['sid', 'contentpackage', 'log_requests']
df_matrix = df_cdn[rating_columns]

In [54]:
# Display the (sid, contentpackage, log_requests) frame selected from df_cdn.
df_matrix

Unnamed: 0,sid,contentpackage,log_requests
0,37,3528,4
1,412,0,1
2,414,0,3
3,415,0,3
4,464,1,1
...,...,...,...
206329,1281499,2753,1
206330,1281500,2753,1
206331,1281501,2753,1
206332,1281502,2753,1


In [56]:
# Report how many distinct users (sid) and distinct items (contentpackage)
# appear in the rating frame.
for label, column in (('unique uid: ', 'sid'), ('unique vod: ', 'contentpackage')):
    print(label, len(df_matrix[column].unique()))

unique uid:  187302
unique vod:  3940


In [57]:
# Hold out 30% of the rating rows for testing; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test = train_test_split(df_matrix, test_size=0.3, random_state=42)
for split_frame in (X_train, X_test):
    print(split_frame.shape)

(144433, 3)
(61901, 3)


In [12]:
# Sanity check that the split lost or duplicated no rows.
print('total records',len(X_train)+len(X_test))
print(len(df_matrix))
# Enforce the check instead of relying on someone comparing the two printed
# numbers: this fails loudly if X_train / X_test are left over from a
# different df_matrix (e.g. cells executed out of order).
assert len(X_train) + len(X_test) == len(df_matrix), (
    'train/test sizes do not add up to len(df_matrix); re-run the split cell'
)

total records 17030
17030


In [13]:
# Write both splits as bare CSV files (no index column, no header row) for the
# rating reader used in the training cell.
for split_frame, csv_path in (
    (X_train, '../CSV/VoD_train.csv'),
    (X_test, '../CSV/VoD_test.csv'),
):
    split_frame.to_csv(csv_path, index=False, header=False)

In [None]:
# Train a MyMediaLite BiasedMatrixFactorization rating predictor on the train
# CSV and evaluate it on both the test and the train CSV.

import clr

# The .NET assembly has to be referenced before the MyMediaLite namespaces can
# be imported from Python.
clr.AddReference('/home/jovyan/work/MyMediaLite-3.11/lib/mymedialite/MyMediaLite.dll')
from MyMediaLite import Eval, IO, RatingPrediction

# Read the two rating files (train first, then test).
train_data, test_data = (
    IO.RatingData.Read(csv_path)
    for csv_path in ('../CSV/VoD_train.csv', '../CSV/VoD_test.csv')
)

# Set up the recommender. RatingPrediction.UserItemBaseline() was tried earlier
# as an alternative model.
recommender = RatingPrediction.BiasedMatrixFactorization()

# Hyperparameters, applied in this order. The tuning note these values were
# taken from was:
#   num_factors=60 bias_reg=0.001 regularization=0.060 learn_rate=0.07
#   num_iter=100 bold_driver=true
# bias_reg and bold_driver from that note are NOT set here.
# Earlier experiments used Regularization = 0.015 and RegU = RegI = 0.05.
for option, value in (
    ('Regularization', 0.06),
    ('LearnRate', 0.07),
    ('NumFactors', 60),
    ('NumIter', 100),
):
    setattr(recommender, option, value)

recommender.Ratings = train_data
recommender.Train()

# Measure the accuracy on the test set first, then on the training set.
for label, ratings in (
    ('evaluation on test data:', test_data),
    ('evaluation on train data:', train_data),
):
    print(label, Eval.Ratings.Evaluate(recommender, ratings))

# Make a prediction for one (user, item) pair.
# NOTE(review): 2, 2 are arbitrary ids; confirm whether RatingData.Read remaps
# raw sid/contentpackage values to internal ids before relying on this output.
print(recommender.Predict(2, 2))

In [34]:
# Echo the hyperparameters currently set on the recommender, on a single line.
# (recommender.Regularization is deliberately left out of the report.)
hyperparameter_report = [
    'latent factors:', recommender.NumFactors,
    'learn rate:', recommender.LearnRate,
    'regi:', recommender.RegI,
    'regu:', recommender.RegU,
    'iterations:', recommender.NumIter,
]
print(*hyperparameter_report)

latent factors: 20 learn rate: 0.005 regi: 0.05 regu: 0.05 iterations: 30
