# IMPORTS

## Libraries

In [1]:
import warnings

import pandas as pd
import numpy  as np
import joblib as jb

import matplotlib.pyplot as plt

from skopt                           import forest_minimize
from lightgbm                        import LGBMClassifier
from scipy.sparse                    import hstack, csr_matrix
from sklearn.metrics                 import roc_auc_score, average_precision_score
from sklearn.ensemble                import RandomForestClassifier
from sklearn.linear_model            import LogisticRegression
from sklearn.preprocessing           import MaxAbsScaler
from sklearn.feature_extraction.text import TfidfVectorizer


# Seed NumPy's global random generator for this session.
np.random.seed(0)
%matplotlib inline
# Allow up to 200 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 200)
# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

# Load Data

In [2]:
# Video metadata with the WatchList label (file name suggests it is the labeled set).
dfGS = pd.read_feather('../Data/FeatherData/dfAllDataLabeled.feather')

# Tag statistics table. Every column gets a "Stack" suffix so its names cannot
# clash with the video columns when the two frames are merged. add_suffix
# renames all columns in one pass; the previous per-column rename loop copied
# the frame once per column and could suffix a column twice if a renamed
# column collided with an existing name.
dfStack = pd.read_csv('../Data/StackTags/dfTags.csv').add_suffix('Stack')

In [3]:
# Left-join the tag statistics onto each video through its search query
# (Query on the video side, TagStack on the tag side), then order the rows
# by upload date.
dfRaw = (
    dfGS
    .merge(dfStack, how='left', left_on='Query', right_on='TagStack')
    .sort_values('UploadDate')
)

In [4]:
# Preview the first rows of the merged, date-sorted frame.
dfRaw.head()

Unnamed: 0,WatchList,Id,Uploader,UploadDate,Title,Description,Categories,Tags,Duration,ViewCount,LikeCount,DislikeCount,AverageRating,Query,DaysSincePublication,ThumbnailUrl,ThumbnailWidth,ThumbnailHeight,Tags.1,New,TagStack,IncidenceStack,VotesStack,AnswerStack,ViewsStack,ViewsPerIncidenceStack
0,0,fFPsjpP5Shs,PyData,2014-04-03,Linda Uruchurtu: A Beginner's Guide to Random ...,Linda Uruchurtu: A Beginner's Guide to,People & Blogs,"Python (Software), Random Forest, R (Programmi...",2277,38,266,54,4.325,random-forest,2465,https://i.ytimg.com/vi/fFPsjpP5Shs/hqdefault.j...,168,94,,0.0,random-forest,99.0,281.0,100.0,282108.0,2849.575758
1473,0,Y7avOZl97ls,edureka!,2014-09-05,Random Forest Classifier | Data Science | Edureka,Data Science Training - https://www.edureka.co...,Education,,711,8,8,10,2.777778,random-forest,2310,https://i.ytimg.com/vi/Y7avOZl97ls/hqdefault.j...,168,94,"Data Science, R Programming, random forest in ...",1.0,random-forest,99.0,281.0,100.0,282108.0,2849.575758
1,1,luecsqQ636c,Montreal-Python,2014-11-20,Automating microscopy data analysis using Djan...,"Montreal, Oct. 20, 2014 - Eleyine Zarour demon...",Science & Technology,"montreal, python, technology, dev, code, confe...",1047,3,52,3,4.781818,scipy,2234,https://i.ytimg.com/vi/luecsqQ636c/hqdefault.j...,168,94,,0.0,scipy,96.0,201.0,99.0,235177.0,2449.760417
2,0,3XCatMUuGDo,ESAC Data Analysis and Statistics,2014-12-05,Jake VanderPlas: Bayesian model validation and...,Tutorial by Jake VanderPlas at the ESAC Data A...,People & Blogs,"ESAC, Data Analysis (Media Genre), Statistics ...",1254,1,6,2,4.428571,cross-validation,2219,https://i.ytimg.com/vi/3XCatMUuGDo/hqdefault.j...,168,94,,0.0,cross-validation,59.0,235.0,65.0,241737.0,4097.237288
3,1,KTeVOb8gaD4,sentdex,2014-12-22,Scikit Learn Machine Learning SVM Tutorial wit...,"In this machine learning tutorial, we cover a ...",Education,"Scikit-learn, Python (Programming Language), M...",1427,303,2303,72,4.878737,svm,2202,https://i.ytimg.com/vi/KTeVOb8gaD4/hqdefault.j...,168,94,,0.0,svm,71.0,234.0,86.0,372999.0,5253.507042


# Feature Engineering

In [5]:
# (rows, columns) of the merged frame before any feature is derived.
dfRaw.shape

(1773, 26)

In [6]:
# Feature table aligned on dfRaw's index. WatchList is the target and is kept
# here only so the split cell can separate it from the predictors.
dfFeatures = pd.DataFrame(index=dfRaw.index)
dfFeatures['WatchList'] = dfRaw['WatchList']
dfFeatures['ViewsPerDay'] = dfRaw['ViewCount'] / dfRaw['DaysSincePublication']
dfFeatures['Duration'] = dfRaw['Duration']

## Stack
# Interaction features between video metrics and the merged tag statistics.
# Missing values are replaced with the column mean, and the filled Series is
# assigned back explicitly: calling fillna(..., inplace=True) on a column
# selection is a chained in-place update, which does not modify the parent
# frame under pandas Copy-on-Write.
# NOTE(review): the mean is taken over all rows, i.e. before the
# train/validation split is made — confirm this is acceptable.
viewsPerIncidence = dfRaw['ViewCount'] / dfRaw['IncidenceStack']
dfFeatures['ViewCountIncidenceStack'] = viewsPerIncidence.fillna(viewsPerIncidence.mean())

votesTimesLikes = dfRaw['VotesStack'] * dfRaw['LikeCount']
dfFeatures['VotesStackLikeCount'] = votesTimesLikes.fillna(votesTimesLikes.mean())

# Split DataFrame into Training and Validation Dataset

In [7]:
# Upload date found at the middle row position of the date-sorted frame; the
# displayed value is what the split cell uses as its cut-off date.
dfRaw['UploadDate'].iloc[int(round(dfRaw.shape[0]/2,0))]

Timestamp('2019-11-27 00:00:00')

In [11]:
# Time-based split: rows uploaded before dateSplit form the training set,
# rows uploaded on or after it form the validation set.
dateSplit = '2019-11-27'

maskTrain = dfRaw['UploadDate'] < dateSplit
maskVal = dfRaw['UploadDate'] >= dateSplit

# Predictors: every feature column except the target.
Xtrain = dfFeatures[maskTrain].drop(columns='WatchList')
Xval = dfFeatures[maskVal].drop(columns='WatchList')

# Target.
ytrain = dfFeatures.loc[maskTrain, 'WatchList']
yval = dfFeatures.loc[maskVal, 'WatchList']

# Raw titles, vectorized separately by the model cells.
titleTrain = dfRaw.loc[maskTrain, 'Title']
titleVal = dfRaw.loc[maskVal, 'Title']

In [12]:
# Sanity check: shapes of the predictor frames and target vectors per split.
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((885, 4), (888, 4), (885,), (888,))

# Model

## RandomForestClassifier

### Bayesian Optimization

In [13]:
def tuneRf(params):
    """Objective function for the random-forest hyperparameter search.

    Fits a TF-IDF vectorizer on the training titles, stacks the resulting
    columns next to the tabular features, trains a RandomForestClassifier
    and scores it on the validation split.

    Parameters
    ----------
    params : sequence
        Read by position:
        [max_depth, min_samples_leaf, n_estimators, min_df, upper n-gram size].

    Returns
    -------
    float
        Average precision on the validation split, negated (this function
        is handed to forest_minimize, which minimizes its return value).

    Notes
    -----
    Reads the notebook-level names Xtrain, Xval, ytrain, yval, titleTrain
    and titleVal. Prints ``params`` and the validation ROC AUC on each call.
    """
    print(params)

    vecKwargs = {'min_df': params[3], 'ngram_range': (1, params[4])}
    forestKwargs = {
        'n_estimators': params[2],
        'max_depth': params[0],
        'min_samples_leaf': params[1],
        'random_state': 0,
        'class_weight': 'balanced',
        'n_jobs': -1,
    }

    # The vectorizer learns its vocabulary from the training titles only.
    vectorizer = TfidfVectorizer(**vecKwargs)
    trainTitles = vectorizer.fit_transform(titleTrain)
    valTitles = vectorizer.transform(titleVal)

    trainMatrix = hstack([Xtrain, trainTitles])
    valMatrix = hstack([Xval, valTitles])

    forest = RandomForestClassifier(**forestKwargs)
    forest.fit(trainMatrix, ytrain)

    # Second column of predict_proba output is used as the score.
    valScores = forest.predict_proba(valMatrix)[:, 1]
    print(roc_auc_score(yval, valScores))

    return -average_precision_score(yval, valScores)



# Search space: one (low, high) range per hyperparameter, listed in the
# positional order in which tuneRf reads them.
space = [
    (1, 10),      # max_depth
    (1, 20),      # min_samples_leaf
    (100, 1000),  # n_estimators
    (1, 5),       # min_df
    (1, 5),       # upper n-gram size, used as ngram_range=(1, value)
]

# 50 evaluations of tuneRf, the first 20 at random points.
result = forest_minimize(
    func=tuneRf,
    dimensions=space,
    n_calls=50,
    n_random_starts=20,
    random_state=160745,
    verbose=1,
)

Iteration No: 1 started. Evaluating function at random point.
[5, 16, 771, 1, 3]
0.8350704380365397
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 1.1595
Function value obtained: -0.4903
Current minimum: -0.4903
Iteration No: 2 started. Evaluating function at random point.
[8, 9, 272, 3, 1]
0.8699537750385209
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.4060
Function value obtained: -0.5348
Current minimum: -0.5348
Iteration No: 3 started. Evaluating function at random point.
[2, 10, 804, 3, 4]
0.847672243011226
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 1.1952
Function value obtained: -0.5014
Current minimum: -0.5348
Iteration No: 4 started. Evaluating function at random point.
[6, 2, 110, 4, 5]
0.8639500330178297
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.2321
Function value obtained: -0.5256
Current minimum: -0.5348
Iteration No: 5 started. Evaluating function at random point.
[6, 20, 7

In [14]:
# Parameter list at which the random-forest search found its minimum.
result.x

[5, 18, 488, 4, 2]

##### RandomForestClassifier

- [5, 18, 488, 4, 2]
- ap: 0.5685, auc: 0.875544794188862

### Model With Hyperparameter Tuning

In [23]:
# Refit the random forest with a fixed hyperparameter set and score it on the
# validation split.
# NOTE(review): n_estimators is 489 here while the search result recorded
# above lists 488 — confirm which value is intended.
params = [
    5,    # max_depth
    18,   # min_samples_leaf
    489,  # n_estimators
    4,    # min_df
    2,    # upper n-gram size
]

max_depth, min_samples_leaf, n_estimators, min_df = params[:4]
ngram_range = (1, params[4])

# TF-IDF on titles: vocabulary learned from the training titles only.
titleVec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
titleBowTrain = titleVec.fit_transform(titleTrain)
titleBowVal = titleVec.transform(titleVal)

# Tabular features first, title columns after.
XtrainWTitle = hstack([Xtrain, titleBowTrain])
XvalWTitle = hstack([Xval, titleBowVal])

modelRf = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    random_state=0,
    class_weight='balanced',
    n_jobs=-1,
)
modelRf.fit(XtrainWTitle, ytrain)

pRf = modelRf.predict_proba(XvalWTitle)[:, 1]
# Recorded run: ap: 0.5686275377746386, auc: 0.8754897644728153
print(f'ap: {average_precision_score(yval,pRf)}, auc: {roc_auc_score(yval, pRf)}')

ap: 0.5686275377746386, auc: 0.8754897644728153


## LGBMClassifier

### Bayesian Optimization

In [21]:
def tuneLGBM(params):
    """Objective function for the LightGBM hyperparameter search.

    Fits a TF-IDF vectorizer on the training titles, stacks the resulting
    columns next to the tabular features, trains an LGBMClassifier and
    scores it on the validation split.

    Parameters
    ----------
    params : sequence
        Read by position:
        [learning_rate, max_depth, min_child_samples, subsample,
        colsample_bytree, n_estimators, min_df, upper n-gram size].

    Returns
    -------
    float
        Average precision on the validation split, negated (this function
        is handed to forest_minimize, which minimizes its return value).

    Notes
    -----
    Reads the notebook-level names Xtrain, Xval, ytrain, yval, titleTrain
    and titleVal. Prints ``params`` and the validation ROC AUC on each call.
    """
    print(params)

    treeDepth = params[1]
    vecKwargs = {'min_df': params[6], 'ngram_range': (1, params[7])}
    boosterKwargs = {
        'learning_rate': params[0],
        # num_leaves is tied to the depth: 2 ** max_depth.
        'num_leaves': 2 ** treeDepth,
        'max_depth': treeDepth,
        'min_child_samples': params[2],
        'subsample': params[3],
        'colsample_bytree': params[4],
        'bagging_freq': 1,
        'n_estimators': params[5],
        'random_state': 0,
        'class_weight': 'balanced',
        'n_jobs': -1,
    }

    # The vectorizer learns its vocabulary from the training titles only.
    vectorizer = TfidfVectorizer(**vecKwargs)
    trainTitles = vectorizer.fit_transform(titleTrain)
    valTitles = vectorizer.transform(titleVal)

    trainMatrix = hstack([Xtrain, trainTitles])
    valMatrix = hstack([Xval, valTitles])

    booster = LGBMClassifier(**boosterKwargs)
    booster.fit(trainMatrix, ytrain)

    # Second column of predict_proba output is used as the score.
    valScores = booster.predict_proba(valMatrix)[:, 1]
    print(roc_auc_score(yval, valScores))

    return -average_precision_score(yval, valScores)



# Search space: one entry per hyperparameter, listed in the positional order
# in which tuneLGBM reads them.
space = [
    (1e-3, 1e-1, 'log-uniform'),  # learning rate
    (1, 10),                      # max_depth
    (1, 20),                      # min_child_samples
    (0.05, 1.0),                  # subsample
    (0.05, 1.0),                  # colsample_bytree
    (100, 1000),                  # n_estimators
    (1, 5),                       # min_df
    (1, 5),                       # upper n-gram size, used as ngram_range=(1, value)
]

# 50 evaluations of tuneLGBM, the first 20 at random points.
# Note: this rebinds `result`, replacing the random-forest search result.
result = forest_minimize(
    func=tuneLGBM,
    dimensions=space,
    n_calls=50,
    n_random_starts=20,
    random_state=160745,
    verbose=1,
)

Iteration No: 1 started. Evaluating function at random point.
[0.009944912110647982, 5, 1, 0.4677107511929402, 0.49263223036174764, 272, 3, 1]
0.8823464670922297
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.6432
Function value obtained: -0.5671
Current minimum: -0.5671
Iteration No: 2 started. Evaluating function at random point.
[0.053887464791860025, 1, 15, 0.7437489153990157, 0.8675167974293533, 549, 3, 4]
0.8611160026414264
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.2490
Function value obtained: -0.4711
Current minimum: -0.5671
Iteration No: 3 started. Evaluating function at random point.
[0.004151454520895999, 6, 20, 0.8682075103820793, 0.9491436163200662, 411, 4, 3]
0.8647589698437156
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.4877
Function value obtained: -0.5239
Current minimum: -0.5671
Iteration No: 4 started. Evaluating function at random point.
[0.0014099928811969545, 9, 9, 0.6502182010234373, 0.6

In [22]:
# Parameter list at which the LightGBM search found its minimum.
result.x

[0.0015212976972079912,
 3,
 12,
 0.44234694306528044,
 0.399351303640462,
 272,
 3,
 5]

##### LGBMClassifier

- [0.0015212976972079912, 3, 12, 0.44234694306528044, 0.399351303640462, 272, 3, 5]
- ap: 0.5731, auc: 0.87557781201849

### Model With Hyperparameter Tuning

In [24]:
# Refit LightGBM with a fixed hyperparameter set and score it on the
# validation split.
# NOTE(review): min_child_samples, n_estimators, min_df and the n-gram size
# below differ from the `result.x` output recorded above — confirm which run
# these values come from.
params = [
    0.0015212976972079912,  # learning rate
    3,                      # max_depth
    17,                     # min_child_samples
    0.44234694306528044,    # subsample
    0.399351303640462,      # colsample_bytree
    881,                    # n_estimators
    4,                      # min_df
    2,                      # upper n-gram size
]

(lr, max_depth, min_child_samples, subsample,
 colsample_bytree, n_estimators, min_df) = params[:7]
ngram_range = (1, params[7])

# This rebinds titleVec, so this is the vectorizer the save cell persists.
# modelRf was trained with a vectorizer using the same min_df / ngram_range,
# so both models share a feature layout only while those settings stay equal.
titleVec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
titleBowTrain = titleVec.fit_transform(titleTrain)
titleBowVal = titleVec.transform(titleVal)

# Tabular features first, title columns after.
XtrainWTitle = hstack([Xtrain, titleBowTrain])
XvalWTitle = hstack([Xval, titleBowVal])

modelLgbm = LGBMClassifier(
    learning_rate=lr,
    num_leaves=2 ** max_depth,
    max_depth=max_depth,
    min_child_samples=min_child_samples,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    bagging_freq=1,
    n_estimators=n_estimators,
    random_state=0,
    class_weight='balanced',
    n_jobs=-1,
)
modelLgbm.fit(XtrainWTitle, ytrain)

pLgbm = modelLgbm.predict_proba(XvalWTitle)[:, 1]

print(f'ap: {average_precision_score(yval,pLgbm)}, auc: {roc_auc_score(yval, pLgbm)}')

ap: 0.5752235771400988, auc: 0.8819282412502751


# Logistic Regression

# Ensemble

- LGBM | ap: 0.5752235771400988, auc: 0.8819282412502751
- RF   | ap: 0.5686275377746386, auc: 0.8754897644728153

In [25]:
# Correlation between the two models' validation scores.
pd.DataFrame({'RF': pRf, 'LGBM': pLgbm}).corr()

Unnamed: 0,RF,LGBM
RF,1.0,0.923365
LGBM,0.923365,1.0


In [26]:
# Equal-weight blend of the random-forest and LightGBM validation scores,
# evaluated with the same two metrics as the individual models.
p = (0.5*pRf + 0.5*pLgbm)
print(f'ap: {average_precision_score(yval,p)}, auc: {roc_auc_score(yval, p)}')


ap: 0.5884758482319418, auc: 0.8839423288575831


# Save Model

In [29]:
# Persist both fitted models and the title vectorizer with joblib.
# Paths are relative to the notebook's working directory, like the '../Data/...'
# loads at the top, instead of an absolute path on one machine's D: drive.
# NOTE(review): assumes Model/ and Parameter/ are siblings of Data/ under the
# project root and already exist — confirm.
jb.dump(modelRf, '../Model/modelRf.pkl.z')
jb.dump(modelLgbm, '../Model/modelLgbm.pkl.z')
jb.dump(titleVec, '../Parameter/titleVec.pkl.z')

['D:/01-DataScience/04-Projetos/00-Git/Youtube-Video-Recommendations/Parameter/titleVec.pkl.z']