# IMPORTS

## Libraries

In [1]:
import warnings

import pandas as pd
import numpy  as np

import matplotlib.pyplot as plt

from skopt                           import forest_minimize
from lightgbm                        import LGBMClassifier
from scipy.sparse                    import hstack, csr_matrix
from sklearn.metrics                 import roc_auc_score, average_precision_score
from sklearn.ensemble                import RandomForestClassifier
from sklearn.linear_model            import LogisticRegression
from sklearn.preprocessing           import MaxAbsScaler
from sklearn.feature_extraction.text import TfidfVectorizer


# Seed NumPy's global RNG so NumPy-driven randomness is repeatable across runs.
np.random.seed(0)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Show up to 200 columns when displaying wide DataFrames.
pd.set_option('display.max_columns', 200)
# NOTE(review): this silences every warning for the whole session, including
# deprecation/convergence warnings from the modelling libraries — consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Load Data

In [2]:
# Load the labeled dataset from Feather; the path is relative to the notebook's
# working directory.
dfRaw = pd.read_feather('../Data/FeatherData/dfAllDataLabeled.feather')

In [3]:
# Order rows chronologically by upload date; the positional midpoint lookup used
# further down to choose the train/validation split date depends on this order.
dfRaw = dfRaw.sort_values('UploadDate')

# Feature Engineering

In [4]:
# (rows, columns) of the raw frame.
dfRaw.shape

(1773, 21)

In [5]:
# Modelling frame: the raw view count, the target column 'WatchList', and a
# derived views-per-day rate. 'DaysSincePublication' is only used to compute the
# rate and is not kept as a feature.
# NOTE(review): a DaysSincePublication of 0 would yield inf/NaN in ViewsPerDay —
# confirm the data never contains it.
dfFeatures = dfRaw[['ViewCount', 'WatchList']].assign(
    ViewsPerDay=dfRaw['ViewCount'] / dfRaw['DaysSincePublication']
)

# Split DataFrame into Training and Validation Dataset

In [6]:
# Upload date of the row at the positional midpoint of the date-sorted frame.
# The displayed value is the date hard-coded as `dateSplit` in the next cell.
dfRaw['UploadDate'].iloc[int(round(dfRaw.shape[0]/2,0))]

Timestamp('2019-11-27 00:00:00')

In [7]:
# Time-based split: rows uploaded before `dateSplit` train the models, rows on or
# after it are held out for validation. The date is the midpoint upload date
# displayed by the previous cell.
dateSplit = '2019-11-27'

maskTrain = dfRaw['UploadDate'] < dateSplit
maskVal = dfRaw['UploadDate'] >= dateSplit

# Features are every dfFeatures column except the target 'WatchList'.
Xtrain = dfFeatures.loc[maskTrain].drop(columns='WatchList')
Xval = dfFeatures.loc[maskVal].drop(columns='WatchList')

# Targets for the same two row sets.
ytrain = dfFeatures.loc[maskTrain, 'WatchList']
yval = dfFeatures.loc[maskVal, 'WatchList']

In [8]:
# Sanity check: feature and target row counts must match within each split.
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((885, 2), (888, 2), (885,), (888,))

# Text Features

In [9]:
# Video titles for each split, selected with the same date masks as the
# numeric features so rows stay aligned.
titleTrain = dfRaw.loc[maskTrain, 'Title']
titleVal = dfRaw.loc[maskVal, 'Title']

# TF-IDF over unigrams and bigrams, with min_df=4 as the document-frequency
# cut-off. The vocabulary is learned from the training titles only and then
# reused to encode the validation titles.
titleVec = TfidfVectorizer(ngram_range=(1, 2), min_df=4)
titleBowTrain = titleVec.fit_transform(titleTrain)
titleBowVal = titleVec.transform(titleVal)

In [10]:
# (training rows, number of TF-IDF terms kept).
titleBowTrain.shape

(885, 495)

In [11]:
# Display the sparse-matrix summary (dtype, stored elements, storage format).
titleBowTrain

<885x495 sparse matrix of type '<class 'numpy.float64'>'
	with 8627 stored elements in Compressed Sparse Row format>

In [12]:
# Column-stack the numeric features with the title TF-IDF matrix so each split
# becomes a single sparse design matrix (numeric columns first).
XtrainWTitle = hstack([Xtrain, titleBowTrain])
XvalWTitle = hstack([Xval, titleBowVal])

In [13]:
# Both splits must end up with the same number of columns.
XtrainWTitle.shape, XvalWTitle.shape

((885, 497), (888, 497))

# Model

## RandomForestClassifier

In [87]:
# Baseline random forest on numeric + title features. class_weight='balanced'
# reweights the classes by their frequency in ytrain.
rf = RandomForestClassifier(
    n_estimators=1500,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=0,
    n_jobs=-1,
)
rf.fit(XtrainWTitle, ytrain)

RandomForestClassifier(class_weight='balanced', min_samples_leaf=5,
                       n_estimators=1500, n_jobs=-1, random_state=0)

In [88]:
# Validation scores: column 1 of predict_proba (the positive class, assuming
# WatchList is a 0/1 label — TODO confirm).
pRf = rf.predict_proba(XvalWTitle)[:,1]

### Model Evaluation

In [89]:
# Report average precision and ROC AUC of the baseline forest on the validation split.
print(f'ap: {average_precision_score(yval, pRf)}, auc: {roc_auc_score(yval, pRf)}')

ap: 0.3844695160599161, auc: 0.7792372881355933


In [19]:
# ap: 0.3521564476834772, auc: 0.7758089368258859 | min_samples_leaf=1
# ap: 0.3843011851670597, auc: 0.7815540391811577 | min_samples_leaf=2
# ap: 0.38551950447476385, auc: 0.7831168831168831 | n_estimators=1500
# ap: 0.40864867634952645, auc: 0.7861765353290777 | min_df=3
# ap: 0.4138684826920506, auc: 0.784668721109399 | ngram_range = (1,2)
# ap: 0.425697264360267, auc: 0.7845256438476778 | min_df=4 & ngram_range = (1,2)

### Bayesian Optimization

In [92]:
def tuneRf(params):
    """Objective for the random-forest hyperparameter search.

    Rebuilds the title TF-IDF features and fits a RandomForestClassifier for one
    candidate configuration, then scores it on the validation split.

    Parameters
    ----------
    params : sequence
        Candidate point in the order
        ``[max_depth, min_samples_leaf, n_estimators, min_df, max_ngram]``,
        where ``max_ngram`` is the upper bound of the vectorizer's
        ``ngram_range`` (the lower bound is always 1).

    Returns
    -------
    float
        Negative average precision on ``(Xval, yval)``, so that a minimizer
        maximizes average precision.

    Notes
    -----
    Reads the notebook-level ``titleTrain``, ``titleVal``, ``Xtrain``, ``Xval``,
    ``ytrain`` and ``yval``. Prints the candidate and its validation ROC AUC.
    """
    print(params)
    max_depth, min_samples_leaf, n_estimators, min_df, max_ngram = params

    # The vocabulary is fitted on the training titles only.
    vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=(1, max_ngram))
    bowTrain = vectorizer.fit_transform(titleTrain)
    bowVal = vectorizer.transform(titleVal)

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
        class_weight='balanced',
        n_jobs=-1,
    )
    forest.fit(hstack([Xtrain, bowTrain]), ytrain)

    scores = forest.predict_proba(hstack([Xval, bowVal]))[:, 1]

    print(roc_auc_score(yval, scores))

    return -average_precision_score(yval, scores)



# Search space for tuneRf: one (low, high) integer range per entry, in the same
# order tuneRf reads them from `params`.
space = [(1, 10), # max_depth
        (1, 20), # min_samples_leaf
        (100, 1000), # n_estimators
        (1, 5), # min_df
        (1, 5)] # upper bound of ngram_range (lower bound is fixed at 1)



# 50 evaluations in total, the first 20 at random points; the objective returns
# negative average precision, so the minimum is the best configuration.
# NOTE(review): `n_random_starts` is deprecated in newer scikit-optimize releases
# in favour of `n_initial_points` — confirm against the installed version.
result = forest_minimize(tuneRf, space, random_state=160745, n_random_starts=20, n_calls=50, verbose=1)

Iteration No: 1 started. Evaluating function at random point.
[5, 16, 771, 1, 3]
0.7517499449702839
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 3.7798
Function value obtained: -0.3635
Current minimum: -0.3635
Iteration No: 2 started. Evaluating function at random point.
[8, 9, 272, 3, 1]
0.7781642086726832
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 1.3968
Function value obtained: -0.3853
Current minimum: -0.3853
Iteration No: 3 started. Evaluating function at random point.
[2, 10, 804, 3, 4]
0.7796830288355712
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 3.8756
Function value obtained: -0.3817
Current minimum: -0.3853
Iteration No: 4 started. Evaluating function at random point.
[6, 2, 110, 4, 5]
0.7944034778780541
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.6096
Function value obtained: -0.4139
Current minimum: -0.4139
Iteration No: 5 started. Evaluating function at random point.
[6, 20, 

In [93]:
# Best point found by the search, in the same order as `space`.
result.x

[8, 19, 189, 5, 3]

### Model With Hyperparameter Tuning

In [310]:
# Refit the random forest with a fixed configuration and evaluate it on the
# validation split.
# NOTE(review): these values differ from the `result.x` output displayed above
# ([8, 19, 189, 5, 3]); presumably they come from a different search run — confirm.
params = [
    8,    # max_depth
    19,   # min_samples_leaf
    840,  # n_estimators
    3,    # min_df
    5,    # upper bound of ngram_range
]

max_depth, min_samples_leaf, n_estimators, min_df = params[:4]
ngram_range = (1, params[4])

# Rebuild the title features for this min_df / ngram_range. This overwrites the
# notebook-level titleVec, titleBow* and X*WTitle variables used by later cells.
titleVec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
titleBowTrain = titleVec.fit_transform(titleTrain)
titleBowVal = titleVec.transform(titleVal)

XtrainWTitle = hstack([Xtrain, titleBowTrain])
XvalWTitle = hstack([Xval, titleBowVal])

modelRf = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    random_state=0,
    class_weight='balanced',
    n_jobs=-1,
)
modelRf.fit(XtrainWTitle, ytrain)

# Column 1 of predict_proba is used as the validation score.
pRf = modelRf.predict_proba(XvalWTitle)[:, 1]

print(f'ap: {average_precision_score(yval,pRf)}, auc: {roc_auc_score(yval, pRf)}')

ap: 0.43734417259145686, auc: 0.788895003301783


## LGBMClassifier

In [20]:
# Baseline LightGBM with default hyperparameters and balanced class weights.
# NOTE(review): this trains on whatever `XtrainWTitle` is currently in the kernel.
# The cell was executed as In [20], before the tuned-RF cell (In [310]) rebuilt
# that matrix; a top-to-bottom run would use the tuned-RF cell's features instead.
modelLGBM = LGBMClassifier(random_state=0, class_weight='balanced', n_jobs=-1)
modelLGBM.fit(XtrainWTitle, ytrain)

LGBMClassifier(class_weight='balanced', random_state=0)

In [21]:
# Validation scores from the baseline LightGBM (column 1 of predict_proba).
pLGBM = modelLGBM.predict_proba(XvalWTitle)[:,1]

### Model Evaluation

In [22]:
# Report average precision and ROC AUC of the baseline LightGBM on the validation split.
print(f'ap: {average_precision_score(yval,pLGBM)}, auc: {roc_auc_score(yval, pLGBM)}')

ap: 0.3613860826479734, auc: 0.7497248514197665


### Bayesian Optimization

In [240]:
def tuneLGBM(params):
    """Objective for the LightGBM hyperparameter search.

    Rebuilds the title TF-IDF features and fits an LGBMClassifier for one
    candidate configuration, then scores it on the validation split.

    Parameters
    ----------
    params : sequence
        Candidate point in the order
        ``[lr, max_depth, min_child_samples, subsample, colsample_bytree,
        n_estimators, min_df, max_ngram]``, where ``max_ngram`` is the upper
        bound of the vectorizer's ``ngram_range`` (the lower bound is always 1).

    Returns
    -------
    float
        Negative average precision on ``(Xval, yval)``, so that a minimizer
        maximizes average precision.

    Notes
    -----
    Reads the notebook-level ``titleTrain``, ``titleVal``, ``Xtrain``, ``Xval``,
    ``ytrain`` and ``yval``. Prints the candidate and its validation ROC AUC.
    """
    print(params)
    (lr, max_depth, min_child_samples, subsample,
     colsample_bytree, n_estimators, min_df, max_ngram) = params

    # The vocabulary is fitted on the training titles only.
    vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=(1, max_ngram))
    bowTrain = vectorizer.fit_transform(titleTrain)
    bowVal = vectorizer.transform(titleVal)

    booster = LGBMClassifier(
        learning_rate=lr,
        # num_leaves is tied to max_depth: 2 ** max_depth leaves per tree.
        num_leaves=2 ** max_depth,
        max_depth=max_depth,
        min_child_samples=min_child_samples,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        bagging_freq=1,
        n_estimators=n_estimators,
        random_state=0,
        class_weight='balanced',
        n_jobs=-1,
    )
    booster.fit(hstack([Xtrain, bowTrain]), ytrain)

    scores = booster.predict_proba(hstack([Xval, bowVal]))[:, 1]

    print(roc_auc_score(yval, scores))

    return -average_precision_score(yval, scores)



# Search space for tuneLGBM, in the same order tuneLGBM reads them from `params`.
space = [(1e-3, 1e-1, 'log-uniform'), # lr (sampled log-uniformly)
        (1, 10), # max_depth
        (1, 20), # min_child_samples
        (0.05, 1.0), # subsample
        (0.05, 1.0), # colsample_bytree
        (100, 1000), # n_estimators
        (1, 5), # min_df
        (1, 5)] # upper bound of ngram_range (lower bound is fixed at 1)



# 50 evaluations in total, the first 20 at random points. This rebinds `result`,
# replacing the random-forest search result.
# NOTE(review): `n_random_starts` is deprecated in newer scikit-optimize releases
# in favour of `n_initial_points` — confirm against the installed version.
result = forest_minimize(tuneLGBM, space, random_state=160745, n_random_starts=20, n_calls=50, verbose=1)

Iteration No: 1 started. Evaluating function at random point.
[0.009944912110647982, 5, 1, 0.4677107511929402, 0.49263223036174764, 272, 3, 1]
0.7713295179396874
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.6183
Function value obtained: -0.3746
Current minimum: -0.3746
Iteration No: 2 started. Evaluating function at random point.
[0.053887464791860025, 1, 15, 0.7437489153990157, 0.8675167974293533, 549, 3, 4]
0.7705040721989873
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.2485
Function value obtained: -0.3512
Current minimum: -0.3746
Iteration No: 3 started. Evaluating function at random point.
[0.004151454520895999, 6, 20, 0.8682075103820793, 0.9491436163200662, 411, 4, 3]
0.7800187101034559
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.3840
Function value obtained: -0.3856
Current minimum: -0.3856
Iteration No: 4 started. Evaluating function at random point.
[0.0014099928811969545, 9, 9, 0.6502182010234373, 0.6

In [25]:
# Best point found by the LightGBM search, in the same order as `space`.
result.x

[0.001477438108754858,
 3,
 10,
 0.16137221087324535,
 0.19593296243176872,
 966,
 3,
 5]

##### LGBMClassifier

- [0.001477438108754858, 3, 10, 0.16137221087324535, 0.19593296243176872, 966, 3, 5]
- ap: 0.4227, auc: 0.7806845696676206

### Model With Hyperparameter Tuning

In [309]:
# Refit LightGBM with the best configuration reported by the search (the
# `result.x` values shown above) and evaluate it on the validation split.
params = [
    0.001477438108754858,  # lr
    3,                     # max_depth
    10,                    # min_child_samples
    0.16137221087324535,   # subsample
    0.19593296243176872,   # colsample_bytree
    966,                   # n_estimators
    3,                     # min_df
    5,                     # upper bound of ngram_range
]

(lr, max_depth, min_child_samples, subsample,
 colsample_bytree, n_estimators, min_df) = params[:7]
ngram_range = (1, params[7])

# Rebuild the title features for this min_df / ngram_range. This overwrites the
# notebook-level titleVec, titleBow* and X*WTitle variables used by later cells.
titleVec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
titleBowTrain = titleVec.fit_transform(titleTrain)
titleBowVal = titleVec.transform(titleVal)

XtrainWTitle = hstack([Xtrain, titleBowTrain])
XvalWTitle = hstack([Xval, titleBowVal])

modelLgbm = LGBMClassifier(
    learning_rate=lr,
    # num_leaves is tied to max_depth: 2 ** max_depth leaves per tree.
    num_leaves=2 ** max_depth,
    max_depth=max_depth,
    min_child_samples=min_child_samples,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    bagging_freq=1,
    n_estimators=n_estimators,
    random_state=0,
    class_weight='balanced',
    n_jobs=-1,
)
modelLgbm.fit(XtrainWTitle, ytrain)

# Column 1 of predict_proba is used as the validation score.
pLgbm = modelLgbm.predict_proba(XvalWTitle)[:, 1]

print(f'ap: {average_precision_score(yval,pLgbm)}, auc: {roc_auc_score(yval, pLgbm)}')

# Result recorded from an earlier run:
# ap: 0.4211858121788448, auc: 0.7803543913713407

ap: 0.42266615524449913, auc: 0.7806845696676206


# Logistic Regression

### Data Preparation

In [247]:
# Logistic regression is sensitive to feature scale, so each column is scaled by
# its maximum absolute value (fitted on the training split only). MaxAbsScaler
# keeps the matrices sparse.
# NOTE(review): this reads whichever XtrainWTitle / XvalWTitle are currently in
# the kernel; the recorded output shape below reflects the state at run time.
scaler = MaxAbsScaler()

XtrainWTitle2 = scaler.fit_transform(csr_matrix(XtrainWTitle.copy()))
XvalWTitle2 = scaler.transform(csr_matrix(XvalWTitle.copy()))

In [248]:
# (validation rows, feature columns) after scaling.
XvalWTitle2.shape

(888, 1139)

### Model

In [249]:
# Logistic regression on the scaled features; C=0.01 is a small inverse
# regularization strength, i.e. strong regularization.
# NOTE(review): unlike the tree models in this notebook, no
# class_weight='balanced' is set here — confirm that is intentional.
modelLr = LogisticRegression(C=0.01, n_jobs=-1, random_state=0)
modelLr.fit(XtrainWTitle2, ytrain)

# Validation scores (column 1 of predict_proba).
pLr = modelLr.predict_proba(XvalWTitle2)[:,1]

In [250]:
# Report average precision and ROC AUC of the logistic regression on the validation split.
print(f'ap: {average_precision_score(yval, pLr)}, auc: {roc_auc_score(yval, pLr)}')

ap: 0.3986324868099709, auc: 0.7804974686330618


# Ensemble

#### min_df > 3
- LGBM | ap: 0.42266615524449913, auc: 0.7806845696676206
- LR   | ap: 0.3986324868099709, auc: 0.7804974686330618
- RF   | ap: 0.43734417259145686, auc: 0.788895003301783

#### min_df > 2
- LGBM | ap: 0.41114784456628284, auc: 0.7786154523442659
- LR   | ap: 0.3809915480394665, auc: 0.7685009905348889
- RF   | ap: 0.4035174762881823, auc: 0.7769480519480519

In [262]:
# Pairwise correlation between the three models' validation scores; less
# correlated models generally have more to gain from blending.
pd.DataFrame({'RF': pRf,'LR': pLr, 'LGBM': pLgbm}).corr()

Unnamed: 0,RF,LR,LGBM
RF,1.0,0.830465,0.91973
LR,0.830465,1.0,0.820297
LGBM,0.91973,0.820297,1.0


In [315]:
# Weighted blend of the tuned random forest and tuned LightGBM scores (weights
# sum to 1). The logistic regression scores are not part of this blend.
p = (0.57*pRf + 0.43*pLgbm)
print(f'ap: {average_precision_score(yval,p)}, auc: {roc_auc_score(yval, p)}')

# Results recorded from earlier runs:
# ap: 0.4325093482032139, auc: 0.7893132291437376
# ap: 0.4333159558286226, auc: 0.7887739379264804

ap: 0.4330701963522659, auc: 0.7886198547215497


# Save Model

In [136]:
import joblib as jb

In [None]:
# jb.dump(modelRf,'D:/01-DataScience/04-Projetos/00-Git/Youtube-Video-Recommendations/Model/modelRf.pkl.z')

# jb.dump(modelRf,'D:/01-DataScience/04-Projetos/00-Git/Youtube-Video-Recommendations/Model/modelRf.pkl.z')
# jb.dump(modelLgbm,'D:/01-DataScience/04-Projetos/00-Git/Youtube-Video-Recommendations/Model/modelLgbm.pkl.z')
# jb.dump(modelLr,'D:/01-DataScience/04-Projetos/00-Git/Youtube-Video-Recommendations/Model/modelLr.pkl.z')