## 1. Importação

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix

from lightgbm import LGBMClassifier

In [2]:
df = pd.read_excel('raw_data_with_full_label.xlsx', sheet_name='links', index_col=0).dropna()

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1452 entries, 0 to 1766
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   link         1452 non-null   object 
 1   title        1452 non-null   object 
 2   canal_nome   1452 non-null   object 
 3   canal_link   1452 non-null   object 
 4   view_counts  1452 non-null   float64
 5   video_date   1452 non-null   object 
 6   like         1452 non-null   object 
 7   dislike      1452 non-null   object 
 8   y            1452 non-null   float64
dtypes: float64(2), object(7)
memory usage: 113.4+ KB


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

## 2. Limpeza e tratamento dos dados

In [5]:
df_limpo = pd.DataFrame(index=df.index)
df_limpo['title'] = df['title']
df_limpo['views'] = df['view_counts']

In [6]:
clean_date = df['video_date'].str.extract(r'(\d+) de (\w+)\. de (\d+)')
clean_date[0] = clean_date[0].map(lambda x: "0"+ x[0] if len(x[0]) == 1 else x)

mapa_meses = {
    'jan': 'Jan',
    'fev': 'Feb',
    'mar': 'Mar',
    'abr': 'Apr',
    'mai': 'May',
    'jun': 'Jun',
    'jul': 'Jul',
    'ago': 'Aug',
    'set': 'Sep',
    'out': 'Oct',
    'nov': 'Nov',
    'dez': 'Dec'
}

clean_date[1] = clean_date[1].map(mapa_meses)
clean_date = clean_date.apply(lambda x: ' '.join(x), axis=1)
clean_date.head()
df_limpo['date'] = pd.to_datetime(clean_date, format='%d %b %Y')

In [7]:
indexs = df_limpo['date'].loc[df_limpo['date'].dt.year < 2018].index.values

df_limpo.drop(index=indexs, axis='rows', inplace=True)
df.drop(index=indexs, axis='rows', inplace=True)

In [8]:
features = pd.DataFrame(index=df_limpo.index)
y = df['y'].copy()

features['tempo_desde_pub'] = (pd.to_datetime('2021-03-18') - df_limpo['date']) / np.timedelta64(1, 'D')
features['views'] = df_limpo['views']
features['views_por_dia'] = features['views'] / features['tempo_desde_pub']
features.drop(columns=['tempo_desde_pub'], inplace=True)

In [9]:
qtd = len(df_limpo['date'])
meio = qtd // 2 if qtd % 2 == 0 else (qtd // 2) + 1
date = str(pd.Timestamp(df_limpo.sort_values(by='date')['date'].values[meio]).date())
date

'2020-03-02'

In [10]:
mask_train = (df_limpo['date'] < date)
mask_val = (df_limpo['date'] >= date)

X_train, X_val = features[mask_train], features[mask_val]
y_train, y_val = y[mask_train], y[mask_val]
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((645, 2), (662, 2), (645,), (662,))

## 3. Criação de feature

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

title_train = df_limpo[mask_train]['title']
title_val = df_limpo[mask_val]['title']

title_vec = TfidfVectorizer(min_df=2, ngram_range=(1,4))
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [12]:
from scipy.sparse import hstack

Xtrain_wtitle = hstack([X_train, title_bow_train])
Xval_wtitle = hstack([X_val, title_bow_val])

In [13]:
Xtrain_wtitle.shape, Xval_wtitle.shape

((645, 1550), (662, 1550))

## 4. Random Forest

In [14]:
mdl_rf = RandomForestClassifier(n_estimators=1000, random_state=0, min_samples_leaf=1, class_weight='balanced', n_jobs=6)
mdl_rf.fit(Xtrain_wtitle, y_train)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=6,
                       random_state=0)

In [15]:
p_rf = mdl_rf.predict_proba(Xval_wtitle)[:,1]

In [16]:
average_precision_score(y_val, p_rf), roc_auc_score(y_val, p_rf)

(0.2701322613168307, 0.7720751633986929)

## 5. LGBM

In [17]:
params = [0.011144682437501488, 2, 5, 0.890697915175696, 0.06794603708438668, 246, 3, 4]
lr = params[0]
max_depth = params[1]
min_child_samples = params[2]
subsample = params[3]
colsample_bytree = params[4]
n_estimators = params[5]

min_df = params[6]
ngram_range = (1, params[7])

title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

Xtrain_wtitle = hstack([X_train, title_bow_train])
Xval_wtitle = hstack([X_val, title_bow_val])

mdl_lgbm = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth, min_child_samples=min_child_samples, subsample=subsample, colsample_bytree=colsample_bytree, bagging_freq=1, n_estimators=n_estimators, random_state=0, class_weigth='balanced', n_jobs=6)

mdl_lgbm.fit(Xtrain_wtitle, y_train)
p_lgbm = mdl_lgbm.predict_proba(Xval_wtitle)[:, 1]



In [18]:
average_precision_score(y_val, p_lgbm), roc_auc_score(y_val, p_lgbm)

(0.2659388454666947, 0.7566993464052287)

## 6. Logistic Regression

In [19]:
from sklearn.pipeline import make_pipeline

In [30]:
title_vec = TfidfVectorizer(min_df=2, ngram_range=(1,4))
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

Xtrain_wtitle = hstack([X_train, title_bow_train])
Xval_wtitle = hstack([X_val, title_bow_val])

mld_pipe = make_pipeline(MaxAbsScaler(), LogisticRegression(C=10, n_jobs=6, random_state=0))
mld_pipe.fit(Xtrain_wtitle2, y_train)
p_lr = mld_pipe.predict_proba(Xval_wtitle2)[:, 1]

In [31]:
average_precision_score(y_val, p_lr), roc_auc_score(y_val, p_lr)

(0.26927648885823324, 0.7180718954248365)

## 7. Ensemble 

(0.2701322613168307, 0.7720751633986929) - RF

(0.2659388454666947, 0.7566993464052287) - LGBM

(0.2680535000340579, 0.7138888888888889) - LR

In [32]:
p = (p_rf + p_lgbm + p_lr) / 3
average_precision_score(y_val, p), roc_auc_score(y_val, p)

(0.3010721705884503, 0.7786928104575164)

In [33]:
pd.DataFrame({'LR': p_lr, 'RF': p_rf, 'LGBM': p_lgbm}).corr()

Unnamed: 0,LR,RF,LGBM
LR,1.0,0.709387,0.606654
RF,0.709387,1.0,0.79371
LGBM,0.606654,0.79371,1.0


In [60]:
p = 0.3 * p_lr + 0.7 * p_lgbm
average_precision_score(y_val, p), roc_auc_score(y_val, p)

(0.30707416539893634, 0.7704901960784313)

(0.29405180130052905, 0.7679084967320261) - 0.5 * p_lr + 0.5 * p_rf
(0.29457324235808463, 0.7836928104575163) - 0.5 * p_rf + 0.5 * p_lgbm
(0.29828285365305823, 0.7614705882352941) - 0.5 * p_lr + 0.5 * p_lgbm
(0.2984451868245137, 0.7752614379084968) - 0.2 * p_lr + 0.8 * p_rf
(0.30002086336716705, 0.7723202614379086) - 0.25 * p_lr + 0.75 * p_lgbm
(0.30707416539893634, 0.7704901960784313) - 0.3 * p_lr + 0.7 * p_lgbm