## Import

In [67]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

## Data preparation

In [68]:
clean_df = pd.read_csv('../data/processed/video_data_processed.csv')

In [69]:
clean_df.sample(5)

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,commentCount,duration,definition,caption,hour,day,day_of_week,month,year
18419,vZuObCbSRQg,DataCamp,Live Code Along: Machine Learning with XGBoost...,"During this code-along, Lis Sulmont, Workspace...",python|xgboost|data science|coding|data|analyt...,2021-11-16 19:50:19+00:00,5203,140,1,4860,hd,False,19,16,1,11,2021
25377,2ethDz9KnLk,Yannic Kilcher,The hidden dangers of loading open-source AI m...,#huggingface #pickle #exploit \n\nDid you know...,deep learning|machine learning|arxiv|explained...,2022-09-02 20:16:53+00:00,50473,2170,111,1183,hd,False,20,2,4,9,2022
34359,UW83kBhsRxA,Codanics,Book Reading_day1 #datascience #python #datavi...,#datascience #datavisualization #python,data science|books|books to readrdu|hindi|what...,2023-02-16 19:31:01+00:00,833,38,2,2838,hd,False,19,16,3,2,2023
37486,pHYEJOVUsLE,Data Science Tutorials,Power BI Min Dax Function to Find Minimum Valu...,Power BI dax function tutorial for beginners o...,power bi dax function tutorial for beginners|p...,2021-07-26 04:30:08+00:00,3913,14,3,292,hd,False,4,26,0,7,2021
56559,QHX47iGMlUE,The TWIML AI Podcast with Sam Charrington,Trends in Natural Language Processing with Nas...,Today we continue the AI Rewind 2019 joined by...,ai|allennlp|artificial|bert|bias|cognition|dat...,2020-01-09 22:52:00+00:00,1490,20,2,4338,hd,False,22,9,3,1,2020


## Feature engineering

In [70]:
X = clean_df[['viewCount','likeCount', 'commentCount', 'hour', 'day', 'month', 'year']]
# Tạo cột nhãn, ví dụ: nếu viewCount lớn hơn một ngưỡng nào đó, coi video đó là phổ biến
threshold_viewCount = clean_df['viewCount'].quantile(0.7)
clean_df['isTrending'] = (clean_df['viewCount'] > threshold_viewCount).astype(int)
y = clean_df['isTrending']

In [71]:
clean_df.sample(5)


Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,commentCount,duration,definition,caption,hour,day,day_of_week,month,year,isTrending
31141,qxGxr5ZHy78,DataTrained,Uncovering the Secret to Finding the Perfect I...,#datatrained #InvestmentBanker #howtochooseape...,Investment Banker|Roles of an Investment Banke...,2023-03-28 06:30:24+00:00,48,1,1,140,hd,True,6,28,1,3,2023,0
32756,fLWxBWOt5Ek,FUN WITH DATA SCIENCE,Convert any .mp4 to .gif using python | mini p...,"Hello Everyone, In this video I have told you ...",python programming|python tutorial|learn pytho...,2021-07-24 12:28:18+00:00,116,12,0,281,hd,False,12,24,5,7,2021,0
42839,SosTxTbpZX8,itversity,Cloudera Administration - Google Cloud Platfor...,(nodescription),sqoop cloudera tutorial|hadoop administration ...,2018-10-16 16:48:32+00:00,1915,4,2,840,hd,False,16,16,1,10,2018,0
16391,9spHWKAk3L8,MITCBMM,15 - FMRI Analysis Start to End: Part 5 of 5,"Rick Reynolds, NIMH\n\nFor more information an...",CBMM|Center for Brains Minds and Machines|Arti...,2018-09-14 17:44:48+00:00,469,5,0,1159,hd,True,17,14,4,9,2018,0
25854,VzIO5_R9XEM,DigitalSreeni,253 - Unpaired image to image translation​ usi...,"(No code in this tutorial, please watch the ne...",microscopy|python|image processing,2022-01-19 08:00:08+00:00,9535,199,17,1552,hd,False,8,19,2,1,2022,0


## Training model

In [72]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y ,test_size=0.2)
model = LogisticRegression(solver = 'liblinear',max_iter=1000)
model.fit(X_train, y_train)

## Evaluation

In [73]:
model.score(X_valid,y_valid)

0.9996668609977513

## Prediction

In [74]:
y_pred = model.predict(X_valid)

In [75]:
precision_score(y_valid,y_pred), recall_score(y_valid,y_pred)

(0.9994485800937414, 0.9994485800937414)

In [76]:
print(classification_report(y_valid,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8380
           1       1.00      1.00      1.00      3627

    accuracy                           1.00     12007
   macro avg       1.00      1.00      1.00     12007
weighted avg       1.00      1.00      1.00     12007



# TEMP


In [77]:
from sklearn.linear_model import Lasso, ElasticNet, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
seed = 2023
models = [
    Lasso(random_state=seed),
    ElasticNet(random_state=seed),
    Ridge(random_state=seed),
    GradientBoostingRegressor(random_state=seed),
    RandomForestRegressor(random_state=seed),
    ExtraTreesRegressor(random_state=seed),
    AdaBoostRegressor(random_state=seed),
    DecisionTreeRegressor(random_state=seed)
]

In [79]:
def generate_baseline_result(models, X, y, metrics, cv=5, plot_result=False):
    # define k-fold
    kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        scores = cross_val_score(model, X, y, scoring=metrics, cv=kfold)
        for fold_idx, score in enumerate(scores):
            entries.append((model_name, fold_idx, score))

    cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'score'])
    
    mean = cv_df.groupby('model_name')['score'].mean()
    std = cv_df.groupby('model_name')['score'].std()
    
    baseline_results = pd.concat([mean,std],axis =1, ignore_index=True)
    baseline_results.columns =['Mean','Standard Deviation']
    baseline_results.sort_values(by=['Mean'], ascending = False, inplace = True)
    
    return baseline_results

# Metric: https://scikit-learn.org/stable/modules/model_evaluation.html
generate_baseline_result(models, X_train, y_train, metrics='neg_mean_absolute_error')

Unnamed: 0_level_0,Mean,Standard Deviation
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
DecisionTreeRegressor,-2.1e-05,4.7e-05
GradientBoostingRegressor,-3.2e-05,4.7e-05
RandomForestRegressor,-3.4e-05,6e-05
AdaBoostRegressor,-4.2e-05,9.3e-05
ExtraTreesRegressor,-0.007557,0.000353
Ridge,-0.396357,0.00045
ElasticNet,-0.403792,0.000254
Lasso,-0.403854,0.000265
