In [1]:
import pandas as pd 
import numpy as np
import os
import sklearn
import joblib
import xgboost as xgb
import matplotlib.pyplot as plt
import warnings
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, BaggingRegressor
from sklearn.multioutput import MultiOutputRegressor
from test_diff_models import *
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from engagement_utils import *

warnings.filterwarnings('ignore')
post_data = pd.read_csv('/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Data/Original_dataset/processed_data.csv')
post_data.drop('original_index', axis=1, inplace=True)
features = [x for x in post_data.columns if x not in target_cols]

features = post_data[features]
targets = post_data[target_cols]

processed_features = pre_process(features)
processed_targets = process_targets(targets)
x_train, x_test, y_train, y_test = train_test_split(processed_features, processed_targets, test_size=0.2, random_state=42)

y_test = decode_targets(y_test)

FileNotFoundError: [Errno 2] No such file or directory: '/workspaces/Crowdfunding-Social-Media-Drivers/Data/Original_dataset/processed_data.csv'

In [23]:
reg_test = RegressionModels(x_train, x_test, y_train.iloc[:,0], y_test.iloc[:,0])
reg_test.run_evaluation()
reg_test.top_10_socres

Training models: 100%|██████████| 40/40 [01:50<00:00,  2.77s/it]
Evaluating models: 100%|██████████| 39/39 [00:02<00:00, 13.30it/s]


Unnamed: 0,RMSE,MAE,R2,MSLE,EV Score
RandomForestRegressor,34.43,14.857,0.661,0.167,0.67
HistGradientBoostingRegressor,34.679,14.939,0.656,0.169,0.665
BaggingRegressor,34.749,14.955,0.654,0.179,0.665
XGBRegressor,35.07,15.407,0.648,0.189,0.654
GradientBoostingRegressor,37.823,16.409,0.591,0.208,0.606
KNeighborsRegressor,38.652,17.315,0.573,0.266,0.582
ExtraTreesRegressor,39.998,17.849,0.542,0.239,0.56
AdaBoostRegressor,42.439,19.327,0.485,0.295,0.502
DecisionTreeRegressor,43.513,18.941,0.458,0.355,0.458
ARDRegression,53.035,26.959,0.195,0.694,0.244


In [25]:
param_grid = {
    'estimator__learning_rate': [0.1, 0.01, 0.05],
    'estimator__max_iter': [100, 200, 300],
    'estimator__max_depth': [None, 3, 5],
    'estimator__random_state': [42]
}

hist_grid_search = GridSearchCV(
    MultiOutputRegressor(HistGradientBoostingRegressor()),
    param_grid, cv=5, 
    scoring='neg_mean_squared_error', 
    verbose=2, n_jobs=-1
)

hist_grid_search.fit(x_train, y_train)

hist_best_params = hist_grid_search.best_params_
hist_best_estimator = hist_grid_search.best_estimator_

pprint(hist_best_params)

hist_best_estimator.fit(x_train, y_train)
save_model(hist_best_estimator, 'HistGradientBoost')

hist_pred = hist_best_estimator.predict(x_test)
hist_pred = decode_targets(hist_pred).astype(int)

hist_scores = evaluate_model_performance(y_test, hist_pred)
hist_scores

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END estimator__learning_rate=0.1, estimator__max_depth=None, estimator__max_iter=100, estimator__random_state=42; total time=   4.4s
[CV] END estimator__learning_rate=0.1, estimator__max_depth=None, estimator__max_iter=100, estimator__random_state=42; total time=   4.4s
[CV] END estimator__learning_rate=0.1, estimator__max_depth=None, estimator__max_iter=100, estimator__random_state=42; total time=   4.4s
[CV] END estimator__learning_rate=0.1, estimator__max_depth=None, estimator__max_iter=100, estimator__random_state=42; total time=   4.4s
[CV] END estimator__learning_rate=0.1, estimator__max_depth=None, estimator__max_iter=100, estimator__random_state=42; total time=   4.6s
[CV] END estimator__learning_rate=0.1, estimator__max_depth=None, estimator__max_iter=200, estimator__random_state=42; total time=   8.3s
[CV] END estimator__learning_rate=0.1, estimator__max_depth=None, estimator__max_iter=200, estimator__random_s

Unnamed: 0,RMSE,R2,MSLE,EV Score
likes,34.753422,0.65439,0.166498,0.666743
shares,16.302093,0.456943,0.515671,0.483227
comments,24.027714,0.590026,0.507668,0.603402
positive_reactions,18.340882,0.535504,0.457282,0.554334
negative_reactions,43.634842,0.188065,0.89412,0.211936


In [28]:
param_grid = {
    'n_estimators': [500, 1000],
    'max_features': [1, 'sqrt', 'log2'],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 3],
    'random_state': [42]
}

rf_grid_search = GridSearchCV(
    RandomForestRegressor(criterion="poisson", random_state=42), 
    param_grid, cv=5, 
    scoring='neg_mean_squared_error', 
    verbose=3, n_jobs=-1
    )

rf_grid_search.fit(x_train, y_train)

rf_best_params = rf_grid_search.best_params_
rf_best_estimator = rf_grid_search.best_estimator_

rf_best_estimator.fit(x_train, y_train)
save_model(rf_best_estimator, 'RandomForest')
rf_pred = rf_best_estimator.predict(x_test)
rf_pred = decode_targets(rf_pred).astype(int)

evaluate_model_performance(y_test, rf_pred)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 2/5] END max_features=1, min_samples_leaf=1, min_samples_split=2, n_estimators=500, random_state=42;, score=-0.873 total time=   7.3s
[CV 1/5] END max_features=1, min_samples_leaf=1, min_samples_split=2, n_estimators=500, random_state=42;, score=-0.860 total time=   7.2s
[CV 4/5] END max_features=1, min_samples_leaf=1, min_samples_split=2, n_estimators=500, random_state=42;, score=-0.852 total time=   7.4s
[CV 3/5] END max_features=1, min_samples_leaf=1, min_samples_split=2, n_estimators=500, random_state=42;, score=-0.875 total time=   7.4s
[CV 5/5] END max_features=1, min_samples_leaf=1, min_samples_split=2, n_estimators=500, random_state=42;, score=-0.822 total time=   7.1s
[CV 1/5] END max_features=1, min_samples_leaf=1, min_samples_split=2, n_estimators=1000, random_state=42;, score=-0.860 total time=  14.6s
[CV 2/5] END max_features=1, min_samples_leaf=1, min_samples_split=2, n_estimators=1000, random_state=42;, sc

Unnamed: 0,RMSE,R2,MSLE,EV Score
likes,45.329096,0.412043,0.313618,0.452447
shares,19.404912,0.230548,0.587398,0.282567
comments,29.296647,0.390509,0.572882,0.423321
positive_reactions,22.33594,0.311109,0.553171,0.3532
negative_reactions,47.434147,0.040519,0.91375,0.070088


In [3]:
model = joblib.load('/workspaces/Crowdfunding-Social-Media-Drivers/Modelling/Enagement_prediction/HistGradientBoost.pkl')
model

In [11]:
topic_cols = [x for x in post_data.columns if 'topic' in x]
text_complexity_cols = ['readability','readability_lix', 'entropy_scores', 'perplexity_scores']
text_emotionality_cols = ['fear','anger', 'anticip', 'trust', 'surprise', 'positive', 'negative', 'sadness', 'disgust', 'joy']
post_data.columns

Index(['page_name', 'likes_at_posting', 'followers_at_posting', 'type',
       'likes', 'comments', 'shares', 'post_views', 'readability',
       'readability_lix', 'entropy_scores', 'perplexity_scores', 'fear',
       'anger', 'anticip', 'trust', 'surprise', 'positive', 'negative',
       'sadness', 'disgust', 'joy', 'entities_identified', 'post_sponsored',
       'post_age', 'page_age', 'positive_reactions', 'negative_reactions',
       'topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5',
       'topic_6', 'topic_7', 'topic_8', 'topic_9', 'topic_10', 'topic_11'],
      dtype='object')

In [None]:
save_partial_dependence_plots(model, 
                            topic_cols,
                            x_test, 
                            feature_name = 'topics', 
                            categirical_features=None)

In [None]:
save_partial_dependence_plots(model, 
                            text_complexity_cols,
                            x_test, 
                            feature_name = 'text_complexity_cols', 
                            categirical_features=None)

In [None]:
save_partial_dependence_plots(model, 
                            text_emotionality_cols,
                            x_test, 
                            feature_name = 'emotionality', 
                            categirical_features=None)