In [None]:
import pandas as pd 
import numpy as np
import os
import sklearn
import joblib
import xgboost as xgb
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor

from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from engagement_utils import *

# Load the processed post dataset and split it into model inputs and targets.
# NOTE(review): this is an absolute, environment-specific path; consider making it
# relative to a configurable data directory.
DATA_PATH = '/workspaces/Crowdfunding-Social-Media-Drivers/Data/Original_dataset/processed_data.csv'
post_data = pd.read_csv(DATA_PATH)

# `target_cols` is not defined in this notebook; presumably it is provided by the
# wildcard import from engagement_utils. Every other column is used as a feature.
feature_cols = [col for col in post_data.columns if col not in target_cols]
features = post_data[feature_cols]
targets = post_data[target_cols]

# Project helpers transform the raw inputs and targets before modelling.
processed_features = pre_process(features)
processed_targets = process_targets(targets)

# Fixed 80/20 split; random_state pins the shuffle so reruns give the same partition.
x_train, x_test, y_train, y_test = train_test_split(
    processed_features,
    processed_targets,
    test_size=0.2,
    random_state=42,
)

# Only the held-out targets are decoded here; y_train stays in the processed form
# that the models are fit on.
y_test = decode_targets(y_test)

In [None]:
# XGBoost: exhaustive grid search over tree count, learning rate and depth,
# followed by evaluation on the held-out split.
# 'random_state' and 'n_jobs' are single-valued grid entries, so they are held
# fixed for every candidate and also show up in best_params_.
xgb_param_grid = {
    'n_estimators': [1000, 5000],
    'learning_rate': [0.05, 0.001, 0.01, 0.1],
    'max_depth': [3, 5, 10],
    'random_state': [42],
    'n_jobs': [-1]
}

# NOTE(review): both the search (n_jobs=-1) and every XGBRegressor candidate
# (n_jobs=-1) request all cores, which may oversubscribe the CPU; consider
# parallelising at only one of the two levels.
xgb_grid_search = GridSearchCV(
    xgb.XGBRegressor(),
    xgb_param_grid, cv=5,
    scoring='neg_mean_squared_error',
    verbose=2, n_jobs=-1
    )

xgb_grid_search.fit(x_train, y_train)

xgb_best_params = xgb_grid_search.best_params_
# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# refit on all of x_train / y_train with the best parameters. Fitting it again
# would only repeat that (expensive) training run, so it is used as-is.
xgb_best_estimator = xgb_grid_search.best_estimator_

pprint(xgb_best_params)

save_model(xgb_best_estimator, 'xgBoost')

# Predictions are decoded with the same helper applied to y_test in the first
# cell so both sides of the comparison go through decode_targets.
# NOTE(review): .astype(int) presumably truncates the fractional part rather
# than rounding — confirm that is intended.
xgb_pred = xgb_best_estimator.predict(x_test)
xgb_pred = decode_targets(xgb_pred).astype(int)

xgb_scores = evaluate_model_performance(y_test, xgb_pred)
xgb_scores

In [None]:
# Random forest: exhaustive grid search, followed by evaluation on the
# held-out split.
estimators = [1000, 2000]
# Named `max_features` (not `features`) so the `features` DataFrame built in
# the first cell is not overwritten by this list.
# scikit-learn treats an int as an absolute feature count and a float as a
# fraction: 1.0 means "all features", whereas the int 1 would restrict every
# split to a single candidate feature.
max_features = [0.33, 1.0, 'sqrt']
samples_leaf = [1, 5]
bootstrap = [True]
min_samples_split = [2, 5, 10]


random_grid = {'n_estimators': estimators,
               'max_features': max_features,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': samples_leaf,
               'bootstrap': bootstrap}

rf_grid_search = GridSearchCV(
    RandomForestRegressor(criterion="poisson", random_state=42),
    random_grid, cv=5,
    scoring='neg_mean_squared_error',
    verbose=2, n_jobs=-1
    )

rf_grid_search.fit(x_train, y_train)

rf_best_params = rf_grid_search.best_params_
# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# refit on all of x_train / y_train with the best parameters; no second
# .fit() call is needed.
rf_best_estimator = rf_grid_search.best_estimator_

# Report the selected hyper-parameters, as the XGBoost cell does.
pprint(rf_best_params)

save_model(rf_best_estimator, 'RandomForest')

# NOTE(review): .astype(int) presumably truncates the fractional part rather
# than rounding — confirm that is intended.
rf_pred = rf_best_estimator.predict(x_test)
rf_pred = decode_targets(rf_pred).astype(int)

rf_scores = evaluate_model_performance(y_test, rf_pred)
rf_scores

In [None]:
# Rebind xgb_best_estimator to the model returned by load_model('xgBoost') —
# presumably the one persisted by save_model(..., 'xgBoost') in the XGBoost
# training cell; verify against engagement_utils.
xgb_best_estimator = load_model('xgBoost')
# NOTE(review): y_test was passed through decode_targets in the first cell,
# while the model was fit on the output of process_targets — confirm that
# save_all_plots expects decoded targets.
save_all_plots(xgb_best_estimator, x_test, y_test)

In [None]:
# Maps each topic index (as a string, '0' through '10') to its human-readable
# label. The position of a label in the list below is its topic index.
topic_idx2label = {
    str(topic_idx): topic_label
    for topic_idx, topic_label in enumerate([
        'Awards and recognitions',
        'Portable Tech',
        'Movies and Entertainment',
        'LifeTech Innovations',
        'Cultural connections',
        'Humanitarian',
        'Media and Events',
        'Digital and Social trends',
        'Household Tech',
        'Small Business',
        'Tech and Travel Gear',
    ])
}