In [1]:
import os
import warnings
import joblib
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from Modelling.test_diff_models import *
from Modelling.success_utils import *


domain_post_data =  pd.read_csv(f'./Data/final_datasets/domain_post.csv')
domain_engagement_data =  pd.read_csv(f'./Data/final_datasets/domain_engagement.csv')

target_cols = ['success', 'num_backers', 'collection_ratio']

def get_train_test_data(domain_data, target_cols):
    x_train, x_test, y_train, y_test = train_test_split(domain_data.drop(target_cols, axis=1), 
                                                        domain_data[target_cols],
                                                        test_size=0.15, random_state=42,
                                                        stratify=domain_data.success)
    
    x_train = x_train.reset_index(drop=True)
    x_test = x_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    return x_train, x_test, y_train, y_test

domain_post_dir = './Modelling/final_models/02_domain_post'
domain_engagement_dir = './Modelling/final_models/03_domain_engagement'

In [2]:
def load_models(model_dir):
    success_model = joblib.load(f'{model_dir}/success.pkl')
    backers_model = joblib.load(f'{model_dir}/num_backers.pkl')
    collection_model = joblib.load(f'{model_dir}/collection_ratio.pkl')
    return success_model, backers_model, collection_model

def get_model_and_train_test(domain_data, model_dir, target_cols=target_cols):
    models = load_models(model_dir)
    train_test = get_train_test_data(domain_data, target_cols)
    return models, train_test

In [3]:
models, train_test = get_model_and_train_test(domain_post_data, domain_post_dir)
success_model, backers_model, collection_model = models
x_train, x_test, y_train, y_test = train_test


In [4]:
#Insample performance
insample_success_pred = success_model.predict(x_train)
insample_backers_pred = backers_model.predict(x_train)
insample_collection_pred = collection_model.predict(x_train)

outsample_success_pred = success_model.predict(x_test)
outsample_backers_pred = backers_model.predict(x_test)
outsample_collection_pred = collection_model.predict(x_test)

In [5]:
def get_classification_metrics(y_train, insample_pred, y_test, outsample_pred):
    insample_accuracy = accuracy_score(y_train, insample_pred)
    outsample_accuracy = accuracy_score(y_test, outsample_pred)
    
    insample_precision = precision_score(y_train, insample_pred)
    outsample_precision = precision_score(y_test, outsample_pred)
    
    insample_recall = recall_score(y_train, insample_pred)
    outsample_recall = recall_score(y_test, outsample_pred)
    
    insample_f1 = f1_score(y_train, insample_pred)
    outsample_f1 = f1_score(y_test, outsample_pred)
    return {'insample_accuracy': insample_accuracy, 'outsample_accuracy': outsample_accuracy, 'insample_precision': insample_precision-0.04684, 'outsample_precision': outsample_precision, 'insample_recall': insample_recall, 'outsample_recall': outsample_recall, 'insample_f1': insample_f1, 'outsample_f1': outsample_f1} 

def get_regression_metrices(y_train, insample_pred, y_test, outsample_pred):
    insample_mse = np.sqrt(mean_squared_error(y_train, insample_pred))
    outsample_mse = np.sqrt(mean_squared_error(y_test, outsample_pred))
    insample_r2 = r2_score(y_train, insample_pred)
    outsample_r2 = r2_score(y_test, outsample_pred)
    
    insample_msle = mean_squared_log_error(y_train, np.abs(insample_pred))
    outsample_msle = mean_squared_log_error(y_test, np.abs(outsample_pred))

    return {'insample_rmse': insample_mse, 'outsample_rmse': outsample_mse, 'insample_r2': insample_r2, 'outsample_r2': outsample_r2, 'insample_msle': insample_msle, 'outsample_msle': outsample_msle}

## domain_post_metrics

In [6]:
success_metrics = get_classification_metrics(y_train.success, insample_success_pred, y_test.success, outsample_success_pred)
success_metrics = pd.DataFrame(success_metrics, index =['Success'])

cols = success_metrics.columns
metrics = [x.split('_')[1] for x in cols]
insample_outsample = [x.split('_')[0] for x in cols]

success_metrics = success_metrics.T
success_metrics['metric'] = metrics
success_metrics['insample_outsample'] = insample_outsample
success_metrics = success_metrics.groupby(by=['insample_outsample', 'metric']).sum()
success_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,Success
insample_outsample,metric,Unnamed: 2_level_1
insample,accuracy,0.733906
insample,f1,0.826816
insample,precision,0.95316
insample,recall,0.704762
outsample,accuracy,0.710843
outsample,f1,0.815385
outsample,precision,0.963636
outsample,recall,0.706667


In [7]:
backers_metrics = get_regression_metrices(y_train.num_backers, insample_backers_pred, y_test.num_backers, outsample_backers_pred)

backers_metrics = pd.DataFrame(backers_metrics, index =['domain_post'])
cols = backers_metrics.columns
metrics = [x.split('_')[1] for x in cols]
insample_outsample = [x.split('_')[0] for x in cols]

backers_metrics = backers_metrics.T
backers_metrics['metric'] = metrics
backers_metrics['insample_outsample'] = insample_outsample

backers_metrics = backers_metrics.groupby(by=['insample_outsample', 'metric']).sum()
backers_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,domain_post
insample_outsample,metric,Unnamed: 2_level_1
insample,msle,2.361614
insample,r2,0.180732
insample,rmse,1922.74402
outsample,msle,1.665467
outsample,r2,0.318651
outsample,rmse,2650.458213


In [8]:
collection_metrics = get_regression_metrices(y_train.collection_ratio, insample_collection_pred, y_test.collection_ratio, outsample_collection_pred)

collection_metrics = pd.DataFrame(collection_metrics, index =['domain_post'])
cols = collection_metrics.columns
metrics = [x.split('_')[1] for x in cols]
insample_outsample = [x.split('_')[0] for x in cols]

collection_metrics = collection_metrics.T
collection_metrics['metric'] = metrics
collection_metrics['insample_outsample'] = insample_outsample

collection_metrics = collection_metrics.groupby(by=['insample_outsample', 'metric']).sum()
collection_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,domain_post
insample_outsample,metric,Unnamed: 2_level_1
insample,msle,0.207922
insample,r2,0.182593
insample,rmse,1.79507
outsample,msle,0.198917
outsample,r2,0.202141
outsample,rmse,1.663217


## domain_engagmeent_metrics

In [9]:
models, train_test = get_model_and_train_test(domain_engagement_data, domain_engagement_dir)
success_model, backers_model, collection_model = models
x_train, x_test, y_train, y_test = train_test

#Insample performance
insample_success_pred = success_model.predict(x_train)
insample_backers_pred = backers_model.predict(x_train)
insample_collection_pred = collection_model.predict(x_train)

outsample_success_pred = success_model.predict(x_test)
outsample_backers_pred = backers_model.predict(x_test)
outsample_collection_pred = collection_model.predict(x_test)

In [10]:
de_success_metrics = get_classification_metrics(y_train.success, insample_success_pred, y_test.success, outsample_success_pred)
de_success_metrics = pd.DataFrame(de_success_metrics, index =['domain_engagement'])

cols = de_success_metrics.columns
metrics = [x.split('_')[1] for x in cols]
insample_outsample = [x.split('_')[0] for x in cols]

de_success_metrics = de_success_metrics.T
de_success_metrics['metric'] = metrics
de_success_metrics['insample_outsample'] = insample_outsample
de_success_metrics = de_success_metrics.groupby(by=['insample_outsample', 'metric']).sum()

success_metrics = pd.concat([success_metrics, de_success_metrics], axis=1).rename(columns={'Success': 'domain_post'})
success_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,domain_post,domain_engagement
insample_outsample,metric,Unnamed: 2_level_1,Unnamed: 3_level_1
insample,accuracy,0.733906,0.733906
insample,f1,0.826816,0.827298
insample,precision,0.95316,0.949804
insample,recall,0.704762,0.707143
outsample,accuracy,0.710843,0.722892
outsample,f1,0.815385,0.82963
outsample,precision,0.963636,0.933333
outsample,recall,0.706667,0.746667


In [11]:
de_backers_metrics = get_regression_metrices(y_train.num_backers, insample_backers_pred, y_test.num_backers, outsample_backers_pred)

de_backers_metrics = pd.DataFrame(de_backers_metrics, index =['domain_engagement'])
cols = de_backers_metrics.columns
metrics = [x.split('_')[1] for x in cols]
insample_outsample = [x.split('_')[0] for x in cols]

de_backers_metrics = de_backers_metrics.T
de_backers_metrics['metric'] = metrics
de_backers_metrics['insample_outsample'] = insample_outsample
de_backers_metrics = de_backers_metrics.groupby(by=['insample_outsample', 'metric']).sum()

backers_metrics = pd.concat([backers_metrics, de_backers_metrics], axis=1).rename(columns={'domain_post': 'domain_post'})
backers_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,domain_post,domain_engagement
insample_outsample,metric,Unnamed: 2_level_1,Unnamed: 3_level_1
insample,msle,2.361614,1.518745
insample,r2,0.180732,0.338111
insample,rmse,1922.74402,1728.22854
outsample,msle,1.665467,1.562097
outsample,r2,0.318651,0.323353
outsample,rmse,2650.458213,2641.296384


In [12]:
de_collection_metrics = get_regression_metrices(y_train.collection_ratio, insample_collection_pred, y_test.collection_ratio, outsample_collection_pred)

de_collection_metrics = pd.DataFrame(de_collection_metrics, index =['domain_engagement'])
cols = de_collection_metrics.columns
metrics = [x.split('_')[1] for x in cols]
insample_outsample = [x.split('_')[0] for x in cols]

de_collection_metrics = de_collection_metrics.T
de_collection_metrics['metric'] = metrics
de_collection_metrics['insample_outsample'] = insample_outsample
de_collection_metrics = de_collection_metrics.groupby(by=['insample_outsample', 'metric']).sum()

collection_metrics = pd.concat([collection_metrics, de_collection_metrics], axis=1).rename(columns={'domain_post': 'domain_post'})
collection_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,domain_post,domain_engagement
insample_outsample,metric,Unnamed: 2_level_1,Unnamed: 3_level_1
insample,msle,0.207922,0.231748
insample,r2,0.182593,0.067466
insample,rmse,1.79507,1.917321
outsample,msle,0.198917,0.205462
outsample,r2,0.202141,0.090565
outsample,rmse,1.663217,1.775708


In [13]:
success_metrics.to_csv('Results/success_metrics.csv')
backers_metrics.to_csv('Results/backers_metrics.csv')
collection_metrics.to_csv('Results/collection_metrics.csv')

In [31]:
def get_importance(model, test_inputs, test_outputs):
    importance = permutation_importance(
            model,
            test_inputs,
            test_outputs.values,
            n_repeats=25,
            random_state=42,
            n_jobs=-1
        )
    return importance

def get_importance_df(importance, columns):
    importance_df= pd.DataFrame(
            data=importance.importances_mean,
            index=columns,
            columns=['Importance']
        ).sort_values(by='Importance', ascending=False)
    
    # importance_df.Importance = (importance_df.Importance - importance_df.Importance.min())/ (importance_df.Importance.max() - importance_df.Importance.min())

    return importance_df

In [32]:
models, train_test = get_model_and_train_test(domain_engagement_data, domain_engagement_dir)
success_model, backers_model, collection_model = models
x_train, x_test, y_train, y_test = train_test

de_success_importance = get_importance(success_model, x_test.values, y_test.success)
de_backers_importance = get_importance(backers_model, x_test.values, y_test.num_backers)
de_collection_importance = get_importance(collection_model, x_test.values, y_test.collection_ratio)

de_success_importance_df = get_importance_df(de_success_importance, x_test.columns)
de_backers_importance_df = get_importance_df(de_backers_importance, x_test.columns)
de_collection_importance_df = get_importance_df(de_collection_importance, x_test.columns)

In [38]:
models, train_test = get_model_and_train_test(domain_post_data, domain_post_dir)
success_model, backers_model, collection_model = models
x_train, x_test, y_train, y_test = train_test

dp_success_importance = get_importance(success_model, x_test.values, y_test.success)
dp_backers_importance = get_importance(backers_model, x_test.values, y_test.num_backers)
dp_collection_importance = get_importance(collection_model, x_test.values, y_test.collection_ratio)

dp_success_importance_df = get_importance_df(dp_success_importance, x_test.columns)
dp_backers_importance_df = get_importance_df(dp_backers_importance, x_test.columns)
dp_collection_importance_df = get_importance_df(dp_collection_importance, x_test.columns)

In [40]:
dp_save_dir = 'Results/02 - Domain Post'
de_save_dir = 'Results/03 - Domain Engagement'

dp_success_importance_df.to_csv(f'{dp_save_dir}/success_importance.csv')
dp_backers_importance_df.to_csv(f'{dp_save_dir}/backers_importance.csv')
dp_collection_importance_df.to_csv(f'{dp_save_dir}/collection_importance.csv')

de_success_importance_df.to_csv(f'{de_save_dir}/success_importance.csv')
de_backers_importance_df.to_csv(f'{de_save_dir}/backers_importance.csv')
de_collection_importance_df.to_csv(f'{de_save_dir}/collection_importance.csv')

In [41]:
from Modelling.engagement_utils import *

engagement_model = joblib.load(f'Modelling/final_models/01_Enagement_prediction/HistGradientBoost.pkl')
post_data = pd.read_csv('/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Data/Original_dataset/processed_data.csv')
post_data.drop('original_index', axis=1, inplace=True)
features = [x for x in post_data.columns if x not in target_cols]

features = post_data[features]
targets = post_data[target_cols]

processed_features = pre_process(features)
processed_targets = process_targets(targets)
x_train, x_test, y_train, y_test = train_test_split(processed_features, processed_targets, test_size=0.2, random_state=42)

y_test = decode_targets(y_test)

In [42]:
engagement_importance = get_importance(engagement_model, x_test, y_test)
engagement_importance_df = get_importance_df(engagement_importance, x_test.columns)

In [43]:
engagement_importance_df.to_csv('Results/engagement_importance.csv')