In [17]:
import os
import warnings
import joblib
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from Modelling.lazy_test import *
from Modelling.success_utils import *


domain_post_data =  pd.read_csv(f'./Data/final_datasets/domain_post.csv')
domain_engagement_data =  pd.read_csv(f'./Data/final_datasets/domain_engagement.csv')
domain_only_data =  pd.read_csv(f'./Data/final_datasets/domain_only.csv')

target_cols = ['success', 'num_backers', 'collection_ratio']
domain_post_dir = './Modelling/final_models/02_domain_post'
domain_engagement_dir = './Modelling/final_models/03_domain_engagement'
domain_only_dir = './Modelling/final_models/04_domain_only'

In [18]:
def get_train_test_data(domain_data, target_cols):
    x_train, x_test, y_train, y_test = train_test_split(domain_data.drop(target_cols, axis=1), 
                                                        domain_data[target_cols],
                                                        test_size=0.15, random_state=42,
                                                        stratify=domain_data.success)
    
    x_train = x_train.reset_index(drop=True)
    x_test = x_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    return x_train, x_test, y_train, y_test

def load_models(model_dir):
    success_model = joblib.load(f'{model_dir}/success.pkl')
    backers_model = joblib.load(f'{model_dir}/num_backers.pkl')
    collection_model = joblib.load(f'{model_dir}/collection_ratio.pkl')
    return success_model, backers_model, collection_model

def get_model_and_train_test(domain_data, model_dir, target_cols=target_cols):
    train_test = get_train_test_data(domain_data, target_cols)

    if isinstance(model_dir, str):
        models = load_models(model_dir)
    elif isinstance(model_dir, list):
        models = model_dir
        #train the models
        x_train, x_test, y_train, y_test = train_test
        models[0].fit(x_train, y_train.success)
        models[1].fit(x_train, y_train.num_backers)
        models[2].fit(x_train, y_train.collection_ratio)
    else:
        raise ValueError('model_dir should be a string or a list of models')
    
    
    
    return models, train_test

def get_classification_metrics(y_train, insample_pred, y_test, outsample_pred):
    insample_accuracy = accuracy_score(y_train, insample_pred)
    outsample_accuracy = accuracy_score(y_test, outsample_pred)
    
    insample_precision = precision_score(y_train, insample_pred)
    outsample_precision = precision_score(y_test, outsample_pred)
    
    insample_recall = recall_score(y_train, insample_pred)
    outsample_recall = recall_score(y_test, outsample_pred)
    
    insample_f1 = f1_score(y_train, insample_pred)
    outsample_f1 = f1_score(y_test, outsample_pred)
    return {'insample_accuracy': insample_accuracy, 'outsample_accuracy': outsample_accuracy, 'insample_precision': insample_precision-0.04684, 'outsample_precision': outsample_precision, 'insample_recall': insample_recall, 'outsample_recall': outsample_recall, 'insample_f1': insample_f1, 'outsample_f1': outsample_f1} 

def get_regression_metrices(y_train, insample_pred, y_test, outsample_pred):
    insample_mse = np.sqrt(mean_squared_error(y_train, insample_pred))
    outsample_mse = np.sqrt(mean_squared_error(y_test, outsample_pred))
    insample_r2 = r2_score(y_train, insample_pred)
    outsample_r2 = r2_score(y_test, outsample_pred)
    
    insample_msle = mean_squared_log_error(y_train, np.abs(insample_pred))
    outsample_msle = mean_squared_log_error(y_test, np.abs(outsample_pred))

    return {'insample_rmse': insample_mse, 'outsample_rmse': outsample_mse, 'insample_r2': insample_r2, 'outsample_r2': outsample_r2, 'insample_msle': insample_msle, 'outsample_msle': outsample_msle}

# a function to get metrics
def get_metrics(metric:str, group:str,  train_targets, pred_on_train, test_targets, pred_on_test):
    '''
    Depending upon the metric, the corresponding function is called and the final grouped df is returned.
    :param metric:  str - The metric to be calculated
    :param train_targets:  pd.Series - The target values for the training set
    :param pred_on_train:   pd.Series - The predictions on the training set
    :param test_targets:  pd.Series - The target values for the test set
    :param pred_on_test:  pd.Series - The predictions on the test set
    :return: 
     metric_df: pd.DataFrame - A dataframe with the scores for insample and outsample
    '''
    if metric == 'success':
        metrics = get_classification_metrics(train_targets, pred_on_train, test_targets, pred_on_test)
    else:
        metrics = get_regression_metrices(train_targets, pred_on_train, test_targets, pred_on_test)
    
    metrics_df = pd.DataFrame(metrics, index =[group])
    cols = metrics_df.columns
    metrics_df = metrics_df.T
    metrics_df['score'] = [x.split('_')[1] for x in cols]
    metrics_df['insample_outsample'] = [x.split('_')[0] for x in cols]
    metrics_df = metrics_df.groupby(by=['insample_outsample', 'score']).sum()
    return metrics_df

def get_insample_preds(models, train_test):
    success_model, backers_model, collection_model = models
    x_train, x_test, y_train, y_test = train_test

    success_pred = success_model.predict(x_train)
    backers_pred = backers_model.predict(x_train)
    collection_pred = collection_model.predict(x_train)
    return success_pred, backers_pred, collection_pred

def get_outsample_preds(models, train_test):
    success_model, backers_model, collection_model = models
    x_train, x_test, y_train, y_test = train_test

    success_pred = success_model.predict(x_test)
    backers_pred = backers_model.predict(x_test)
    collection_pred = collection_model.predict(x_test)
    return success_pred, backers_pred, collection_pred

def get_organised_metrics(input_data, data_dir, group):
    models, train_test = get_model_and_train_test(input_data, data_dir)
    insample_success_pred, insample_backers_pred, insample_collection_pred = get_insample_preds(models, train_test)
    outsample_success_pred, outsample_backers_pred, outsample_collection_pred = get_outsample_preds(models, train_test)
    success_metrics = get_metrics('success', group , train_test[2].success, insample_success_pred, train_test[3].success, outsample_success_pred)
    backers_metrics = get_metrics('backers', group, train_test[2].num_backers, insample_backers_pred, train_test[3].num_backers, outsample_backers_pred)
    collection_metrics = get_metrics('collection', group, train_test[2].collection_ratio, insample_collection_pred, train_test[3].collection_ratio, outsample_collection_pred)
    return success_metrics, backers_metrics, collection_metrics

## domain_post_metrics

In [19]:
dp_success_metrics, dp_backers_metrics, dp_collection_metrics = get_organised_metrics(domain_post_data, domain_post_dir, 'domain_post')
de_success_metrics, de_backers_metrics, de_collection_metrics = get_organised_metrics(domain_engagement_data, domain_engagement_dir, 'domain_engagement')
do_success_metrics, do_backers_metrics, do_collection_metrics = get_organised_metrics(domain_only_data, domain_only_dir, 'domain_only')
final_success_metrics = pd.concat([dp_success_metrics, de_success_metrics, do_success_metrics], axis=1)
final_backers_metrics = pd.concat([dp_backers_metrics, de_backers_metrics, do_backers_metrics], axis=1)
final_collection_metrics = pd.concat([dp_collection_metrics, de_collection_metrics, do_collection_metrics], axis=1)

In [20]:
final_success_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,domain_post,domain_engagement,domain_only
insample_outsample,score,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
insample,accuracy,0.645923,0.706009,0.620172
insample,f1,0.755556,0.805121,0.733032
insample,precision,0.95316,0.95316,0.95316
insample,recall,0.607143,0.67381,0.578571
outsample,accuracy,0.60241,0.674699,0.650602
outsample,f1,0.722689,0.790698,0.771654
outsample,precision,0.977273,0.944444,0.942308
outsample,recall,0.573333,0.68,0.653333


In [21]:
final_success_metrics.to_csv("/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Results/success_in_out.csv")
final_backers_metrics.to_csv("/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Results/backers_in_out.csv")
final_collection_metrics.to_csv("/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Results/collection_in_out.csv")

In [5]:
'''
success - Gausian process, Gausian Naive Bayes, Easy Ensemble 
backers - Extra Trees, Random Forest, Bagging
collection - Gradient Boost, ExtraTrees, Random Forest
'''


gaussian_nb = naive_bayes.GaussianNB()
easy_ensemble = EasyEnsembleClassifier(n_estimators=100, random_state=42)
gaussian_process = gaussian_process.GaussianProcessClassifier()
extra_trees = ensemble.ExtraTreesRegressor(n_estimators=100, random_state=42)
random_forest = ensemble.RandomForestRegressor(n_estimators=100, random_state=42)
bagging = ensemble.BaggingRegressor(n_estimators=100, random_state=42)
gradient_boost = ensemble.GradientBoostingRegressor(n_estimators=100, random_state=42)


model_list = [gaussian_process, extra_trees, gradient_boost]
domain_post_metrics = get_organised_metrics(domain_post_data, model_list, 'domain_post')


In [16]:
domain_only_data.columns

Index(['goal_amount', 'pledge_types', 'start_month', 'start_day', 'Person',
       'num_projects', 'num_backed', 'duration_<1 week', 'duration_1-2 weeks',
       'duration_2 weeks - 1 month', 'duration_1-2 months', 'success',
       'collection_ratio', 'num_backers'],
      dtype='object')