In [1]:
import os
import warnings
import joblib
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import ParameterGrid
from joblib import Parallel, delayed 
from sklearn import ensemble, linear_model, svm
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from lazy_test import *
from success_utils import *

# NOTE(review): this silences *every* warning for the whole session, including
# pandas / scikit-learn deprecation and convergence warnings.
warnings.filterwarnings('ignore')
# Never truncate columns or rows when a frame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Map each region name to the location values (country names and two-letter
# codes) that belong to it. Values are compared against the stripped `country`
# column; anything not listed here falls back to "Other" in get_region().
# NOTE(review): several two-letter codes filed outside "North America"
# ("AL", "ME", "DE", "MO", "IN", "WI", "SD", "MA") are also US state
# abbreviations — confirm whether they are meant as country or state codes,
# because only "North America" rows are kept further down.
region_groups = {
    # Duplicate "ND" / "VT" entries removed; membership is unchanged.
    "North America": [
        "HI", "Mexico", "NV", "OK", "NM", "CA", "NY", "MD", "DC", "WA", "CO",
        "MN", "TX", "SC", "GA", "KY", "PA", "MI", "OH", "OR", "NC", "IL", "AZ",
        "TN", "NJ", "UT", "VA", "ND", "VT", "IA", "LA", "CT", "RI",
    ],
    "Europe": ["Martinique", "AL", "Cayman Islands", "Russia", "ME", "UK", "DE", "Poland", "Denmark", "Netherlands", "Norway", "Liechtenstein", "Austria", "Belgium", "Czech Republic", "Iceland", "Ireland", "France", "Spain", "Sweden", "Switzerland", "Italy", "Slovenia", "Greece"],
    "Asia": ["MO", "IN", "Japan", "China", "Hong Kong", "India", "Viet Nam", "Taiwan", "Afghanistan", "Israel", "Sri Lanka", "Jordan"],
    "South America": ["Colombia", "Brazil"],
    "Africa": ["WI", "SD", "MA", "Ghana", "Ethiopia", "Liberia", "Rwanda"],
    "Oceania": ["AU", "NZ"],
    "Other": ["United Arab Emirates", "Svalbard and Jan Mayen"]
}

# Entity labels for which a 0/1 `fb_entity_<LABEL>` indicator column is built.
noticable_entities = ['ORG', 'PERSON', 'DATE',
                        'CARDINAL', 'GPE', 'PRODUCT', 
                        'WORK_OF_ART','ORDINAL', 'MONEY',
                        'TIME', 'NORP']


# Read the merged dataset from disk.
data_path = '/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Data/Kickstarter_only/final_data.csv'
success_data = pd.read_csv(data_path)

# Collection ratio = decoded amount collected / decoded goal amount.
decoded_collected = success_data.collected.apply(decode_amount)
decoded_goal = success_data.goal_amount.apply(decode_amount)
success_data['collection_ratio'] = decoded_collected / decoded_goal

# Bucket `duration` into the intervals (0, 7], (7, 14], (14, 30], (30, 60];
# values outside (0, 60] become NaN.
bins = [0, 7, 14, 30, 60]
labels = ['<1 week', '1-2 weeks', '2 weeks - 1 month', '1-2 months']
success_data['duration'] = pd.cut(success_data.duration, bins, labels=labels)

# Keep only rows whose `donate` flag is not 1, then renumber the index.
not_donation = success_data.donate != 1
success_data = success_data[not_donation].reset_index(drop=True)

# One weight per `success` value: 1 minus that value's share of the rows,
# in value_counts() order (most frequent first).
success_share = success_data.success.value_counts() / len(success_data)
success_weights = (1 - success_share).values


def get_region(country):
    """Return the first region in ``region_groups`` whose list contains ``country``.

    Regions are checked in the dictionary's insertion order. When no region
    lists the value, "Other" is returned.
    """
    matching_regions = (
        region_name
        for region_name, members in region_groups.items()
        if country in members
    )
    return next(matching_regions, "Other")

# Tag every row with its region; surrounding whitespace in `country` is ignored.
stripped_country = success_data.country.str.strip()
success_data['region'] = stripped_country.apply(get_region)

# Backer counts are strings that may contain commas; strip them and cast to int.
success_data['num_backers'] = success_data.num_backers.str.replace(',', '').astype(int)

# Restrict the analysis to North American projects.
in_north_america = success_data.region == 'North America'
success_data = success_data[in_north_america]

# One-hot encode the post type and the bucketed duration.
success_data = pd.get_dummies(success_data, columns=['fb_type', 'duration'])

# One 0/1 indicator per entity label: 1 when the label appears in the row's
# comma-separated `fb_entities_identified` value (missing values count as 'None').
entity_lists = success_data.fb_entities_identified.fillna('None').str.split(',')
for entity in noticable_entities:
    success_data[f'fb_entity_{entity}'] = entity_lists.apply(
        lambda found, wanted=entity: wanted in found
    ).astype(int)

# Convert remaining boolean columns to 0/1 integers.
column_dtypes = success_data.dtypes
bool_cols = column_dtypes[column_dtypes == 'bool'].index.tolist()
success_data[bool_cols] = success_data[bool_cols].astype(int)

# Encoding the number of backers
success_data['num_backers'] = encode_targets(success_data.num_backers)
success_data['population'] = np.exp(success_data.population)


success_data.head()

Unnamed: 0,collected,goal_amount,num_backers,category,pledge_types,donate,project_we_love,success,start_month,start_day,population,country,Person,Team,num_projects,num_backed,verified,fb_page_name,fb_likes_at_posting,fb_followers_at_posting,fb_likes,fb_comments,fb_shares,fb_post_views,fb_readability,fb_readability_lix,fb_entropy_scores,fb_perplexity_scores,fb_fear,fb_anger,fb_anticip,fb_trust,fb_surprise,fb_positive,fb_negative,fb_sadness,fb_disgust,fb_joy,fb_entities_identified,fb_post_sponsored,fb_post_age,fb_page_age,fb_positive_reactions,fb_negative_reactions,fb_topic_0,fb_topic_1,fb_topic_2,fb_topic_3,fb_topic_4,fb_topic_5,fb_topic_6,fb_topic_7,fb_topic_8,fb_topic_9,fb_topic_10,collection_ratio,region,fb_type_link,fb_type_photo,fb_type_video,duration_<1 week,duration_1-2 weeks,duration_2 weeks - 1 month,duration_1-2 months,fb_entity_ORG,fb_entity_PERSON,fb_entity_DATE,fb_entity_CARDINAL,fb_entity_GPE,fb_entity_PRODUCT,fb_entity_WORK_OF_ART,fb_entity_ORDINAL,fb_entity_MONEY,fb_entity_TIME,fb_entity_NORP
2,12.960635,10.819778,6.823286,design,4.0,0.0,1.0,1,6,1,684451.0,WA,0,1,7,16,Marc Barros,Kickstarter,1589901,1588019,10,2,1,12,4.836818,33.727273,0.719996,2.054425,2.4e-05,1.1e-05,3.7e-05,6.2e-05,9.3e-05,1.0,2.9e-05,2e-05,2.8e-05,6.2e-05,PRODUCT,0,2,4839,2,0,6.4e-05,3.6e-05,7.4e-05,0.242741,0.0001,5e-05,0.450027,0.075764,0.08507,3.8e-05,0.116083,8.50672,North America,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
4,11.866416,9.903488,6.954639,comics,8.0,0.0,1.0,1,6,1,105319.0,CA,1,0,3,88,Adam Lawson,Kickstarter,1589930,1588019,4,1,0,5,6.933333,29.111111,0.625279,1.868768,5.7e-05,1.8e-05,7e-05,9.8e-05,9.6e-05,3.2e-05,3.6e-05,1.3e-05,2e-05,7.4e-05,"CARDINAL,PERSON",0,8,4839,0,1,2.6e-05,6e-05,0.939369,8e-05,6.5e-05,3.2e-05,1.1e-05,5e-05,8.2e-05,9.6e-05,1.5e-05,7.12015,North America,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0
5,10.374272,10.308953,5.31812,film_Video,13.0,0.0,1.0,1,6,1,8175133.0,NY,1,0,1,2,Julia Greenberg,Kickstarter,1589715,1587803,3,1,0,4,15.9,61.666667,0.284582,1.329206,0.285714,4.7e-05,5e-05,0.0001,8.8e-05,0.285714,0.142857,9.6e-05,8.2e-05,0.142857,"PERSON,GPE",0,9,4839,0,1,1.9e-05,0.154496,8.4e-05,0.081168,7.9e-05,0.126198,5.2e-05,0.156902,0.294644,0.163853,9.5e-05,1.0675,North America,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0
6,9.542446,8.804775,5.556828,comics,16.0,0.0,1.0,1,6,1,3971883.0,CA,0,1,4,23,Donold Wooldridge,Kickstarter,1589715,1587803,3,5,1,5,14.2395,45.0,0.570874,1.769813,0.2,0.2,7.2e-05,2.8e-05,7.6e-05,4.9e-05,0.4,0.2,7.2e-05,3.6e-05,WORK_OF_ART,0,10,4839,1,1,4.3e-05,2.8e-05,0.162778,9.7e-05,4.5e-05,2.7e-05,0.059113,0.062181,0.06071,0.626484,6e-05,2.091059,North America,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
8,11.346304,9.21034,7.051856,games,27.0,0.0,1.0,1,6,4,621849.0,MD,1,0,5,21,Amanda Coronado,Kickstarter,1589543,1587618,9,3,1,11,3.083867,20.333333,0.466686,1.594701,3.6e-05,5.1e-05,6.8e-05,1e-05,8.3e-05,1.0,1.5e-05,3e-05,6.8e-05,6.7e-05,"PERSON,CARDINAL",0,14,4839,1,1,1.8e-05,1.8e-05,0.073925,0.798958,9.8e-05,4.1e-05,9.3e-05,4.6e-05,2.2e-05,4.2e-05,0.102862,8.4652,North America,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0


In [2]:
# Columns that are not used as model inputs or targets.
dropping_columns = ['collected', 'verified', 'donate', 'project_we_love', 'Team' ,'country', 'fb_page_name', 'region', 'fb_entities_identified', 'population', 'category']

# Drop those columns, keep rows with a collection ratio below 10, renumber the
# index, and store the count-like columns as integers.
success_data = (
    success_data
    .drop(columns=dropping_columns)
    .loc[lambda frame: frame.collection_ratio < 10]
    .reset_index(drop=True)
    .astype({'pledge_types': int, 'num_projects': int, 'num_backed': int})
)


In [3]:
# Identify the boolean features disguised as ints: columns that take exactly two
# distinct values (nunique() ignores NaN by default).
# The `success` target is excluded in the same pass rather than with
# list.remove(), so the cell does not raise ValueError when `success` is not
# among the two-valued columns (e.g. a sample that contains a single outcome).
bool_features = [
    col
    for col in success_data.columns
    if col != 'success' and success_data[col].nunique() == 2
]

In [4]:
# Split the two-valued features into groups by a substring of the column name.
# NOTE(review): `region` is listed in `dropping_columns` and no region dummies are
# created above, so `region_features` looks like it will always be empty — confirm.
entity_features, duration_features, region_features = (
    [name for name in bool_features if marker in name]
    for marker in ('fb_entity', 'duration', 'region')
)


In [5]:
# from sklearn.decomposition import PCA, TruncatedSVD
# def save_embedding_info(loadings, entity_features, name):
#     # Create a dataframe to store the loadings
#     loadings_df = pd.DataFrame(loadings, columns=entity_features)

#     plt.figure(figsize=(20, 10))
#     sns.heatmap(loadings_df, cmap='RdBu', center=0, annot=True)
#     save_path = '/workspaces/Crowdfunding-Social-Media-Drivers/Modelling/02_success_engagement/heatmaps for embeddings'
#     plt.savefig(save_path + f'/{name}.png', bbox_inches='tight')

# def try_svd(df, cols):
#     for i in range(1, len(cols)):
#         svd = TruncatedSVD(n_components=i)
#         svd.fit(df[cols])
#         evr = svd.explained_variance_ratio_.sum()
#         print(f'Explained variance ratio for {i} components: {svd.explained_variance_ratio_.sum()}')
#         if evr > 0.8:
#             return svd, i

# def append_embeddings(embeddings, df, col):
#     for i in range(embeddings.shape[1]):
#         df[f'{col}_embedding_{i}'] = embeddings[:,i]
#     return df

# svd, n_components = try_svd(success_data, entity_features)
# embddings = svd.fit_transform(success_data[entity_features])
# loadings = svd.components_
# save_embedding_info(loadings, entity_features, 'fb_entities')

# success_data = append_embeddings(embddings, success_data, 'fb_entities')

# svd, n_components = try_svd(success_data, region_features)
# embddings = svd.fit_transform(success_data[region_features])
# loadings = svd.components_
# save_embedding_info(loadings, region_features, 'region')

# success_data = append_embeddings(embddings, success_data, 'region')

# svd, n_components = try_svd(success_data, categories)
# embddings = svd.fit_transform(success_data[categories])
# loadings = svd.components_
# save_embedding_info(loadings, categories, 'categories')

# success_data = append_embeddings(embddings, success_data, 'categories')

# success_data.drop(columns=entity_features + region_features + categories, axis=1, inplace=True)

In [6]:
# Exponentiate the backer counts.
# NOTE(review): presumably this undoes the earlier encode_targets() call — confirm.
# Re-running this cell applies exp() again, so run it exactly once per kernel.
success_data.num_backers = np.exp(success_data.num_backers)

# Columns the models try to predict.
target_cols = ['success', 'collection_ratio', 'num_backers']

# Overall emotionality = row-wise sum of the individual emotion scores.
emotion_cols = [
    'fb_fear', 'fb_anger', 'fb_anticip', 'fb_trust', 'fb_surprise',
    'fb_positive', 'fb_negative', 'fb_sadness', 'fb_disgust', 'fb_joy',
]
success_data['fb_emotionality'] = success_data[emotion_cols].sum(axis=1)

# Split the columns into Facebook-derived ("fb" prefix) and everything else.
starts_with_fb = success_data.columns.str.startswith('fb')
fb_cols = success_data.columns[starts_with_fb].tolist()

# Non-Facebook inputs: everything that is neither a target nor `pledge_types`.
common_features = [
    feature
    for feature in success_data.columns[~starts_with_fb].tolist()
    if feature not in target_cols
]
common_features.remove("pledge_types")

# Facebook columns are either engagement counts or post-level features.
engagement_features = [ 'fb_likes', 'fb_comments', 'fb_shares','fb_positive_reactions','fb_negative_reactions']
post_fb_features = list(filter(lambda feature: feature not in engagement_features, fb_cols))


In [7]:
# Three input sets built on the same common features:
# common only, common + engagement counts, and common + post-level features.
domain_page_features = success_data.loc[:, common_features]
domain_engagement_features = success_data.loc[:, [*common_features, *engagement_features]]
domain_post_features = success_data.loc[:, [*common_features, *post_fb_features]]

# Target columns, appended to each input set when the datasets are written out.
targets = success_data.loc[:, target_cols]

In [8]:
# Write each feature set, with the targets appended as the last columns, to its own CSV.
dataset_exports = {
    '/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Data/final_datasets/domain_only.csv': domain_page_features,
    '/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Data/final_datasets/domain_engagement.csv': domain_engagement_features,
    '/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Data/final_datasets/domain_post.csv': domain_post_features,
}
for csv_path, feature_frame in dataset_exports.items():
    pd.concat([feature_frame, targets], axis=1).to_csv(csv_path, index=False)

In [9]:
# Show the non-Facebook input columns shared by all three exported datasets.
print(common_features)

['goal_amount', 'start_month', 'start_day', 'Person', 'num_projects', 'num_backed', 'duration_<1 week', 'duration_1-2 weeks', 'duration_2 weeks - 1 month', 'duration_1-2 months']


## Modelling

In [10]:
# Name of the exported dataset to model on; it is interpolated into the CSV path below.
working_data = 'domain_engagement'
# NOTE(review): the frame is called `domain_post` but holds whichever dataset
# `working_data` selects (here 'domain_engagement') — the name is misleading.
domain_post = pd.read_csv(f'/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Data/final_datasets/{working_data}.csv')

In [11]:
# Preview the first rows of the three target columns in the loaded dataset.
domain_post.loc[:,target_cols].head()

Unnamed: 0,success,collection_ratio,num_backers
0,1,8.50672,919.0
1,1,7.12015,1048.0
2,1,1.0675,204.0
3,1,2.091059,259.0
4,1,8.4652,1155.0


In [12]:
# Hold out 15% of the rows for testing, stratified on `success` so both splits
# keep the same class balance. Every split is converted to a plain NumPy array.
model_inputs = domain_post.drop(target_cols, axis=1)
model_targets = domain_post[target_cols]

X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True).values
    for part in train_test_split(
        model_inputs,
        model_targets,
        test_size=0.15,
        random_state=42,
        stratify=domain_post.success,
    )
)

* `y[:, 0]` => Success (Boolean)
* `y[:, 1]` => Collection Ratio
* `y[:, 2]` => Number of Backers


In [43]:
# Sweep the candidate regressors on y[:, 1] (collection ratio), print the
# not-trained / not-evaluated reports, save the full score table to CSV, and
# display the top-10 table as the cell output.
# NOTE(review): RegressionModels arrives via a star import; the meaning of
# decode=False is not visible in this notebook — confirm against that module.
reg_test = RegressionModels(X_train, X_test, y_train[:,1], y_test[:,1], decode=False)
reg_test.run_evaluation()
print(reg_test.not_trained_)
print(reg_test.not_evaluated_)
reg_test.all_scores.to_csv("/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Results/03 - Domain Engagement/collection_ratio/lazy_scores.csv")
# The attribute name is spelled `top_10_socres` here; presumably it matches the class definition.
reg_test.top_10_socres

Training models: 100%|██████████| 40/40 [00:01<00:00, 23.72it/s] 
Evaluating models: 100%|██████████| 40/40 [00:00<00:00, 119.09it/s]


All models were trained successfully.
All models were evaluated successfully.


Unnamed: 0,Insample RMSE,Insample MAE,Insample R2,Insample EV Score,Outsample RMSE,Outsample MAE,Outsample R2,Outsample EV Score
RandomForestRegressor,0.772,0.53,0.849,0.849,1.732,1.258,0.135,0.139
ExtraTreesRegressor,0.0,0.0,1.0,1.0,1.774,1.318,0.092,0.099
Lasso,1.92,1.301,0.065,0.065,1.776,1.267,0.09,0.094
LassoLars,1.92,1.301,0.065,0.065,1.776,1.267,0.09,0.094
ElasticNet,1.907,1.285,0.078,0.078,1.78,1.257,0.086,0.089
BayesianRidge,1.899,1.275,0.085,0.085,1.787,1.26,0.079,0.083
OrthogonalMatchingPursuitCV,1.936,1.317,0.049,0.049,1.788,1.282,0.078,0.082
OrthogonalMatchingPursuit,1.936,1.317,0.049,0.049,1.788,1.282,0.078,0.082
GradientBoostingRegressor,1.274,0.831,0.588,0.588,1.79,1.224,0.076,0.076
ElasticNetCV,1.883,1.255,0.101,0.101,1.792,1.264,0.074,0.076


In [44]:
# Same regressor sweep as for the collection ratio, now on y[:, 2] (number of
# backers); scores are saved under the num_backers results folder.
# NOTE(review): this rebinds `reg_test`, so the collection-ratio sweep object is no
# longer reachable after this cell runs.
reg_test = RegressionModels(X_train, X_test, y_train[:,2], y_test[:,2], decode=False)
reg_test.run_evaluation()
print(reg_test.not_trained_)
print(reg_test.not_evaluated_)
reg_test.all_scores.to_csv("/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Results/03 - Domain Engagement/num_backers/lazy_scores.csv")
# The attribute name is spelled `top_10_socres` here; presumably it matches the class definition.
reg_test.top_10_socres

Training models: 100%|██████████| 40/40 [00:01<00:00, 20.11it/s]
Evaluating models: 100%|██████████| 40/40 [00:00<00:00, 121.96it/s]

All models were trained successfully.
All models were evaluated successfully.





Unnamed: 0,Insample RMSE,Insample MAE,Insample R2,Insample EV Score,Outsample RMSE,Outsample MAE,Outsample R2,Outsample EV Score
HistGradientBoostingRegressor,1378.422,407.524,0.579,0.579,2342.294,892.457,0.468,0.479
RandomForestRegressor,655.778,265.064,0.905,0.905,2463.053,926.367,0.412,0.412
GradientBoostingRegressor,608.588,354.346,0.918,0.918,2595.596,916.067,0.347,0.349
ExtraTreesRegressor,0.0,0.0,1.0,1.0,2631.128,934.978,0.329,0.33
PoissonRegressor,1894.158,793.913,0.205,0.206,2674.177,1067.603,0.306,0.321
Lars,1904.874,904.566,0.196,0.196,2681.04,1099.324,0.303,0.319
BaggingRegressor,767.005,300.034,0.87,0.87,2700.874,1006.699,0.292,0.294
LinearRegression,1901.557,857.852,0.199,0.199,2717.789,1071.916,0.284,0.3
Ridge,1901.564,857.106,0.199,0.199,2718.484,1071.682,0.283,0.3
LassoLars,1901.6,857.348,0.199,0.199,2718.604,1072.704,0.283,0.3


In [13]:
# Sweep the candidate classifiers on y[:, 0] (success), print the not-trained /
# not-evaluated reports, save the full score table to CSV, and display the
# top-10 table as the cell output.
# NOTE(review): ClassificationModels arrives via a star import; how it ranks the
# "top 10" is not visible in this notebook — confirm against that module.
cls_test = ClassificationModels(X_train, X_test, y_train[:,0], y_test[:,0])
cls_test.run_evaluation()
print(cls_test.not_trained_)
print(cls_test.not_evaluated_)
cls_test.all_scores.to_csv("/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Results/03 - Domain Engagement/success/lazy_scores.csv")
# The attribute name is spelled `top_10_socres` here; presumably it matches the class definition.
cls_test.top_10_socres

Training models: 100%|██████████| 28/28 [00:01<00:00, 20.18it/s]
Evaluating models: 100%|██████████| 27/27 [00:00<00:00, 67.16it/s] 

The modelsNu-Support Vector Machine could not be trained.
The modelsKernel Ridge could not be evaluated.





Unnamed: 0,Insample_Accuracy,Outsample_Accuracy,Insample_Precision,Outsample_Precision,Insample_Recall,Outsample_Recall,Insample_F1 Score,Outsample_F1 Score,Insample_ROC AUC,Outsample_ROC AUC,Insample_Class-0 Acc,Insample_Class-1 Acc,Outsample_Class-0 Acc,Outsample_Class-1 Acc
Gaussian Naive Bayes,0.324034,0.301205,0.981651,1.0,0.254762,0.226667,0.404537,0.369565,0.605642,0.613333,0.956522,0.254762,1.0,0.226667
Easy Ensemble Classifier,0.652361,0.554217,1.0,0.931818,0.614286,0.546667,0.761062,0.689076,0.807143,0.585833,1.0,0.614286,0.625,0.546667
Multinomial Naive Bayes,0.476395,0.554217,0.953608,0.931818,0.440476,0.546667,0.602606,0.689076,0.622412,0.585833,0.804348,0.440476,0.625,0.546667
Balanced Random Forest Classifier,0.712446,0.698795,1.0,0.931034,0.680952,0.72,0.810198,0.81203,0.840476,0.61,1.0,0.680952,0.5,0.72
Balanced Bagging Classifier,0.755365,0.650602,0.996753,0.910714,0.730952,0.68,0.843407,0.778626,0.854607,0.5275,0.978261,0.730952,0.375,0.68
RUSBoost Classifier,0.7897,0.614458,0.949721,0.890909,0.809524,0.653333,0.874036,0.753846,0.70911,0.451667,0.608696,0.809524,0.25,0.653333
Gaussian Process,1.0,0.795181,1.0,0.902778,1.0,0.866667,1.0,0.884354,1.0,0.495833,1.0,1.0,0.125,0.866667
Extra Trees,1.0,0.903614,1.0,0.903614,1.0,1.0,1.0,0.949367,1.0,0.5,1.0,1.0,0.0,1.0
Random Forest,1.0,0.903614,1.0,0.903614,1.0,1.0,1.0,0.949367,1.0,0.5,1.0,1.0,0.0,1.0
Gradient Boosting,0.963519,0.903614,0.961098,0.903614,1.0,1.0,0.980163,0.949367,0.815217,0.5,0.630435,1.0,0.0,1.0


#### Classification Notes
> For the success model, based on the overall F1 scores, the Balanced Random Forest classifier and the Balanced Bagging classifier are more suitable.

## Further fine-tuning of the top-k models

### Collection Ratio

In [16]:
# Least-angle regression on the collection-ratio target (y[:, 1]).
lars = linear_model.Lars(random_state=42).fit(X_train, y_train[:,1])

# Predict on both splits so in-sample and held-out fit can be compared.
in_y_pred, out_y_pred = (lars.predict(split) for split in (X_train, X_test))

print(f'In-sample RMSE, R2: {np.sqrt(mean_squared_error(y_train[:,1], in_y_pred)):.2f}, {r2_score(y_train[:,1], in_y_pred)}')
print(f'Out-sample RMSE, R2: {np.sqrt(mean_squared_error(y_test[:,1], out_y_pred)):.2f}, {r2_score(y_test[:,1], out_y_pred)}')


In-sample RMSE, R2: 1.87, 0.11035640104328315
Out-sample RMSE, R2: 1.84, 0.028657842877498063


In [17]:
# Lasso fitted with the LARS algorithm on the collection-ratio target (y[:, 1]).
lassolars = linear_model.LassoLars(random_state=42).fit(X_train, y_train[:,1])

# Predict on both splits so in-sample and held-out fit can be compared.
in_y_pred, out_y_pred = (lassolars.predict(split) for split in (X_train, X_test))

print(f'In-sample RMSE, R2: {np.sqrt(mean_squared_error(y_train[:,1], in_y_pred)):.2f}, {r2_score(y_train[:,1], in_y_pred)}')
print(f'Out-sample RMSE, R2: {np.sqrt(mean_squared_error(y_test[:,1], out_y_pred)):.2f}, {r2_score(y_test[:,1], out_y_pred)}')

In-sample RMSE, R2: 1.92, 0.06516032447484565
Out-sample RMSE, R2: 1.78, 0.09002356770127506


In [18]:
# Cross-validated LassoLars (50 folds) on the collection-ratio target (y[:, 1]).
lassolars_cv = linear_model.LassoLarsCV(cv=50).fit(X_train, y_train[:,1])

# Predict on both splits so in-sample and held-out fit can be compared.
in_y_pred, out_y_pred = (lassolars_cv.predict(split) for split in (X_train, X_test))

print(f'In-sample RMSE, R2: {np.sqrt(mean_squared_error(y_train[:,1], in_y_pred)):.2f}, {r2_score(y_train[:,1], in_y_pred)}')
print(f'Out-sample RMSE, R2: {np.sqrt(mean_squared_error(y_test[:,1], out_y_pred)):.2f}, {r2_score(y_test[:,1], out_y_pred)}')

In-sample RMSE, R2: 1.88, 0.10184790077617456
Out-sample RMSE, R2: 1.79, 0.07207982444879457


In [19]:
# Plain Lasso (default regularisation strength) on the collection-ratio target (y[:, 1]).
lasso = linear_model.Lasso(random_state=42).fit(X_train, y_train[:,1])

# Predict on both splits so in-sample and held-out fit can be compared.
in_y_pred, out_y_pred = (lasso.predict(split) for split in (X_train, X_test))

print(f'In-sample RMSE, R2: {np.sqrt(mean_squared_error(y_train[:,1], in_y_pred)):.2f}, {r2_score(y_train[:,1], in_y_pred)}')
print(f'Out-sample RMSE, R2: {np.sqrt(mean_squared_error(y_test[:,1], out_y_pred)):.2f}, {r2_score(y_test[:,1], out_y_pred)}')

In-sample RMSE, R2: 1.92, 0.06516037252611695
Out-sample RMSE, R2: 1.78, 0.09002374086652698


In [20]:
# Persist the fitted Lasso model as the collection-ratio model for the domain + engagement feature set.
joblib.dump(lasso, '/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Modelling/final_models/03_domain_engagement/collection_ratio.pkl')

['/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Modelling/final_models/03_domain_engagement/collection_ratio.pkl']

### Number of Backers

In [21]:
# Histogram-based gradient boosting on the number-of-backers target (y[:, 2]).
hist_grad_boost = ensemble.HistGradientBoostingRegressor(random_state=42).fit(X_train, y_train[:,2])

# Predict on both splits so in-sample and held-out fit can be compared.
in_y_pred, out_y_pred = (hist_grad_boost.predict(split) for split in (X_train, X_test))

print(f'In-sample RMSE, R2: {np.sqrt(mean_squared_error(y_train[:,2], in_y_pred)):.2f}, {r2_score(y_train[:,2], in_y_pred)}')
print(f'Out-sample RMSE, R2: {np.sqrt(mean_squared_error(y_test[:,2], out_y_pred)):.2f}, {r2_score(y_test[:,2], out_y_pred)}')

In-sample RMSE, R2: 1378.42, 0.5789368351055881
Out-sample RMSE, R2: 2342.29, 0.4678789564481154


In [22]:
# Poisson regression (default settings) on the number-of-backers target (y[:, 2]).
poisson = linear_model.PoissonRegressor().fit(X_train, y_train[:,2])

# Predict on both splits so in-sample and held-out fit can be compared.
in_y_pred, out_y_pred = (poisson.predict(split) for split in (X_train, X_test))

print(f'In-sample RMSE, R2: {np.sqrt(mean_squared_error(y_train[:,2], in_y_pred)):.2f}, {r2_score(y_train[:,2], in_y_pred)}')
print(f'Out-sample RMSE, R2: {np.sqrt(mean_squared_error(y_test[:,2], out_y_pred)):.2f}, {r2_score(y_test[:,2], out_y_pred)}')

In-sample RMSE, R2: 1894.16, 0.20491147485641825
Out-sample RMSE, R2: 2674.18, 0.30640154548056087


In [23]:
# Persist the fitted histogram gradient-boosting model as the number-of-backers model for the domain + engagement feature set.
joblib.dump(hist_grad_boost, '/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Modelling/final_models/03_domain_engagement/num_backers.pkl')

['/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Modelling/final_models/03_domain_engagement/num_backers.pkl']

### Success

In [25]:
# Easy Ensemble classifier on the success target (y[:, 0]).
easy_ensemble = EasyEnsembleClassifier(random_state=42).fit(X_train, y_train[:,0])

# Predicted labels for the training and the held-out split.
y_pred_in, y_pred_out = (easy_ensemble.predict(split) for split in (X_train, X_test))

def get_class_wise_acc(y_true, y_pred):
    """Return the per-class accuracy (recall) for a binary 0/1 problem.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, coded 0 and 1.
    y_pred : array-like
        Predicted labels, same length and coding as ``y_true``.

    Returns
    -------
    tuple
        ``(class_0_accuracy, class_1_accuracy)``: the fraction of true class-0
        samples predicted as 0, and of true class-1 samples predicted as 1.
        A class that does not occur in ``y_true`` yields ``nan``.

    Notes
    -----
    The labels are pinned to 0 and 1 instead of being inferred from the data.
    Indexing ``[1, 1]`` of a data-inferred confusion matrix raises IndexError
    when only one class is present; this version returns ``nan`` for the
    absent class instead.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    per_class = []
    for label in (0, 1):
        members = y_true == label
        if members.any():
            # Share of this class's samples that were predicted correctly.
            per_class.append((y_pred[members] == label).mean())
        else:
            per_class.append(np.nan)
    return per_class[0], per_class[1]

# Per-class accuracy on the training and the held-out split.
in_class_0_acc, in_class_1_acc = get_class_wise_acc(y_train[:,0], y_pred_in)
out_class_0_acc, out_class_1_acc = get_class_wise_acc(y_test[:,0], y_pred_out)

# Assemble the report first and emit it with a single print; the text written to
# stdout is the same as printing each line separately.
report_lines = [
    f'Overall In-sample Accuracy: {accuracy_score(y_train[:,0], y_pred_in)}',
    f'Overall Out-sample Accuracy: {accuracy_score(y_test[:,0], y_pred_out)} \n',
    f'In-sample Class 0 Accuracy: {in_class_0_acc}',
    f'Out-sample Class 0 Accuracy: {out_class_0_acc} \n',
    f'In-sample Class 1 Accuracy: {in_class_1_acc}',
    f'Out-sample Class 1 Accuracy: {out_class_1_acc}',
]
print('\n'.join(report_lines))

Overall In-sample Accuracy: 0.6523605150214592
Overall Out-sample Accuracy: 0.5542168674698795 

In-sample Class 0 Accuracy: 1.0
Out-sample Class 0 Accuracy: 0.625 

In-sample Class 1 Accuracy: 0.6142857142857143
Out-sample Class 1 Accuracy: 0.5466666666666666


In [27]:
# Forest sizes to compare for the balanced random forest.
n_estimator_list = [75, 100, 500, 1000, 2000]

for n_estimators in n_estimator_list:
    # Fit one forest per size and predict on both splits.
    balanced_rf = BalancedRandomForestClassifier(random_state=42, n_estimators=n_estimators).fit(X_train, y_train[:,0])
    y_pred_in, y_pred_out = (balanced_rf.predict(split) for split in (X_train, X_test))

    in_class_0_acc, in_class_1_acc = get_class_wise_acc(y_train[:,0], y_pred_in)
    out_class_0_acc, out_class_1_acc = get_class_wise_acc(y_test[:,0], y_pred_out)

    # One report per forest size, closed by a separator line.
    report_lines = [
        f'Number of Estimators: {n_estimators}',
        f'Overall In-sample Accuracy: {accuracy_score(y_train[:,0], y_pred_in)}',
        f'Overall Out-sample Accuracy: {accuracy_score(y_test[:,0], y_pred_out)} \n',
        f'In-sample Class 0 Accuracy: {in_class_0_acc}',
        f'Out-sample Class 0 Accuracy: {out_class_0_acc} \n',
        f'In-sample Class 1 Accuracy: {in_class_1_acc}',
        f'Out-sample Class 1 Accuracy: {out_class_1_acc}',
        '-'*10,
    ]
    print('\n'.join(report_lines))

Number of Estimators: 75
Overall In-sample Accuracy: 0.7124463519313304
Overall Out-sample Accuracy: 0.6987951807228916 

In-sample Class 0 Accuracy: 1.0
Out-sample Class 0 Accuracy: 0.5 

In-sample Class 1 Accuracy: 0.680952380952381
Out-sample Class 1 Accuracy: 0.72
----------
Number of Estimators: 100
Overall In-sample Accuracy: 0.7124463519313304
Overall Out-sample Accuracy: 0.6987951807228916 

In-sample Class 0 Accuracy: 1.0
Out-sample Class 0 Accuracy: 0.5 

In-sample Class 1 Accuracy: 0.680952380952381
Out-sample Class 1 Accuracy: 0.72
----------
Number of Estimators: 500
Overall In-sample Accuracy: 0.7060085836909872
Overall Out-sample Accuracy: 0.6746987951807228 

In-sample Class 0 Accuracy: 1.0
Out-sample Class 0 Accuracy: 0.625 

In-sample Class 1 Accuracy: 0.6738095238095239
Out-sample Class 1 Accuracy: 0.68
----------
Number of Estimators: 1000
Overall In-sample Accuracy: 0.6995708154506438
Overall Out-sample Accuracy: 0.6626506024096386 

In-sample Class 0 Accuracy: 1.0

In [28]:
# Refit the balanced random forest with 500 trees.
rf_500 = BalancedRandomForestClassifier(random_state=42, n_estimators=500).fit(X_train, y_train[:,0])

# Predicted labels for the training and the held-out split.
y_pred_in, y_pred_out = (rf_500.predict(split) for split in (X_train, X_test))

in_class_0_acc, in_class_1_acc = get_class_wise_acc(y_train[:,0], y_pred_in)
out_class_0_acc, out_class_1_acc = get_class_wise_acc(y_test[:,0], y_pred_out)

# Assemble the report first and emit it with a single print; the text written to
# stdout is the same as printing each line separately.
report_lines = [
    f'Overall In-sample Accuracy: {accuracy_score(y_train[:,0], y_pred_in)}',
    f'Overall Out-sample Accuracy: {accuracy_score(y_test[:,0], y_pred_out)} \n',
    f'In-sample Class 0 Accuracy: {in_class_0_acc}',
    f'Out-sample Class 0 Accuracy: {out_class_0_acc} \n',
    f'In-sample Class 1 Accuracy: {in_class_1_acc}',
    f'Out-sample Class 1 Accuracy: {out_class_1_acc}',
]
print('\n'.join(report_lines))

Overall In-sample Accuracy: 0.7060085836909872
Overall Out-sample Accuracy: 0.6746987951807228 

In-sample Class 0 Accuracy: 1.0
Out-sample Class 0 Accuracy: 0.625 

In-sample Class 1 Accuracy: 0.6738095238095239
Out-sample Class 1 Accuracy: 0.68


In [29]:
# Persist the 500-tree balanced random forest as the success model for the domain + engagement feature set.
joblib.dump(rf_500, '/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Modelling/final_models/03_domain_engagement/success.pkl')

['/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Modelling/final_models/03_domain_engagement/success.pkl']