In [17]:
import os
import warnings
import joblib
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import ParameterGrid
from joblib import Parallel, delayed 
from sklearn import ensemble, linear_model, svm
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from lazy_test import *
from success_utils import *

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Lookup table: region name -> values of the `country` column that belong to it.
# Every value appears at most once inside a region.
# NOTE(review): several two-letter codes filed outside "North America"
# ("AL", "ME", "DE", "MO", "IN", "WI", "SD", "MA") coincide with US state
# abbreviations. Confirm they are meant as country codes, because the data is
# later restricted to rows whose region is "North America".
region_groups = {
    "North America": [
        "HI", "Mexico", "NV", "OK", "NM", "CA", "NY", "MD", "DC", "WA", "CO",
        "MN", "TX", "SC", "GA", "KY", "PA", "MI", "OH", "OR", "NC", "IL", "AZ",
        "TN", "NJ", "UT", "VA", "ND", "VT", "IA", "LA", "CT", "RI",
    ],
    "Europe": [
        "Martinique", "AL", "Cayman Islands", "Russia", "ME", "UK", "DE",
        "Poland", "Denmark", "Netherlands", "Norway", "Liechtenstein",
        "Austria", "Belgium", "Czech Republic", "Iceland", "Ireland", "France",
        "Spain", "Sweden", "Switzerland", "Italy", "Slovenia", "Greece",
    ],
    "Asia": [
        "MO", "IN", "Japan", "China", "Hong Kong", "India", "Viet Nam",
        "Taiwan", "Afghanistan", "Israel", "Sri Lanka", "Jordan",
    ],
    "South America": ["Colombia", "Brazil"],
    "Africa": ["WI", "SD", "MA", "Ghana", "Ethiopia", "Liberia", "Rwanda"],
    "Oceania": ["AU", "NZ"],
    "Other": ["United Arab Emirates", "Svalbard and Jan Mayen"],
}

# Entity labels that each get their own 0/1 `fb_entity_<LABEL>` column.
noticable_entities = [
    'ORG',
    'PERSON',
    'DATE',
    'CARDINAL',
    'GPE',
    'PRODUCT',
    'WORK_OF_ART',
    'ORDINAL',
    'MONEY',
    'TIME',
    'NORP',
]


# Read the merged project / Facebook-post table.
# NOTE(review): absolute, machine-specific path; the notebook only runs where it exists.
data_path = '/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Data/Kickstarter_only/final_data.csv'
success_data = pd.read_csv(data_path)

# collection_ratio = decoded amount collected / decoded funding goal.
# decode_amount is not defined in this notebook (presumably it comes from one
# of the star imports; TODO confirm what scale it decodes from).
decoded_collected = success_data['collected'].apply(decode_amount)
decoded_goal = success_data['goal_amount'].apply(decode_amount)
success_data['collection_ratio'] = decoded_collected / decoded_goal

# Bucket the campaign duration into four labelled ranges; values that fall
# outside the bin edges become NaN.
bins = [0, 7, 14, 30, 60]
labels = ['<1 week', '1-2 weeks', '2 weeks - 1 month', '1-2 months']
success_data['duration'] = pd.cut(success_data['duration'], bins=bins, labels=labels)

# Discard rows flagged with donate == 1 and renumber the index.
not_donation = success_data['donate'] != 1
success_data = success_data.loc[not_donation].reset_index(drop=True)

# One weight per `success` class: 1 - relative frequency of that class,
# in value_counts() order.
class_share = success_data['success'].value_counts() / len(success_data)
success_weights = (1 - class_share).values


def get_region(country, groups=None):
    """Return the name of the region that lists ``country``.

    Parameters
    ----------
    country : object
        Value to look up (normally a stripped string from the ``country``
        column). Anything that is not listed, including NaN, maps to "Other".
    groups : dict[str, list[str]] | None
        Mapping of region name to its members. Defaults to the module-level
        ``region_groups`` table, which keeps the one-argument call unchanged.

    Returns
    -------
    str
        The first region (in mapping order) whose members contain
        ``country``, or "Other" when none does.
    """
    if groups is None:
        groups = region_groups
    return next(
        (region for region, members in groups.items() if country in members),
        "Other",
    )

# Attach a region to every row, based on the whitespace-stripped country value.
success_data['region'] = success_data['country'].str.strip().map(get_region)

# num_backers is stored as text with comma separators; strip them and cast to int.
success_data['num_backers'] = (
    success_data['num_backers']
    .str.replace(',', '')
    .astype(int)
)

# Restrict the analysis to rows mapped to North America.
in_north_america = success_data['region'] == 'North America'
success_data = success_data.loc[in_north_america]

# One-hot encode the post type and the binned campaign duration.
success_data = pd.get_dummies(success_data, columns=['fb_type', 'duration'])


# Split the comma-separated entity labels once; the result does not depend on
# the entity being tested, so it is computed outside the loop. Missing values
# become the single-item list ['None'], which matches none of the labels below.
entity_lists = success_data['fb_entities_identified'].fillna('None').str.split(',')

# One 0/1 column per entity label: 1 when the post's entity list contains it.
for entity in noticable_entities:
    success_data[f'fb_entity_{entity}'] = (
        entity_lists.apply(lambda entity_list: entity in entity_list).astype(int)
    )

# Cast every boolean column to 0/1 integers.
bool_cols = success_data.select_dtypes(include='bool').columns.tolist()
success_data[bool_cols] = success_data[bool_cols].astype(int)

# Encoding the number of backers.
# encode_targets is not defined in this notebook (presumably a log-style
# transform, since a later cell applies np.exp to this column; TODO confirm).
success_data['num_backers'] = encode_targets(success_data['num_backers'])
# NOTE(review): population is exponentiated in place, so it is presumably
# stored on a log scale in the CSV; confirm.
success_data['population'] = np.exp(success_data['population'])


success_data.head()

Unnamed: 0,collected,goal_amount,num_backers,category,pledge_types,donate,project_we_love,success,start_month,start_day,population,country,Person,Team,num_projects,num_backed,verified,fb_page_name,fb_likes_at_posting,fb_followers_at_posting,fb_likes,fb_comments,fb_shares,fb_post_views,fb_readability,fb_readability_lix,fb_entropy_scores,fb_perplexity_scores,fb_fear,fb_anger,fb_anticip,fb_trust,fb_surprise,fb_positive,fb_negative,fb_sadness,fb_disgust,fb_joy,fb_entities_identified,fb_post_sponsored,fb_post_age,fb_page_age,fb_positive_reactions,fb_negative_reactions,fb_topic_0,fb_topic_1,fb_topic_2,fb_topic_3,fb_topic_4,fb_topic_5,fb_topic_6,fb_topic_7,fb_topic_8,fb_topic_9,fb_topic_10,collection_ratio,region,fb_type_link,fb_type_photo,fb_type_video,duration_<1 week,duration_1-2 weeks,duration_2 weeks - 1 month,duration_1-2 months,fb_entity_ORG,fb_entity_PERSON,fb_entity_DATE,fb_entity_CARDINAL,fb_entity_GPE,fb_entity_PRODUCT,fb_entity_WORK_OF_ART,fb_entity_ORDINAL,fb_entity_MONEY,fb_entity_TIME,fb_entity_NORP
2,12.960635,10.819778,6.823286,design,4.0,0.0,1.0,1,6,1,684451.0,WA,0,1,7,16,Marc Barros,Kickstarter,1589901,1588019,10,2,1,12,4.836818,33.727273,0.719996,2.054425,2.4e-05,1.1e-05,3.7e-05,6.2e-05,9.3e-05,1.0,2.9e-05,2e-05,2.8e-05,6.2e-05,PRODUCT,0,2,4839,2,0,6.4e-05,3.6e-05,7.4e-05,0.242741,0.0001,5e-05,0.450027,0.075764,0.08507,3.8e-05,0.116083,8.50672,North America,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
4,11.866416,9.903488,6.954639,comics,8.0,0.0,1.0,1,6,1,105319.0,CA,1,0,3,88,Adam Lawson,Kickstarter,1589930,1588019,4,1,0,5,6.933333,29.111111,0.625279,1.868768,5.7e-05,1.8e-05,7e-05,9.8e-05,9.6e-05,3.2e-05,3.6e-05,1.3e-05,2e-05,7.4e-05,"CARDINAL,PERSON",0,8,4839,0,1,2.6e-05,6e-05,0.939369,8e-05,6.5e-05,3.2e-05,1.1e-05,5e-05,8.2e-05,9.6e-05,1.5e-05,7.12015,North America,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0
5,10.374272,10.308953,5.31812,film_Video,13.0,0.0,1.0,1,6,1,8175133.0,NY,1,0,1,2,Julia Greenberg,Kickstarter,1589715,1587803,3,1,0,4,15.9,61.666667,0.284582,1.329206,0.285714,4.7e-05,5e-05,0.0001,8.8e-05,0.285714,0.142857,9.6e-05,8.2e-05,0.142857,"PERSON,GPE",0,9,4839,0,1,1.9e-05,0.154496,8.4e-05,0.081168,7.9e-05,0.126198,5.2e-05,0.156902,0.294644,0.163853,9.5e-05,1.0675,North America,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0
6,9.542446,8.804775,5.556828,comics,16.0,0.0,1.0,1,6,1,3971883.0,CA,0,1,4,23,Donold Wooldridge,Kickstarter,1589715,1587803,3,5,1,5,14.2395,45.0,0.570874,1.769813,0.2,0.2,7.2e-05,2.8e-05,7.6e-05,4.9e-05,0.4,0.2,7.2e-05,3.6e-05,WORK_OF_ART,0,10,4839,1,1,4.3e-05,2.8e-05,0.162778,9.7e-05,4.5e-05,2.7e-05,0.059113,0.062181,0.06071,0.626484,6e-05,2.091059,North America,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
8,11.346304,9.21034,7.051856,games,27.0,0.0,1.0,1,6,4,621849.0,MD,1,0,5,21,Amanda Coronado,Kickstarter,1589543,1587618,9,3,1,11,3.083867,20.333333,0.466686,1.594701,3.6e-05,5.1e-05,6.8e-05,1e-05,8.3e-05,1.0,1.5e-05,3e-05,6.8e-05,6.7e-05,"PERSON,CARDINAL",0,14,4839,1,1,1.8e-05,1.8e-05,0.073925,0.798958,9.8e-05,4.1e-05,9.3e-05,4.6e-05,2.2e-05,4.2e-05,0.102862,8.4652,North America,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0


In [18]:
# Columns left out of the modelling table.
dropping_columns = [
    'collected', 'verified', 'donate', 'project_we_love', 'Team',
    'country', 'fb_page_name', 'region', 'fb_entities_identified',
    'population', 'category',
]

# Drop those columns, keep rows with collection_ratio below 10, renumber the
# index, and store the three count columns as integers.
success_data = (
    success_data
    .drop(columns=dropping_columns)
    .loc[lambda frame: frame['collection_ratio'] < 10]
    .reset_index(drop=True)
    .astype({'pledge_types': int, 'num_projects': int, 'num_backed': int})
)


In [19]:
# Identify and list all the boolean features disguised as int in the dataset.
# A column counts as boolean when it holds exactly two distinct values.
# The `success` target is excluded by name, so the cell still runs when
# `success` is not two-valued (list.remove would raise ValueError then).
bool_features = [
    col
    for col in success_data.columns
    if col != 'success' and success_data[col].nunique() == 2
]

In [20]:
# Group the two-valued features by the substring found in their column name.
entity_features = list(filter(lambda name: 'fb_entity' in name, bool_features))
duration_features = list(filter(lambda name: 'duration' in name, bool_features))
region_features = list(filter(lambda name: 'region' in name, bool_features))


In [21]:
# from sklearn.decomposition import PCA, TruncatedSVD
# def save_embedding_info(loadings, entity_features, name):
#     # Create a dataframe to store the loadings
#     loadings_df = pd.DataFrame(loadings, columns=entity_features)

#     plt.figure(figsize=(20, 10))
#     sns.heatmap(loadings_df, cmap='RdBu', center=0, annot=True)
#     save_path = '/workspaces/Crowdfunding-Social-Media-Drivers/Modelling/02_success_engagement/heatmaps for embeddings'
#     plt.savefig(save_path + f'/{name}.png', bbox_inches='tight')

# def try_svd(df, cols):
#     for i in range(1, len(cols)):
#         svd = TruncatedSVD(n_components=i)
#         svd.fit(df[cols])
#         evr = svd.explained_variance_ratio_.sum()
#         print(f'Explained variance ratio for {i} components: {svd.explained_variance_ratio_.sum()}')
#         if evr > 0.8:
#             return svd, i

# def append_embeddings(embeddings, df, col):
#     for i in range(embeddings.shape[1]):
#         df[f'{col}_embedding_{i}'] = embeddings[:,i]
#     return df

# svd, n_components = try_svd(success_data, entity_features)
# embddings = svd.fit_transform(success_data[entity_features])
# loadings = svd.components_
# save_embedding_info(loadings, entity_features, 'fb_entities')

# success_data = append_embeddings(embddings, success_data, 'fb_entities')

# svd, n_components = try_svd(success_data, region_features)
# embddings = svd.fit_transform(success_data[region_features])
# loadings = svd.components_
# save_embedding_info(loadings, region_features, 'region')

# success_data = append_embeddings(embddings, success_data, 'region')

# svd, n_components = try_svd(success_data, categories)
# embddings = svd.fit_transform(success_data[categories])
# loadings = svd.components_
# save_embedding_info(loadings, categories, 'categories')

# success_data = append_embeddings(embddings, success_data, 'categories')

# success_data.drop(columns=entity_features + region_features + categories, axis=1, inplace=True)

In [22]:
# Exponentiate num_backers in place.
# NOTE(review): running this cell twice applies np.exp twice.
success_data['num_backers'] = np.exp(success_data['num_backers'])

# Columns that are predicted rather than used as inputs.
target_cols = ['success', 'collection_ratio', 'num_backers']

# fb_emotionality = row-wise sum of the ten emotion / sentiment scores.
emotion_cols = [
    'fb_fear', 'fb_anger', 'fb_anticip', 'fb_trust', 'fb_surprise',
    'fb_positive', 'fb_negative', 'fb_sadness', 'fb_disgust', 'fb_joy',
]
success_data['fb_emotionality'] = success_data[emotion_cols].sum(axis=1)

# Separate Facebook-derived columns (prefix 'fb') from the rest; the
# remaining non-target columns are the shared "common" features.
is_fb_column = success_data.columns.str.startswith('fb')
fb_cols = success_data.columns[is_fb_column].tolist()
common_features = [
    feature
    for feature in success_data.columns[~is_fb_column].tolist()
    if feature not in target_cols
]

# Within the Facebook columns, set the engagement counts apart from all others.
engagement_features = [
    'fb_likes', 'fb_comments', 'fb_shares',
    'fb_positive_reactions', 'fb_negative_reactions',
]
post_fb_features = [
    feature for feature in fb_cols if feature not in engagement_features
]


In [23]:
# Three input tables: common features alone, plus engagement counts, plus the
# remaining Facebook columns.
domain_page_features = success_data.loc[:, common_features]
domain_engagement_features = success_data.loc[:, [*common_features, *engagement_features]]
domain_post_features = success_data.loc[:, [*common_features, *post_fb_features]]

# The prediction targets, shared by all three tables.
targets = success_data.loc[:, target_cols]

In [24]:
# Write each feature table, with the targets appended as extra columns, to its
# own CSV file (row index omitted).
final_datasets_dir = '/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Data/final_datasets'
datasets_to_export = {
    'domain_only': domain_page_features,
    'domain_engagement': domain_engagement_features,
    'domain_post': domain_post_features,
}
for dataset_name, dataset_features in datasets_to_export.items():
    dataset_with_targets = pd.concat([dataset_features, targets], axis=1)
    dataset_with_targets.to_csv(f'{final_datasets_dir}/{dataset_name}.csv', index=False)

In [25]:
# Show the names of the non-Facebook, non-target columns used as common features.
print(common_features)

['goal_amount', 'pledge_types', 'start_month', 'start_day', 'Person', 'num_projects', 'num_backed', 'duration_<1 week', 'duration_1-2 weeks', 'duration_2 weeks - 1 month', 'duration_1-2 months']


## Modelling

In [26]:
# Name of the exported dataset to model in the rest of the notebook.
working_data = 'domain_only'

# NOTE(review): the frame keeps the name `domain_post` although the file that
# is read is selected by `working_data`.
working_data_csv = f'/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Data/final_datasets/{working_data}.csv'
domain_post = pd.read_csv(working_data_csv)

In [27]:
# Preview the first rows of the target columns.
domain_post[target_cols].head()

Unnamed: 0,success,collection_ratio,num_backers
0,1,8.50672,919.0
1,1,7.12015,1048.0
2,1,1.0675,204.0
3,1,2.091059,259.0
4,1,8.4652,1155.0


In [28]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(domain_post.drop(target_cols, axis=1), domain_post[target_cols], test_size=0.15, random_state=42, stratify=domain_post.success)

X_train = X_train.reset_index(drop=True).values
X_test = X_test.reset_index(drop=True).values
y_train = y_train.reset_index(drop=True).values
y_test = y_test.reset_index(drop=True).values

* y[:, 0] => Success (Boolean)
* y[:,1] => Collection Ratio
* y[:,2] => Number of Backers


In [29]:
# Model sweep for the collection-ratio target (column 1 of y).
# RegressionModels is not defined in this notebook (presumably from the
# lazy_test star import; TODO confirm). `top_10_socres` is spelled as the
# helper exposes it.
collection_ratio_scores_csv = "/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Results/04 - Domain Only/collection_ratio/lazy_scores.csv"

reg_test = RegressionModels(
    X_train,
    X_test,
    y_train[:, 1],
    y_test[:, 1],
    decode=False,
)
reg_test.run_evaluation()

# Report which models failed to train or to evaluate.
print(reg_test.not_trained_)
print(reg_test.not_evaluated_)

# Save the score table, then display it as the cell output.
reg_test.top_10_socres.to_csv(collection_ratio_scores_csv)
reg_test.top_10_socres

Training models: 100%|██████████| 40/40 [00:01<00:00, 27.69it/s]
Evaluating models: 100%|██████████| 40/40 [00:00<00:00, 119.26it/s]

All models were trained successfully.
All models were evaluated successfully.





Unnamed: 0,Insample RMSE,Insample MAE,Insample R2,Insample EV Score,Outsample RMSE,Outsample MAE,Outsample R2,Outsample EV Score
GradientBoostingRegressor,1.318,0.844,0.559,0.559,1.748,1.234,0.119,0.125
RandomForestRegressor,0.751,0.511,0.857,0.858,1.76,1.237,0.107,0.108
ElasticNet,1.913,1.291,0.071,0.071,1.764,1.246,0.103,0.107
LarsCV,1.907,1.284,0.078,0.078,1.767,1.244,0.1,0.103
LassoLarsCV,1.907,1.284,0.078,0.078,1.767,1.244,0.1,0.103
BayesianRidge,1.907,1.285,0.077,0.077,1.767,1.247,0.099,0.103
LassoCV,1.907,1.283,0.078,0.078,1.767,1.244,0.099,0.102
ElasticNetCV,1.906,1.283,0.078,0.078,1.767,1.244,0.099,0.102
LassoLarsIC,1.901,1.277,0.083,0.083,1.77,1.251,0.096,0.099
Lasso,1.922,1.304,0.063,0.063,1.772,1.262,0.095,0.099


In [30]:
# Model sweep for the number-of-backers target (column 2 of y).
# RegressionModels is not defined in this notebook (presumably from the
# lazy_test star import; TODO confirm).
num_backers_scores_csv = "/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Results/04 - Domain Only/num_backers/lazy_scores.csv"

reg_test = RegressionModels(
    X_train,
    X_test,
    y_train[:, 2],
    y_test[:, 2],
    decode=False,
)
reg_test.run_evaluation()

# Report which models failed to train or to evaluate.
print(reg_test.not_trained_)
print(reg_test.not_evaluated_)

# Save the score table, then display it as the cell output.
reg_test.top_10_socres.to_csv(num_backers_scores_csv)
reg_test.top_10_socres

Training models: 100%|██████████| 40/40 [00:01<00:00, 30.40it/s]
Evaluating models: 100%|██████████| 40/40 [00:00<00:00, 154.60it/s]

All models were trained successfully.
All models were evaluated successfully.





Unnamed: 0,Insample RMSE,Insample MAE,Insample R2,Insample EV Score,Outsample RMSE,Outsample MAE,Outsample R2,Outsample EV Score
GradientBoostingRegressor,619.463,354.816,0.915,0.915,2219.286,920.429,0.522,0.526
PoissonRegressor,1353.699,626.111,0.594,0.594,2298.408,875.56,0.488,0.492
RandomForestRegressor,642.572,254.58,0.908,0.909,2513.371,969.24,0.387,0.388
XGBRegressor,38.815,23.148,1.0,1.0,2538.856,1006.544,0.375,0.377
DecisionTreeRegressor,0.0,0.0,1.0,1.0,2675.318,900.566,0.306,0.306
HistGradientBoostingRegressor,1540.831,478.765,0.474,0.474,2686.676,1033.788,0.3,0.306
LinearRegression,1902.418,866.29,0.198,0.198,2738.447,1084.321,0.273,0.288
Lars,1902.418,866.29,0.198,0.198,2738.447,1084.321,0.273,0.288
Ridge,1902.423,865.483,0.198,0.198,2739.211,1084.123,0.272,0.287
LassoLars,1902.451,865.631,0.198,0.198,2739.344,1084.908,0.272,0.287


In [31]:
# Model sweep for the binary success target (column 0 of y).
# ClassificationModels is not defined in this notebook (presumably from the
# lazy_test star import; TODO confirm).
success_scores_csv = "/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Results/04 - Domain Only/success/lazy_scores.csv"

cls_test = ClassificationModels(
    X_train,
    X_test,
    y_train[:, 0],
    y_test[:, 0],
)
cls_test.run_evaluation()

# Report which models failed to train or to evaluate.
print(cls_test.not_trained_)
print(cls_test.not_evaluated_)

# Save the score table, then display it as the cell output.
cls_test.top_10_socres.to_csv(success_scores_csv)
cls_test.top_10_socres

Training models: 100%|██████████| 28/28 [00:01<00:00, 22.85it/s] 
Evaluating models: 100%|██████████| 27/27 [00:00<00:00, 142.83it/s]

The modelsNu-Support Vector Machine could not be trained.
The modelsKernel Ridge could not be evaluated.





Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC,Class-0 Acc,Class-1 Acc
Gaussian Naive Bayes,0.301205,1.0,0.226667,0.369565,0.613333,1.0,0.226667
Balanced Random Forest Classifier,0.650602,0.942308,0.653333,0.771654,0.639167,0.625,0.653333
Multinomial Naive Bayes,0.53012,0.928571,0.52,0.666667,0.5725,0.625,0.52
Easy Ensemble Classifier,0.554217,0.895833,0.573333,0.699187,0.474167,0.375,0.573333
Balanced Bagging Classifier,0.698795,0.903226,0.746667,0.817518,0.498333,0.25,0.746667
RUSBoost Classifier,0.686747,0.888889,0.746667,0.811594,0.435833,0.125,0.746667
Gaussian Process,0.86747,0.910256,0.946667,0.928105,0.535833,0.125,0.946667
XGBoost,0.86747,0.9,0.96,0.929032,0.48,0.0,0.96
Dummy Classifier,0.903614,0.903614,1.0,0.949367,0.5,0.0,1.0
Neural Network,0.903614,0.903614,1.0,0.949367,0.5,0.0,1.0


#### Classification Notes
> For the success model, based on the overall F1 scores, the Balanced Random Forest Classifier and the Balanced Bagging Classifier are more suitable.

## Further fine-tuning of the top-k models

### Collection Ratio

In [32]:
# ElasticNet with only the seed set, fitted on the collection-ratio target.
elastic_net = linear_model.ElasticNet(random_state=42)
elastic_net.fit(X_train, y_train[:, 1])

in_y_pred, out_y_pred = elastic_net.predict(X_train), elastic_net.predict(X_test)

# Print RMSE and R2 for the training split, then for the held-out split.
for split_name, y_actual, y_hat in (
    ('In-sample', y_train[:, 1], in_y_pred),
    ('Out-sample', y_test[:, 1], out_y_pred),
):
    rmse = np.sqrt(mean_squared_error(y_actual, y_hat))
    print(f'{split_name} RMSE, R2: {rmse:.2f}, {r2_score(y_actual, y_hat)}')

In-sample RMSE, R2: 1.91, 0.07130115546391358
Out-sample RMSE, R2: 1.76, 0.1028480223875966


In [33]:
# Least-angle regression with only the seed set, fitted on the collection-ratio target.
lars = linear_model.Lars(random_state=42)
lars.fit(X_train, y_train[:, 1])

in_y_pred, out_y_pred = lars.predict(X_train), lars.predict(X_test)

# Print RMSE and R2 for the training split, then for the held-out split.
for split_name, y_actual, y_hat in (
    ('In-sample', y_train[:, 1], in_y_pred),
    ('Out-sample', y_test[:, 1], out_y_pred),
):
    rmse = np.sqrt(mean_squared_error(y_actual, y_hat))
    print(f'{split_name} RMSE, R2: {rmse:.2f}, {r2_score(y_actual, y_hat)}')


In-sample RMSE, R2: 1.89, 0.09140862442410158
Out-sample RMSE, R2: 1.80, 0.0659657930083386


In [34]:
# LassoLars with only the seed set, fitted on the collection-ratio target.
lassolars = linear_model.LassoLars(random_state=42)
lassolars.fit(X_train, y_train[:, 1])

in_y_pred, out_y_pred = lassolars.predict(X_train), lassolars.predict(X_test)

# Print RMSE and R2 for the training split, then for the held-out split.
for split_name, y_actual, y_hat in (
    ('In-sample', y_train[:, 1], in_y_pred),
    ('Out-sample', y_test[:, 1], out_y_pred),
):
    rmse = np.sqrt(mean_squared_error(y_actual, y_hat))
    print(f'{split_name} RMSE, R2: {rmse:.2f}, {r2_score(y_actual, y_hat)}')

In-sample RMSE, R2: 1.92, 0.06338871105038024
Out-sample RMSE, R2: 1.77, 0.09455600120231111


In [35]:
# Cross-validated LassoLars (cv=50), fitted on the collection-ratio target.
lassolars_cv = linear_model.LassoLarsCV(cv=50)
lassolars_cv.fit(X_train, y_train[:, 1])

in_y_pred, out_y_pred = lassolars_cv.predict(X_train), lassolars_cv.predict(X_test)

# Print RMSE and R2 for the training split, then for the held-out split.
for split_name, y_actual, y_hat in (
    ('In-sample', y_train[:, 1], in_y_pred),
    ('Out-sample', y_test[:, 1], out_y_pred),
):
    rmse = np.sqrt(mean_squared_error(y_actual, y_hat))
    print(f'{split_name} RMSE, R2: {rmse:.2f}, {r2_score(y_actual, y_hat)}')

In-sample RMSE, R2: 1.91, 0.07828831308996498
Out-sample RMSE, R2: 1.77, 0.09851191120572766


In [36]:
# Lasso with only the seed set, fitted on the collection-ratio target.
lasso = linear_model.Lasso(random_state=42)
lasso.fit(X_train, y_train[:, 1])

in_y_pred, out_y_pred = lasso.predict(X_train), lasso.predict(X_test)

# Print RMSE and R2 for the training split, then for the held-out split.
for split_name, y_actual, y_hat in (
    ('In-sample', y_train[:, 1], in_y_pred),
    ('Out-sample', y_test[:, 1], out_y_pred),
):
    rmse = np.sqrt(mean_squared_error(y_actual, y_hat))
    print(f'{split_name} RMSE, R2: {rmse:.2f}, {r2_score(y_actual, y_hat)}')

In-sample RMSE, R2: 1.92, 0.06338877663299414
Out-sample RMSE, R2: 1.77, 0.09455612690551085


In [37]:
# Persist the fitted ElasticNet as the collection-ratio model for the domain-only feature set.
joblib.dump(elastic_net, '/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Modelling/final_models/04_domain_only/collection_ratio.pkl')

['/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Modelling/final_models/04_domain_only/collection_ratio.pkl']

### Number of Backers

In [38]:
# Poisson regressor with default settings, fitted on the number-of-backers target.
poisson = linear_model.PoissonRegressor()
poisson.fit(X_train, y_train[:, 2])

in_y_pred, out_y_pred = poisson.predict(X_train), poisson.predict(X_test)

# Print RMSE and R2 for the training split, then for the held-out split.
for split_name, y_actual, y_hat in (
    ('In-sample', y_train[:, 2], in_y_pred),
    ('Out-sample', y_test[:, 2], out_y_pred),
):
    rmse = np.sqrt(mean_squared_error(y_actual, y_hat))
    print(f'{split_name} RMSE, R2: {rmse:.2f}, {r2_score(y_actual, y_hat)}')

In-sample RMSE, R2: 1353.70, 0.5939054389935573
Out-sample RMSE, R2: 2298.41, 0.48763229033805033


In [39]:
# Persist the fitted Poisson regressor as the number-of-backers model.
# NOTE(review): the Poisson-criterion random forest fitted in the next cell
# reports a slightly higher out-of-sample R2 (about 0.509 vs 0.488); confirm
# that the Poisson regressor is the intended final artefact.
joblib.dump(poisson, '/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Modelling/final_models/04_domain_only/num_backers.pkl')

['/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Modelling/final_models/04_domain_only/num_backers.pkl']

In [40]:
# Random forest with the Poisson split criterion, fitted on the number-of-backers target.
rf_poisson = ensemble.RandomForestRegressor(random_state=42, criterion='poisson')
rf_poisson.fit(X_train, y_train[:, 2])

in_y_pred, out_y_pred = rf_poisson.predict(X_train), rf_poisson.predict(X_test)

# Print RMSE and R2 for the training split, then for the held-out split.
for split_name, y_actual, y_hat in (
    ('In-sample', y_train[:, 2], in_y_pred),
    ('Out-sample', y_test[:, 2], out_y_pred),
):
    rmse = np.sqrt(mean_squared_error(y_actual, y_hat))
    print(f'{split_name} RMSE, R2: {rmse:.2f}, {r2_score(y_actual, y_hat)}')

In-sample RMSE, R2: 640.81, 0.9089997831183414
Out-sample RMSE, R2: 2249.79, 0.5090788065713889


### Success

In [42]:
# EasyEnsembleClassifier is not imported by name in this notebook (presumably
# it arrives through one of the star imports; TODO confirm).
easy_ensemble = EasyEnsembleClassifier(random_state=42)
easy_ensemble.fit(X_train, y_train[:, 0])

# Predicted success labels for the training and the held-out rows.
y_pred_in, y_pred_out = (
    easy_ensemble.predict(X_train),
    easy_ensemble.predict(X_test),
)

def get_class_wise_acc(y_true, y_pred, labels=(0, 1)):
    """Return the per-class accuracy for the two classes of a binary problem.

    The accuracy of a class is the share of samples whose true label is that
    class and that were also predicted as that class.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    labels : tuple
        The two class labels, in the order the accuracies are returned.
        Defaults to ``(0, 1)``.

    Returns
    -------
    tuple
        ``(accuracy of labels[0], accuracy of labels[1])``. An entry is NaN
        when ``y_true`` contains no sample of that class, so inputs that hold
        a single class no longer raise an IndexError.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    def _class_accuracy(label):
        # Restrict to the samples that truly belong to `label`.
        in_class = y_true == label
        if not in_class.any():
            return np.nan
        return np.mean(y_pred[in_class] == label)

    first_label, second_label = labels
    return _class_accuracy(first_label), _class_accuracy(second_label)

# Per-class accuracy on the training and on the held-out predictions.
(in_class_0_acc, in_class_1_acc), (out_class_0_acc, out_class_1_acc) = (
    get_class_wise_acc(y_train[:, 0], y_pred_in),
    get_class_wise_acc(y_test[:, 0], y_pred_out),
)

overall_in_acc = accuracy_score(y_train[:, 0], y_pred_in)
overall_out_acc = accuracy_score(y_test[:, 0], y_pred_out)

# Emit the report as one block; the lines ending in ' \n' leave a blank line after them.
report_lines = [
    f'Overall In-sample Accuracy: {overall_in_acc}',
    f'Overall Out-sample Accuracy: {overall_out_acc} \n',
    f'In-sample Class 0 Accuracy: {in_class_0_acc}',
    f'Out-sample Class 0 Accuracy: {out_class_0_acc} \n',
    f'Out-sample Class 1 Accuracy: {out_class_1_acc}',
    f'In-sample Class 1 Accuracy: {in_class_1_acc}',
]
print('\n'.join(report_lines))


Overall In-sample Accuracy: 0.6394849785407726
Overall Out-sample Accuracy: 0.5542168674698795 

In-sample Class 0 Accuracy: 0.9782608695652174
Out-sample Class 0 Accuracy: 0.375 

Out-sample Class 1 Accuracy: 0.5733333333333334
In-sample Class 1 Accuracy: 0.6023809523809524


In [43]:
# Forest sizes to compare for the balanced random forest.
n_estimator_list = [75, 100, 500, 1000, 2000]

for n_estimators in n_estimator_list:
    # BalancedRandomForestClassifier is not imported by name in this notebook
    # (presumably it arrives through one of the star imports; TODO confirm).
    balanced_rf = BalancedRandomForestClassifier(random_state=42, n_estimators=n_estimators)
    balanced_rf.fit(X_train, y_train[:, 0])

    y_pred_in, y_pred_out = balanced_rf.predict(X_train), balanced_rf.predict(X_test)

    (in_class_0_acc, in_class_1_acc), (out_class_0_acc, out_class_1_acc) = (
        get_class_wise_acc(y_train[:, 0], y_pred_in),
        get_class_wise_acc(y_test[:, 0], y_pred_out),
    )
    overall_in_acc = accuracy_score(y_train[:, 0], y_pred_in)
    overall_out_acc = accuracy_score(y_test[:, 0], y_pred_out)

    # One report per forest size, closed by a dashed separator line.
    report_lines = [
        f'Number of Estimators: {n_estimators}',
        f'Overall In-sample Accuracy: {overall_in_acc}',
        f'Overall Out-sample Accuracy: {overall_out_acc} \n',
        f'In-sample Class 0 Accuracy: {in_class_0_acc}',
        f'Out-sample Class 0 Accuracy: {out_class_0_acc} \n',
        f'Out-sample Class 1 Accuracy: {out_class_1_acc}',
        f'In-sample Class 1 Accuracy: {in_class_1_acc}',
        '-' * 10,
    ]
    print('\n'.join(report_lines))

Number of Estimators: 75
Overall In-sample Accuracy: 0.7081545064377682
Overall Out-sample Accuracy: 0.6746987951807228 

In-sample Class 0 Accuracy: 1.0
Out-sample Class 0 Accuracy: 0.625 

Out-sample Class 1 Accuracy: 0.68
In-sample Class 1 Accuracy: 0.6761904761904762
----------
Number of Estimators: 100
Overall In-sample Accuracy: 0.6888412017167382
Overall Out-sample Accuracy: 0.6506024096385542 

In-sample Class 0 Accuracy: 1.0
Out-sample Class 0 Accuracy: 0.625 

Out-sample Class 1 Accuracy: 0.6533333333333333
In-sample Class 1 Accuracy: 0.6547619047619048
----------
Number of Estimators: 500
Overall In-sample Accuracy: 0.6609442060085837
Overall Out-sample Accuracy: 0.7108433734939759 

In-sample Class 0 Accuracy: 1.0
Out-sample Class 0 Accuracy: 0.75 

Out-sample Class 1 Accuracy: 0.7066666666666667
In-sample Class 1 Accuracy: 0.6238095238095238
----------
Number of Estimators: 1000
Overall In-sample Accuracy: 0.6630901287553648
Overall Out-sample Accuracy: 0.7228915662650602 

In [44]:
# Refit the balanced random forest with 1000 trees; this instance is the one kept.
balanced_rf_1000 = BalancedRandomForestClassifier(random_state=42, n_estimators=1000)
balanced_rf_1000.fit(X_train, y_train[:, 0])

y_pred_in, y_pred_out = (
    balanced_rf_1000.predict(X_train),
    balanced_rf_1000.predict(X_test),
)

(in_class_0_acc, in_class_1_acc), (out_class_0_acc, out_class_1_acc) = (
    get_class_wise_acc(y_train[:, 0], y_pred_in),
    get_class_wise_acc(y_test[:, 0], y_pred_out),
)
overall_in_acc = accuracy_score(y_train[:, 0], y_pred_in)
overall_out_acc = accuracy_score(y_test[:, 0], y_pred_out)

# Emit the report as one block; the lines ending in ' \n' leave a blank line after them.
report_lines = [
    f'Number of Estimators: 1000',
    f'Overall In-sample Accuracy: {overall_in_acc}',
    f'Overall Out-sample Accuracy: {overall_out_acc} \n',
    f'In-sample Class 0 Accuracy: {in_class_0_acc}',
    f'Out-sample Class 0 Accuracy: {out_class_0_acc} \n',
    f'Out-sample Class 1 Accuracy: {out_class_1_acc}',
    f'In-sample Class 1 Accuracy: {in_class_1_acc}',
]
print('\n'.join(report_lines))

Number of Estimators: 1000
Overall In-sample Accuracy: 0.6630901287553648
Overall Out-sample Accuracy: 0.7228915662650602 

In-sample Class 0 Accuracy: 1.0
Out-sample Class 0 Accuracy: 0.75 

Out-sample Class 1 Accuracy: 0.72
In-sample Class 1 Accuracy: 0.6261904761904762


In [45]:
# Persist the 1000-tree balanced random forest as the success classifier for the domain-only feature set.
joblib.dump(balanced_rf_1000, '/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Modelling/final_models/04_domain_only/success.pkl')

['/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Modelling/final_models/04_domain_only/success.pkl']