In [82]:
import os
import joblib
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score, accuracy_score, confusion_matrix, classification_report 
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingClassifier, BaggingRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.model_selection import RandomizedSearchCV

from success_utils import *

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Create a dictionary to group countries by region
region_groups = {
    "North America": ["HI", "Mexico", "NV", "OK", "NM","CA", "NY", "MD", "DC", "WA", "CO", "MN", "TX", "SC", "GA", "KY", "PA", "MI", "OH", "OR", "NC", "IL", "AZ", "TN", "NJ", "UT", "VA", "ND", "VT", "IA", "ND", "VT", "LA", "CT", "RI"],
    "Europe": ["Martinique", "AL", "Cayman Islands", "Russia", "ME", "UK", "DE", "Poland", "Denmark", "Netherlands", "Norway", "Liechtenstein", "Austria", "Belgium", "Czech Republic", "Iceland", "Ireland", "France", "Spain", "Sweden", "Switzerland", "Italy", "Slovenia", "Greece"],
    "Asia": ["MO", "IN", "Japan", "China", "Hong Kong", "India", "Viet Nam", "Taiwan", "Afghanistan", "Israel","Sri Lanka", "Jordan"],
    "South America": ["Colombia", "Brazil"],
    "Africa": ["WI", "SD", "MA", "Ghana", "Ethiopia", "Liberia", "Rwanda"],
    "Oceania": ["AU", "NZ"],
    "Other": [ "United Arab Emirates", "Svalbard and Jan Mayen"]
}

noticable_entities = ['ORG', 'PERSON', 'DATE',
                        'CARDINAL', 'GPE', 'PRODUCT', 
                        'WORK_OF_ART','ORDINAL', 'MONEY',
                        'TIME', 'NORP']


data_path = '/workspaces/Crowdfunding-Social-Media-Drivers/Data/Kickstarter_only/final_data.csv'
success_data = pd.read_csv(data_path)
success_data['collection_ratio'] = success_data.collected.apply(decode_amount)/success_data.goal_amount.apply(decode_amount)


bins = [0, 7, 14, 30, 60]
labels = ['<1 week', '1-2 weeks', '2 weeks - 1 month', '1-2 months']
success_data['duration'] = pd.cut(success_data['duration'], bins=bins, labels=labels)
success_data= success_data[success_data.donate != 1].reset_index(drop=True)


success_weights = (1-success_data.success.value_counts()/len(success_data)).values


def get_region(country):
    for region, countries in region_groups.items():
        if country in countries:
            return region
    return "Other"

success_data['region'] = success_data.country.str.strip().apply(get_region)
success_data.num_backers = success_data.num_backers.str.replace(',', '').astype(int)

success_data = pd.get_dummies(
                    success_data,
                    columns=['category',
                              'fb_type',
                              'region', 
                              'duration'])


for entity in noticable_entities:
    success_data[f'fb_entity_{entity}'] = success_data.fb_entities_identified.fillna('None').str.split(',').apply(lambda entity_list: entity in entity_list).astype(int)
    

bool_cols = success_data.dtypes[success_data.dtypes == 'bool'].index.tolist()
success_data[bool_cols] = success_data[bool_cols].astype(int)
success_data.head()



Unnamed: 0,collected,goal_amount,num_backers,pledge_types,donate,project_we_love,success,start_month,start_day,population,country,Person,Team,num_projects,num_backed,verified,fb_page_name,fb_likes_at_posting,fb_followers_at_posting,fb_likes,fb_comments,fb_shares,fb_post_views,fb_entities_identified,fb_post_sponsored,fb_post_age,fb_page_age,fb_positive_reactions,fb_negative_reactions,fb_topic_0,fb_topic_1,fb_topic_2,fb_topic_3,fb_topic_4,fb_topic_5,fb_topic_6,fb_topic_7,fb_topic_8,fb_topic_9,fb_topic_10,collection_ratio,category_art,category_comics,category_crafts,category_dance,category_design,category_fashion,category_film_Video,category_food,category_games,category_journalism,category_music,category_photography,category_publishing,category_technology,category_theater,fb_type_link,fb_type_photo,fb_type_video,region_Africa,region_Asia,region_Europe,region_North America,region_Oceania,region_Other,region_South America,duration_<1 week,duration_1-2 weeks,duration_2 weeks - 1 month,duration_1-2 months,fb_entity_ORG,fb_entity_PERSON,fb_entity_DATE,fb_entity_CARDINAL,fb_entity_GPE,fb_entity_PRODUCT,fb_entity_WORK_OF_ART,fb_entity_ORDINAL,fb_entity_MONEY,fb_entity_TIME,fb_entity_NORP
0,8.600799,8.49699,66,7.0,0.0,1.0,1,6,4,11.973093,MA,1,0,2,10,janos stone,Kickstarter,1589842,1587991,1,1,0,2,"ORG,PERSON,DATE",0,0,4839,1,0,6.6e-05,0.200046,1.4e-05,0.191961,3.4e-05,4.6e-05,8.6e-05,0.395427,8.6e-05,9.5e-05,0.177186,1.109388,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0
1,11.62675,10.886128,1462,9.0,0.0,1.0,1,7,1,15.936166,Japan,0,1,2,0,Maho Williams,Kickstarter,1589866,1588006,5,2,0,5,,0,2,4839,0,0,6.7e-05,0.170955,5.9e-05,2.9e-05,0.777889,8.2e-05,5.4e-05,4.7e-05,9.6e-05,4.5e-05,8.5e-05,2.09724,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,12.960635,10.819778,918,4.0,0.0,1.0,1,6,1,13.436372,WA,0,1,7,16,Marc Barros,Kickstarter,1589901,1588019,10,2,1,12,PRODUCT,0,2,4839,2,0,9.7e-05,1.8e-05,0.286745,0.122825,7.7e-05,4e-05,0.550016,7.2e-05,5.3e-05,5.1e-05,8.3e-05,8.50672,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
3,9.030855,11.021151,93,12.0,0.0,1.0,0,6,6,15.837972,UK,1,0,2,1,Akwasi Brenya-Mensa,Kickstarter,1590070,1588169,7,2,0,9,"FAC,NORP,PERSON,GPE",0,7,4839,0,2,8.3e-05,2e-05,0.373098,5.1e-05,2.7e-05,1.9e-05,7.1e-05,0.201138,9e-05,0.369785,7.9e-05,0.136655,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1
4,11.866416,9.903488,1047,8.0,0.0,1.0,1,6,1,11.564749,CA,1,0,3,88,Adam Lawson,Kickstarter,1589930,1588019,4,1,0,5,"CARDINAL,PERSON",0,8,4839,0,1,7e-05,1.5e-05,0.939378,7.1e-05,5.3e-05,3.7e-05,6.7e-05,1.3e-05,1.3e-05,7.7e-05,5.5e-05,7.12015,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0


In [83]:
dropping_columns = ['collected', 'verified', 'goal_amount', 'donate', 'project_we_love', 'country', 'fb_page_name', 'fb_entities_identified']

success_data.drop(columns=dropping_columns, axis=1, inplace=True)

In [84]:
success_data.dtypes

num_backers                     int64
pledge_types                  float64
success                         int64
start_month                     int64
start_day                       int64
population                    float64
Person                          int64
Team                            int64
num_projects                    int64
num_backed                      int64
fb_likes_at_posting             int64
fb_followers_at_posting         int64
fb_likes                        int64
fb_comments                     int64
fb_shares                       int64
fb_post_views                   int64
fb_post_sponsored               int64
fb_post_age                     int64
fb_page_age                     int64
fb_positive_reactions           int64
fb_negative_reactions           int64
fb_topic_0                    float64
fb_topic_1                    float64
fb_topic_2                    float64
fb_topic_3                    float64
fb_topic_4                    float64
fb_topic_5  

In [81]:
success_data.columns

Index(['num_backers', 'pledge_types', 'success', 'start_month', 'start_day',
       'population', 'Person', 'Team', 'num_projects', 'num_backed',
       'fb_likes_at_posting', 'fb_followers_at_posting', 'fb_likes',
       'fb_comments', 'fb_shares', 'fb_post_views', 'fb_post_sponsored',
       'fb_post_age', 'fb_page_age', 'fb_positive_reactions',
       'fb_negative_reactions', 'fb_topic_0', 'fb_topic_1', 'fb_topic_2',
       'fb_topic_3', 'fb_topic_4', 'fb_topic_5', 'fb_topic_6', 'fb_topic_7',
       'fb_topic_8', 'fb_topic_9', 'fb_topic_10', 'collection_ratio',
       'category_art', 'category_comics', 'category_crafts', 'category_dance',
       'category_design', 'category_fashion', 'category_film_Video',
       'category_food', 'category_games', 'category_journalism',
       'category_music', 'category_photography', 'category_publishing',
       'category_technology', 'category_theater', 'fb_type_link',
       'fb_type_photo', 'fb_type_video', 'region_Africa', 'region_Asia',
     

In [None]:
target_cols = [
    'success',
    'collection_ratio',
    'num_backers'
]

fb_cols = success_data.columns[success_data.columns.str.startswith('fb')].tolist()
