In [224]:
import os
import warnings
import joblib
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from lazypredict.Supervised import LazyRegressor, LazyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score, accuracy_score, confusion_matrix, classification_report 
from sklearn import ensemble, linear_model, svm
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.model_selection import RandomizedSearchCV

from success_utils import *

# Notebook-wide settings: lift pandas' truncation of wide/long frames and
# silence every warning for the rest of the session.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# Lookup table used by `get_region`: region name -> values of the `country`
# column that belong to it.
# NOTE(review): several two-letter codes (e.g. "AL", "ME", "DE", "MO", "IN",
# "WI", "SD", "MA") sit outside "North America". If they are US state
# abbreviations rather than country codes, those rows land in the wrong
# region - confirm against the raw `country` values. "ND" and "VT" are also
# listed twice (harmless for membership tests).
region_groups = {
    "North America": [
        "HI", "Mexico", "NV", "OK", "NM", "CA", "NY", "MD", "DC", "WA",
        "CO", "MN", "TX", "SC", "GA", "KY", "PA", "MI", "OH", "OR",
        "NC", "IL", "AZ", "TN", "NJ", "UT", "VA", "ND", "VT", "IA",
        "ND", "VT", "LA", "CT", "RI",
    ],
    "Europe": [
        "Martinique", "AL", "Cayman Islands", "Russia", "ME", "UK", "DE",
        "Poland", "Denmark", "Netherlands", "Norway", "Liechtenstein",
        "Austria", "Belgium", "Czech Republic", "Iceland", "Ireland",
        "France", "Spain", "Sweden", "Switzerland", "Italy", "Slovenia",
        "Greece",
    ],
    "Asia": [
        "MO", "IN", "Japan", "China", "Hong Kong", "India", "Viet Nam",
        "Taiwan", "Afghanistan", "Israel", "Sri Lanka", "Jordan",
    ],
    "South America": ["Colombia", "Brazil"],
    "Africa": ["WI", "SD", "MA", "Ghana", "Ethiopia", "Liberia", "Rwanda"],
    "Oceania": ["AU", "NZ"],
    "Other": ["United Arab Emirates", "Svalbard and Jan Mayen"],
}

# Entity labels that get their own `fb_entity_<LABEL>` indicator column.
noticable_entities = [
    'ORG', 'PERSON', 'DATE',
    'CARDINAL', 'GPE', 'PRODUCT',
    'WORK_OF_ART', 'ORDINAL', 'MONEY',
    'TIME', 'NORP',
]

# Load the project-level dataset.
data_path = '/workspaces/Crowdfunding-Social-Media-Drivers/Data/Kickstarter_only/final_data.csv'
success_data = pd.read_csv(data_path)

# collection_ratio = decoded amount collected / decoded goal amount.
# `decode_amount` comes from `success_utils`; its exact transform is not
# visible in this notebook.
collected_decoded = success_data['collected'].apply(decode_amount)
goal_decoded = success_data['goal_amount'].apply(decode_amount)
success_data['collection_ratio'] = collected_decoded / goal_decoded

# Bucket the campaign duration into four labelled ranges. Values outside
# (0, 60] fall in no bin and become NaN.
bins = [0, 7, 14, 30, 60]
labels = ['<1 week', '1-2 weeks', '2 weeks - 1 month', '1-2 months']
success_data['duration'] = pd.cut(success_data['duration'], bins=bins, labels=labels)

# Keep only rows whose `donate` flag is not 1, then renumber the index.
success_data = success_data[success_data['donate'] != 1].reset_index(drop=True)

# One weight per `success` class: 1 - (class count / number of rows),
# in the order `value_counts()` returns the classes (most frequent first).
success_share = success_data['success'].value_counts() / len(success_data)
success_weights = (1 - success_share).values

def get_region(country):
    """Return the `region_groups` key whose list contains `country`.

    Groups are scanned in dictionary order and the first match wins.
    A value that appears in no group yields "Other".
    """
    matching_regions = (
        region_name
        for region_name, members in region_groups.items()
        if country in members
    )
    return next(matching_regions, "Other")

# Map each whitespace-trimmed country value onto its region bucket.
success_data['region'] = success_data.country.str.strip().apply(get_region)

# Remove the ',' characters from the string backer counts, then cast to int.
# `regex=False` makes the replacement a literal match on every pandas version.
success_data.num_backers = success_data.num_backers.str.replace(',', '', regex=False).astype(int)

# One-hot encode the categorical columns.
success_data = pd.get_dummies(
                    success_data,
                    columns=['category',
                              'fb_type',
                              'region',
                              'duration'])

# Split the comma-separated entity labels once instead of once per entity.
# Missing values become the single placeholder label 'None', which matches
# none of the tracked entities, so those rows get 0 in every indicator column.
entity_lists = success_data.fb_entities_identified.fillna('None').str.split(',')
for entity in noticable_entities:
    success_data[f'fb_entity_{entity}'] = entity_lists.apply(
        lambda entity_list: entity in entity_list
    ).astype(int)

# Cast any boolean columns (e.g. dummy columns) to 0/1 integers.
bool_cols = success_data.dtypes[success_data.dtypes == 'bool'].index.tolist()
success_data[bool_cols] = success_data[bool_cols].astype(int)

# Encoding the number of backers (`encode_targets` comes from `success_utils`;
# its transform is not visible in this notebook).
success_data.num_backers = encode_targets(success_data.num_backers)
# NOTE(review): population is exponentiated here - presumably it is stored as
# a log value in the CSV; confirm against the data preparation step.
success_data.population = np.exp(success_data.population)

# Rescale the listed numeric features with MinMaxScaler.
# NOTE(review): the scaler is fit on the full frame before any train/test
# split appears in this notebook; if the modelling data derives from this
# frame, test rows influence the scaling - confirm this is acceptable.
features_to_scale = ['population', 'num_projects', 'num_backed']
scaler = MinMaxScaler()
scaler.fit(success_data[features_to_scale])
scaled_features = scaler.transform(success_data[features_to_scale])
success_data[features_to_scale] = scaled_features

success_data.head()



Unnamed: 0,collected,goal_amount,num_backers,pledge_types,donate,project_we_love,success,start_month,start_day,population,country,Person,Team,num_projects,num_backed,verified,fb_page_name,fb_likes_at_posting,fb_followers_at_posting,fb_likes,fb_comments,fb_shares,fb_post_views,fb_entities_identified,fb_post_sponsored,fb_post_age,fb_page_age,fb_positive_reactions,fb_negative_reactions,fb_topic_0,fb_topic_1,fb_topic_2,fb_topic_3,fb_topic_4,fb_topic_5,fb_topic_6,fb_topic_7,fb_topic_8,fb_topic_9,fb_topic_10,collection_ratio,category_art,category_comics,category_crafts,category_dance,category_design,category_fashion,category_film_Video,category_food,category_games,category_journalism,category_music,category_photography,category_publishing,category_technology,category_theater,fb_type_link,fb_type_photo,fb_type_video,region_Africa,region_Asia,region_Europe,region_North America,region_Oceania,region_Other,region_South America,duration_<1 week,duration_1-2 weeks,duration_2 weeks - 1 month,duration_1-2 months,fb_entity_ORG,fb_entity_PERSON,fb_entity_DATE,fb_entity_CARDINAL,fb_entity_GPE,fb_entity_PRODUCT,fb_entity_WORK_OF_ART,fb_entity_ORDINAL,fb_entity_MONEY,fb_entity_TIME,fb_entity_NORP
0,8.6,8.5,4.2,7.0,0.0,1.0,1,6,4,0.01,MA,1,0,0.02,0.01,janos stone,Kickstarter,1589842,1587991,1,1,0,2,"ORG,PERSON,DATE",0,0,4839,1,0,0.0,0.2,0.0,0.19,0.0,0.0,0.0,0.4,0.0,0.0,0.18,1.11,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0
1,11.63,10.89,7.29,9.0,0.0,1.0,1,7,1,0.37,Japan,0,1,0.02,0.0,Maho Williams,Kickstarter,1589866,1588006,5,2,0,5,,0,2,4839,0,0,0.0,0.17,0.0,0.0,0.78,0.0,0.0,0.0,0.0,0.0,0.0,2.1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,12.96,10.82,6.82,4.0,0.0,1.0,1,6,1,0.03,WA,0,1,0.09,0.02,Marc Barros,Kickstarter,1589901,1588019,10,2,1,12,PRODUCT,0,2,4839,2,0,0.0,0.0,0.29,0.12,0.0,0.0,0.55,0.0,0.0,0.0,0.0,8.51,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
3,9.03,11.02,4.54,12.0,0.0,1.0,0,6,6,0.34,UK,1,0,0.02,0.0,Akwasi Brenya-Mensa,Kickstarter,1590070,1588169,7,2,0,9,"FAC,NORP,PERSON,GPE",0,7,4839,0,2,0.0,0.0,0.37,0.0,0.0,0.0,0.0,0.2,0.0,0.37,0.0,0.14,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1
4,11.87,9.9,6.95,8.0,0.0,1.0,1,6,1,0.0,CA,1,0,0.03,0.13,Adam Lawson,Kickstarter,1589930,1588019,4,1,0,5,"CARDINAL,PERSON",0,8,4839,0,1,0.0,0.0,0.94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.12,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0


In [225]:
# Columns removed before modelling.
dropping_columns = ['collected', 'verified', 'goal_amount', 'donate', 'project_we_love', 'country', 'fb_page_name', 'fb_entities_identified']

# Rebind instead of mutating in place, and drop only the columns that are
# still present, so re-running this cell is a no-op rather than a KeyError.
success_data = success_data.drop(
    columns=[column for column in dropping_columns if column in success_data.columns]
)

In [226]:
# Columns the models predict.
target_cols = ['success', 'collection_ratio', 'num_backers']

# Partition the remaining columns by name prefix: everything starting with
# 'fb' is a Facebook feature; everything else (minus the targets) is common.
all_columns = success_data.columns.tolist()
fb_cols = [column for column in all_columns if column.startswith('fb')]
common_features = [
    column
    for column in all_columns
    if not column.startswith('fb') and column not in target_cols
]

# Within the Facebook columns, separate the engagement counts from the rest.
engagement_features = [
    'fb_likes',
    'fb_comments',
    'fb_shares',
    'fb_positive_reactions',
    'fb_negative_reactions',
]
post_fb_features = [column for column in fb_cols if column not in engagement_features]


In [227]:
# Target columns, kept separately from every feature view below.
targets = success_data[target_cols]

# Three feature views, all sharing the common (non-Facebook) columns:
#   page       - common columns only
#   engagement - common columns + Facebook engagement counts
#   post       - common columns + the remaining Facebook columns
domain_page_features = success_data[[*common_features]]
domain_engagement_features = success_data[[*common_features, *engagement_features]]
domain_post_features = success_data[[*common_features, *post_fb_features]]

In [228]:
# Show which columns ended up in `common_features`.
print(common_features)

['pledge_types', 'start_month', 'start_day', 'population', 'Person', 'Team', 'num_projects', 'num_backed', 'category_art', 'category_comics', 'category_crafts', 'category_dance', 'category_design', 'category_fashion', 'category_film_Video', 'category_food', 'category_games', 'category_journalism', 'category_music', 'category_photography', 'category_publishing', 'category_technology', 'category_theater', 'region_Africa', 'region_Asia', 'region_Europe', 'region_North America', 'region_Oceania', 'region_Other', 'region_South America', 'duration_<1 week', 'duration_1-2 weeks', 'duration_2 weeks - 1 month', 'duration_1-2 months']


# Modelling

In [229]:
# Load the modelling dataset from disk.
# NOTE(review): no visible cell writes this file - presumably it was exported
# from the engagement feature view plus the targets; confirm so that
# Restart & Run All reproduces it.
domain_engagement_path = '/workspaces/Crowdfunding-Social-Media-Drivers/Data/final_datasets/domain_engagement.csv'
domain_engagement = pd.read_csv(domain_engagement_path)

In [230]:
# Hold out 20% of the rows for testing; stratify on `success` so both splits
# keep the same class balance. The fixed seed makes the split reproducible.
train, test = train_test_split(
    domain_engagement,
    test_size=0.2,
    random_state=42,
    stratify=domain_engagement.success,
)

train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

# Separate the three target columns from the features.
train_X = train.drop(columns=['success', 'collection_ratio', 'num_backers'])
train_y = train[['success', 'collection_ratio', 'num_backers']]
test_X = test.drop(columns=['success', 'collection_ratio', 'num_backers'])
test_y = test[['success', 'collection_ratio', 'num_backers']]

# Per-row sample weights: each row gets 1 - (relative frequency of its class),
# so the rarer class weighs more. The weight is looked up by class label;
# `value_counts()` orders classes by frequency, so picking weights by position
# would silently swap them whenever class 1 is not the majority.
class_weights = 1 - train.success.value_counts() / len(train)
train_weights = train_y.success.map(class_weights).tolist()

# Strip characters from the feature names, presumably because XGBoost rejects
# feature names containing '[', ']' or '<' - confirm. `regex=False` is
# required: on pandas < 2.0 the pattern is a regex by default and a lone '['
# raises a regex error.
invalid_chars = ['[', ']', '<']
for char in invalid_chars:
    train_X.columns = train_X.columns.str.replace(char, '', regex=False)
    test_X.columns = test_X.columns.str.replace(char, '', regex=False)


Selected models [Using LazyPredict]
* Classifier:
    * XGB
    * SVC
    * LinearSVC
    * CalibratedClassifierCV
    * RandomForestClassifier

* Num_backers:
    * Gamma
    * XGB
    * LassoLarsCV
    * LassoLars
    * ElasticNet

* collection_ratio:
    * PassiveAggressiveRegressor
    * HuberRegressor
    * LinearSVR
    * NuSVR
