In [303]:
import os
import joblib
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import random
import sklearn
import imblearn
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier, EasyEnsembleClassifier, RUSBoostClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor 
from sklearn import linear_model, svm, tree
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, RegressorMixin

# Set the seed for Python's random module
random.seed(42)

# Set the seed for NumPy
np.random.seed(42)


target_cols = [
    'success',
    'collection_ratio',
    'num_backers'
]

# Import data
df= pd.read_csv('/workspaces/Crowdfunding-Social-Media-Drivers/Data/final_datasets/domain_post.csv')

#load_scaler
scaler = MinMaxScaler()
scaler = joblib.load('./02_success_engagement/domain_post_scaler.pkl')

df.head()
X_train, X_test, y_train, y_test = train_test_split(df.drop(target_cols, axis=1), df[target_cols], 
                                                    test_size=0.2, random_state=42, stratify=df.success)

x_train_scaled = scaler.transform(X_train)
x_test_scaled = scaler.transform(X_test)


In [304]:
df_minority = df.copy()

In [305]:
df_minority.columns

Index(['pledge_types', 'start_month', 'start_day', 'population', 'Person',
       'Team', 'num_projects', 'num_backed', 'category_art', 'category_comics',
       'category_crafts', 'category_dance', 'category_design',
       'category_fashion', 'category_film_Video', 'category_food',
       'category_games', 'category_journalism', 'category_music',
       'category_photography', 'category_publishing', 'category_technology',
       'category_theater', 'region_Africa', 'region_Asia', 'region_Europe',
       'region_North America', 'region_Oceania', 'region_Other',
       'region_South America', 'duration_<1 week', 'duration_1-2 weeks',
       'duration_2 weeks - 1 month', 'duration_1-2 months',
       'fb_likes_at_posting', 'fb_followers_at_posting', 'fb_post_views',
       'fb_post_sponsored', 'fb_post_age', 'fb_page_age', 'fb_topic_0',
       'fb_topic_1', 'fb_topic_2', 'fb_topic_3', 'fb_topic_4', 'fb_topic_5',
       'fb_topic_6', 'fb_topic_7', 'fb_topic_8', 'fb_topic_9', 'fb_topic_10

In [306]:
smotenc= imblearn.over_sampling.SMOTENC(categorical_features=['Person',
                                        'Team', 'category_art', 'category_comics',
                                        'category_crafts', 'category_dance', 'category_design',
                                        'category_fashion', 'category_film_Video', 'category_food',
                                        'category_games', 'category_journalism', 'category_music',
                                        'category_photography', 'category_publishing', 'category_technology',
                                        'category_theater', 'region_Africa', 'region_Asia', 'region_Europe',
                                        'region_North America', 'region_Oceania', 'region_Other',
                                        'region_South America', 'duration_<1 week', 'duration_1-2 weeks',
                                        'duration_2 weeks - 1 month', 'duration_1-2 months',
                                        'fb_post_sponsored', 'fb_type_link', 'fb_type_photo', 'fb_type_video', 'fb_entity_ORG',
                                        'fb_entity_PERSON', 'fb_entity_DATE', 'fb_entity_CARDINAL',
                                        'fb_entity_GPE', 'fb_entity_PRODUCT', 'fb_entity_WORK_OF_ART',
                                        'fb_entity_ORDINAL', 'fb_entity_MONEY', 'fb_entity_TIME',
                                        'fb_entity_NORP',])

X_resampled, y_resampled = smotenc.fit_resample(df_minority.drop(target_cols, axis=1), df_minority['success'])

X_resampled = pd.DataFrame(X_resampled, columns=df_minority.drop(target_cols, axis=1).columns)
y_resampled = pd.DataFrame(y_resampled)

df_resampled = pd.concat([X_resampled, y_resampled], axis=1)
df_resampled


Unnamed: 0,pledge_types,start_month,start_day,population,Person,Team,num_projects,num_backed,category_art,category_comics,...,fb_entity_DATE,fb_entity_CARDINAL,fb_entity_GPE,fb_entity_PRODUCT,fb_entity_WORK_OF_ART,fb_entity_ORDINAL,fb_entity_MONEY,fb_entity_TIME,fb_entity_NORP,success
0,7,6,4,0.007097,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,9,7,1,0.373577,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,4,6,1,0.030669,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,12,6,6,0.338637,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,8,6,1,0.004717,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1561,12,6,2,0.758475,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1562,9,4,2,0.098038,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1563,30,10,0,0.315685,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1564,9,9,1,0.098155,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [307]:
categorical_features = ['Person',
                        'Team', 'category_art', 'category_comics',
                        'category_crafts', 'category_dance', 'category_design',
                        'category_fashion', 'category_film_Video', 'category_food',
                        'category_games', 'category_journalism', 'category_music',
                        'category_photography', 'category_publishing', 'category_technology',
                        'category_theater', 'region_Africa', 'region_Asia', 'region_Europe',
                        'region_North America', 'region_Oceania', 'region_Other',
                        'region_South America', 'duration_<1 week', 'duration_1-2 weeks',
                        'duration_2 weeks - 1 month', 'duration_1-2 months',
                        'fb_post_sponsored', 'fb_type_link', 'fb_type_photo', 'fb_type_video', 'fb_entity_ORG',
                        'fb_entity_PERSON', 'fb_entity_DATE', 'fb_entity_CARDINAL',
                        'fb_entity_GPE', 'fb_entity_PRODUCT', 'fb_entity_WORK_OF_ART',
                        'fb_entity_ORDINAL', 'fb_entity_MONEY', 'fb_entity_TIME',
                        'fb_entity_NORP',]

person_team = ['Person', 'Team']
category = [x for x in categorical_features if x.startswith('category')]
region = [x for x in categorical_features if x.startswith('region')]
duration = [x for x in categorical_features if x.startswith('duration')]
fb_post_type = [x for x in categorical_features if x.startswith('fb_type')]
fb_entity = [x for x in categorical_features if x.startswith('fb_entity')]


df_resampled.loc[df_resampled[df_resampled[region].sum(axis=1) == 0].index, 'region_North America'] = 1
null_category_index = df_resampled[df_resampled[category].sum(axis=1) ==0].index
random_add_category = np.random.choice(category, size=len(null_category_index), replace=True)
for i in range(len(null_category_index)):
    df_resampled.loc[null_category_index[i], random_add_category[i]] = 1

In [313]:
entity_index = df_resampled[df_resampled[fb_entity].sum(axis=1) > 4].index

In [275]:
X_train, X_test, y_train, y_test = train_test_split(df_resampled.drop('success', axis=1), df_resampled['success'], 
                                                    test_size=0.2, random_state=42, stratify=df_resampled.success)

x_train_scaled = scaler.transform(X_train)
x_test_scaled = scaler.transform(X_test)

In [282]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Train and evaluate models
models = {
    'Logistic Regression': linear_model.LogisticRegression(),
    'Support Vector Machine': svm.SVC(),
    'Decision Tree': tree.DecisionTreeClassifier(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'AdaBoost': AdaBoostRegressor(),
    'Balanced Random Forest': BalancedRandomForestClassifier(),
    'Balanced Bagging': BalancedBaggingClassifier(),
    'Easy Ensemble': EasyEnsembleClassifier(),
    'RUSBoost': RUSBoostClassifier(),
    'XGBoost': xgb.XGBClassifier()
}

results = {}
model_dict={}
for model_name, model in models.items():
    model.fit(x_train_scaled, y_train)
    y_pred = model.predict(x_test_scaled)
    
    # Convert y_pred to binary
    y_pred_binary = [1 if pred >= 0.5 else 0 for pred in y_pred]
    
    f1 = f1_score(y_test, y_pred_binary)
    precision = precision_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)
    accuracy = accuracy_score(y_test, y_pred_binary)
    
    results[model_name] = {
        'F1 Score': f1,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy
    }
    model_dict[model_name] = model

  warn(
  warn(


In [283]:
pd.DataFrame(results).T.sort_values('F1 Score', ascending=False)

Unnamed: 0,F1 Score,Precision,Recall,Accuracy
XGBoost,0.936306,0.936306,0.936306,0.936306
Random Forest,0.926518,0.929487,0.923567,0.926752
Balanced Random Forest,0.909091,0.927152,0.89172,0.910828
Balanced Bagging,0.885906,0.93617,0.840764,0.89172
Gradient Boosting,0.873786,0.888158,0.859873,0.875796
Easy Ensemble,0.834951,0.848684,0.821656,0.83758
RUSBoost,0.834951,0.848684,0.821656,0.83758
Decision Tree,0.784983,0.845588,0.732484,0.799363
AdaBoost,0.719697,0.88785,0.605096,0.764331
Logistic Regression,0.693878,0.744526,0.649682,0.713376


In [285]:
from success_utils import *
save_model(model_dict['XGBoost'], 'domain_engagement_success_on_resampled')