In [33]:
# imports
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import make_scorer


from helper_functions import load_data
from helper_functions import my_split
from helper_functions import downsample_majority
from helper_functions import model_prep
from helper_functions import get_results
from helper_functions import get_f1

In [2]:
# Load the campaign dataset with the project helper `load_data`
# (imported from helper_functions; where it reads from is not visible here).
df = load_data()
# Last expression of the cell: show the first rows as a quick sanity check.
df.head()

Unnamed: 0,name,desc,goal,disable_communication,country,currency,deadline,launched_at,final_status,campaign_length,launch_year,launch_month,launch_day,launch_weekday
0,drawing for dollars,I like drawing pictures. and then i color them...,20.0,0,US,USD,2009-05-03 02:59:59,2009-04-24 15:52:03,1,8,2009,4,24,4
1,Sponsor Dereck Blackburn (Lostwars) Artist in ...,"I, Dereck Blackburn will be taking upon an inc...",300.0,0,US,USD,2009-05-15 19:10:00,2009-04-28 23:26:32,0,17,2009,4,28,1
2,Mr. Squiggles,So I saw darkpony's successfully funded drawin...,30.0,0,US,USD,2009-05-22 17:26:00,2009-05-12 17:39:58,0,10,2009,5,12,1
3,Help me write my second novel.,Do your part to help out starving artists and ...,500.0,0,US,USD,2009-05-28 20:09:00,2009-04-28 20:58:50,1,30,2009,4,28,1
4,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, current...",2000.0,0,US,USD,2009-05-31 07:38:00,2009-05-01 08:22:21,0,30,2009,5,1,4


In [3]:
# Split the data into train and test sets with the project helper `my_split`.
# `year` is the cut-off value handed to the helper — presumably rows are routed
# to train/test by launch year; confirm in helper_functions.my_split.
year = 2015
train, test = my_split(df, year)

In [4]:
# Predictor columns passed to model_prep for every model in this notebook.
features = [
    'goal',
    'campaign_length',
    'launch_month',
    'launch_day',
    'launch_weekday',
    'disable_communication',
    'country',
    'currency',
]

# Label column, and the number of training epochs shared by all fits below.
target = 'final_status'
epochs = 20

In [5]:
# Baseline network, trained on the training set exactly as split
# (no resampling of the classes).
X_train, y_train, X_test, y_test = model_prep(train, test, features, target)
input_dim = X_train.shape[1]  # width of the prepared feature matrix

# One hidden layer of 10 ReLU units with 20% dropout, then a sigmoid output.
model = Sequential([
    Dense(10, input_dim=input_dim, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', get_f1])

# The test split is passed as validation data, so the per-epoch val_* numbers
# are computed on the test set.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
y_true = y_test
y_pred = (model.predict(X_test) > 0.5).astype("int32")

get_results(y_true, y_pred)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
-------------------------------
Accuracy Score: 0.723826714801444
ROC AUC Score: 0.5309832634963456
F1 Score: 0.22408874801901743


In [45]:
# basic neural network - downsampling majority class
# NOTE(review): this cell was previously labelled "downsampling minority class" /
# "scores with upsampling", but the code calls downsample_majority, exactly like
# the following cell. If an upsampling experiment was intended, the resampling
# call below must change — no upsampling helper is imported in this notebook.
train_downsampled = downsample_majority(train)

X_train, y_train, X_test, y_test = model_prep(train_downsampled, test, features, target)

# Number of input columns after model_prep, used to size the first layer.
input_dim = X_train.shape[1]

# Same architecture as the baseline: 10 ReLU units, 20% dropout, sigmoid output.
model = Sequential()
model.add(Dense(10, input_dim=input_dim, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', get_f1]
)

# The test split is used as validation data during training.
history = model.fit(x=X_train, 
          y=y_train, 
          epochs=epochs,
          validation_data=(X_test, y_test) 
          )

# scores with downsampling (the code above downsamples; see NOTE at the top of this cell)
y_pred = (model.predict(X_test) > 0.5).astype("int32")
y_true = y_test

get_results(y_true, y_pred)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
-------------------------------
Accuracy Score: 0.447202166064982
ROC AUC Score: 0.5706145712272757
F1 Score: 0.4004649455524287


In [42]:
# Baseline architecture again, this time fitted on the training set returned by
# downsample_majority (majority class reduced before model_prep).
train_downsampled = downsample_majority(train)
X_train, y_train, X_test, y_test = model_prep(train_downsampled, test, features, target)
input_dim = X_train.shape[1]  # width of the prepared feature matrix

# 10 ReLU units -> 20% dropout -> sigmoid output.
model = Sequential([
    Dense(10, input_dim=input_dim, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', get_f1])

# Validation metrics are computed on the (untouched) test split.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

# Hard predictions at a 0.5 threshold, scored against the test labels.
y_true = y_test
y_pred = (model.predict(X_test) > 0.5).astype("int32")

get_results(y_true, y_pred)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
-------------------------------
Accuracy Score: 0.4565658844765343
ROC AUC Score: 0.5707148606400113
F1 Score: 0.39937655860349136


In [43]:
# Deeper network (30 -> 20 -> 10 hidden units) fitted on the training set
# returned by downsample_majority.
train_downsampled = downsample_majority(train)
X_train, y_train, X_test, y_test = model_prep(train_downsampled, test, features, target)
input_dim = X_train.shape[1]  # width of the prepared feature matrix

# Every hidden layer is ReLU followed by 20% dropout; sigmoid output.
model = Sequential([
    Dense(30, input_dim=input_dim, activation='relu'),
    Dropout(0.2),
    Dense(20, activation='relu'),
    Dropout(0.2),
    Dense(10, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', get_f1])

# Fit, using the test split as validation data.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

# Hard predictions at a 0.5 threshold, scored against the test labels.
y_true = y_test
y_pred = (model.predict(X_test) > 0.5).astype("int32")

get_results(y_true, y_pred)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
-------------------------------
Accuracy Score: 0.5073330324909747
ROC AUC Score: 0.5762631806382221
F1 Score: 0.39757207890743557


In [44]:
# Much wider, tapered network (300 -> 200 -> 100 -> 50 -> 10 hidden units)
# fitted on the training set returned by downsample_majority.
train_downsampled = downsample_majority(train)
X_train, y_train, X_test, y_test = model_prep(train_downsampled, test, features, target)
input_dim = X_train.shape[1]  # width of the prepared feature matrix

model = Sequential()
# First hidden layer carries the input dimension.
model.add(Dense(300, input_dim=input_dim, activation='relu'))
model.add(Dropout(0.2))
# Remaining hidden layers: ReLU + 20% dropout each, shrinking in width.
for units in (200, 100, 50, 10):
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', get_f1])

# Fit, using the test split as validation data.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

# Hard predictions at a 0.5 threshold, scored against the test labels.
y_true = y_test
y_pred = (model.predict(X_test) > 0.5).astype("int32")

get_results(y_true, y_pred)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
-------------------------------
Accuracy Score: 0.4551556859205776
ROC AUC Score: 0.5760412750791634
F1 Score: 0.4041700080192462


In [10]:
# Five hidden layers of 300 units each (constant width), fitted on the
# training set returned by downsample_majority.
train_downsampled = downsample_majority(train)
X_train, y_train, X_test, y_test = model_prep(train_downsampled, test, features, target)
input_dim = X_train.shape[1]  # width of the prepared feature matrix

model = Sequential()
# First hidden layer carries the input dimension.
model.add(Dense(300, input_dim=input_dim, activation='relu'))
model.add(Dropout(0.2))
# Four more identical ReLU + 20% dropout blocks.
for hidden_index in range(4):
    model.add(Dense(300, activation='relu'))
    model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', get_f1])

# Fit, using the test split as validation data.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

# Hard predictions at a 0.5 threshold, scored against the test labels.
y_true = y_test
y_pred = (model.predict(X_test) > 0.5).astype("int32")

get_results(y_true, y_pred)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
-------------------------------
Accuracy Score: 0.4834724729241877
ROC AUC Score: 0.5802517816245709
F1 Score: 0.4045002276126683


In [46]:
# Grid search. The search below reports accuracy rather than F1; the working
# assumption is that, when training on a balanced set, a higher accuracy should
# also translate into a higher F1.
# Data prep is repeated here so this cell does not depend on which model cell ran last.
train_downsampled = downsample_majority(train)
X_train, y_train, X_test, y_test = model_prep(train_downsampled, test, features, target)
# `input_dim` is read as a global inside create_model to size the first layer.
input_dim = X_train.shape[1]

def create_model(lr, optimizer='adam', number_layers=1,
        neurons1=1, act1='relu', init_mode1='uniform', dropout_rate1=0.0,
        neurons2=1, act2='relu', init_mode2='uniform', dropout_rate2=0.0,
        neurons3=1, act3='relu', init_mode3='uniform', dropout_rate3=0.0,
        neurons4=1, act4='relu', init_mode4='uniform', dropout_rate4=0.0,
        neurons5=1, act5='relu', init_mode5='uniform', dropout_rate5=0.0,
        init_mode6='uniform'):
    """Build and compile a binary classifier with ``number_layers`` hidden layers.

    Intended as the ``build_fn`` of a KerasClassifier so that every argument can
    be tuned from a scikit-learn parameter grid.

    Parameters
    ----------
    lr : float
        Learning rate applied to the optimizer.
    optimizer : str or optimizer instance
        Optimizer identifier. ``'adam'`` builds ``Adam(learning_rate=lr)``; any
        other identifier is resolved with ``tf.keras.optimizers.get`` and then
        has its learning rate set to ``lr``.
    number_layers : int
        How many of the five hidden-layer slots to use (clamped to 0..5).
        Slots are used in order: 1, then 2, and so on.
    neuronsK, actK, init_modeK, dropout_rateK
        Width, activation, kernel initializer and dropout rate of hidden
        layer K (K = 1..5). Ignored when K > ``number_layers``.
    init_mode6 : str
        Kernel initializer of the sigmoid output layer.

    Returns
    -------
    A compiled Sequential model (binary cross-entropy loss, accuracy metric).

    Notes
    -----
    Reads the module-level ``input_dim`` to size the first layer.

    Fixes relative to the previous version:
    * the layer conditions were inverted (``number_layers <= k``), so
      ``number_layers=1`` stacked all five hidden layers and larger values
      dropped layer 1, the only one declaring the input shape;
    * the Adam instance built from ``lr`` was never handed to ``compile``, so
      the learning rate had no effect.
    """
    layer_specs = (
        (neurons1, act1, init_mode1, dropout_rate1),
        (neurons2, act2, init_mode2, dropout_rate2),
        (neurons3, act3, init_mode3, dropout_rate3),
        (neurons4, act4, init_mode4, dropout_rate4),
        (neurons5, act5, init_mode5, dropout_rate5),
    )
    # Clamp so an out-of-range request cannot run past the available slots.
    depth = max(0, min(number_layers, len(layer_specs)))

    model = Sequential()
    for position, (units, activation, init_mode, dropout_rate) in enumerate(layer_specs[:depth]):
        # Only the very first layer of the network declares the input shape.
        shape_kwargs = {'input_shape': (input_dim,)} if position == 0 else {}
        model.add(Dense(units, activation=activation, kernel_initializer=init_mode, **shape_kwargs))
        model.add(Dropout(dropout_rate))

    # With no hidden layers the output layer is the first layer and needs the shape.
    output_kwargs = {'input_shape': (input_dim,)} if depth == 0 else {}
    model.add(Dense(1, activation='sigmoid', kernel_initializer=init_mode6, **output_kwargs))

    # Make sure `lr` actually reaches the optimizer used for training.
    if isinstance(optimizer, str) and optimizer.lower() == 'adam':
        compiled_optimizer = Adam(learning_rate=lr)
    else:
        compiled_optimizer = tf.keras.optimizers.get(optimizer)
        compiled_optimizer.learning_rate = lr

    model.compile(optimizer=compiled_optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap the builder function so scikit-learn can drive it as an estimator.
model = KerasClassifier(build_fn=create_model, verbose=10)

# Search space. Training settings first ...
param_grid = {
    'batch_size': [50, 100],
    'epochs': [20],
    'lr': [.01, .1, .5],
    'optimizer': ['adam'],
    'number_layers': [1, 2, 3],
}
# ... then the same four knobs for each of hidden layers 1-3. Layers 4 and 5
# are deliberately left out of the grid to reduce run time, so create_model's
# defaults apply to them.
for layer_no in (1, 2, 3):
    param_grid.update({
        f'neurons{layer_no}': [1, 5, 10, 20],
        f'act{layer_no}': ['relu'],
        f'init_mode{layer_no}': ['uniform'],
        f'dropout_rate{layer_no}': [0, .1, .3],
    })
# Initializer of the output layer.
param_grid['init_mode6'] = ['uniform']

# Exhaustive search over the grid, run with 4 parallel jobs.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=4)
grid_result = grid.fit(X_train, y_train)

print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")