In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
def prepare_data(csv_path='../Dataset/projects.csv', random_state=42):
    """Load the projects CSV and split it into train / validation / test sets.

    The target is the ``state`` column; ``state`` and ``name`` are removed
    from the feature frame. 20% of the rows are held out as the test set,
    then 10% of the remainder is held out as the validation set. Both splits
    are stratified on the target.

    Args:
        csv_path: Location of the CSV file to load.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        A list ``[X_train, y_train, X_val, y_val, X_test, y_test]``.
    """
    df = pd.read_csv(csv_path)

    X = df.drop(columns=['state', 'name'])
    y = df['state']

    # First carve out the test set, then split what is left into train/val.
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, random_state=random_state, test_size=0.2, stratify=y
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val,
        random_state=random_state, test_size=0.1, stratify=y_train_val
    )

    return [X_train, y_train, X_val, y_val, X_test, y_test]

# Run the split once for the whole notebook. Element 0 of the returned list
# is X_train, so the shape printed below is that of the training features.
splitted_data = prepare_data()
print('Shape of splitted_data', splitted_data[0].shape)

Shape of splitted_data (272635, 6)


In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

def ohe_train_encode(X):
  """Fit a one-hot encoder and a standard scaler on the training categoricals.

  Args:
    X: Categorical training features (one column per categorical variable).

  Returns:
    A tuple ``(X_ohe_scl, ohe, std)``: the one-hot encoded and standardised
    training matrix, the fitted ``OneHotEncoder`` and the fitted
    ``StandardScaler``. The two fitted objects are meant to be reused on
    other splits via ``ohe_not_train_encode``.
  """
  # handle_unknown='ignore': a category that only occurs in a later split is
  # encoded as an all-zero row for that variable instead of making
  # ``transform`` raise.
  ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
  X_ohe = ohe.fit_transform(X)
  std = StandardScaler()
  X_ohe_scl = std.fit_transform(X_ohe)
  return X_ohe_scl, ohe, std

def ohe_not_train_encode(X, ohe=None, std=None):
  """Apply already-fitted one-hot and scaling transforms to a non-training split.

  Args:
    X: Categorical features of the split to encode.
    ohe: Fitted object exposing ``transform``; applied first.
    std: Fitted object exposing ``transform``; applied to the output of ``ohe``.

  Returns:
    The result of ``std.transform(ohe.transform(X))``.
  """
  return std.transform(ohe.transform(X))

def cyclical_encode(df, col, max_val):
    """Replace a periodic column by its sine/cosine projection.

    Args:
        df: Input frame; it is not modified.
        col: Name of the column to encode.
        max_val: Period used to map the column's values onto the circle.

    Returns:
        A new frame without ``col`` and with ``<col>_sin`` and ``<col>_cos``
        appended after the remaining columns.
    """
    angle = 2 * np.pi * df[col] / max_val
    projections = {col + '_sin': np.sin(angle), col + '_cos': np.cos(angle)}
    return df.drop(columns=col).assign(**projections)


def train_label_encoder(y):
  """Fit a new ``LabelEncoder`` on ``y``.

  Args:
    y: Training labels.

  Returns:
    A tuple ``(encoded_labels, lbl_enc)`` with the encoded training labels
    and the fitted encoder, so the same mapping can be applied to other splits.
  """
  fitted_encoder = LabelEncoder()
  return fitted_encoder.fit_transform(y), fitted_encoder

def not_train_label_encoder(y, lbl_enc=None):
  """Encode ``y`` with an already-fitted label encoder.

  Args:
    y: Labels of a non-training split.
    lbl_enc: Fitted object exposing ``transform``.

  Returns:
    Whatever ``lbl_enc.transform(y)`` returns.
  """
  return lbl_enc.transform(y)

# Columns that are one-hot encoded.
_OHE_COLUMNS = ['category', 'main_category', 'currency']

# (column, period) pairs, listed in the order their sin/cos pairs appear in
# the output matrix. Day-of-month uses a period of 31: with a period of 30,
# day 31 would land on exactly the same angle as day 1.
_CYCLICAL_SPECS = (
    ('month_deadline', 12),
    ('month_launched', 12),
    ('day_deadline', 31),
    ('day_launched', 31),
    ('hour_launched', 24),
)


def _calendar_parts(X):
    """Return a new frame holding the calendar parts of 'deadline' and 'launched'."""
    deadline = pd.to_datetime(X['deadline'])
    launched = pd.to_datetime(X['launched'])
    return pd.DataFrame(index=X.index).assign(
        month_deadline=deadline.dt.month,
        day_deadline=deadline.dt.day,
        month_launched=launched.dt.month,
        day_launched=launched.dt.day,
        hour_launched=launched.dt.hour,
    )


def _cyclical_block(X):
    """Return the sin/cos encoded calendar features of X as one 2-D array."""
    parts = _calendar_parts(X)
    encoded = [
        cyclical_encode(parts[[col]], col, period).to_numpy()
        for col, period in _CYCLICAL_SPECS
    ]
    return np.concatenate(encoded, axis=1)


def fprepare_data(data_variables):
    """Turn the raw train/val/test splits into numeric feature matrices.

    For every split, the columns ``category``, ``main_category`` and
    ``currency`` are cast to ``str``, one-hot encoded and standardised (the
    encoder and scaler are fitted on the training split only). The month/day
    of ``deadline`` and the month/day/hour of ``launched`` are sin/cos
    encoded and appended after the one-hot columns.

    The input DataFrames are not modified, so the function can be called
    repeatedly on the same ``data_variables``.

    Args:
        data_variables: Sequence
            ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Returns:
        A tuple ``(X_train, X_val, X_test)`` of 2-D NumPy arrays. The labels
        are not part of the return value.

    Raises:
        ValueError: If the training matrix and the training labels have a
            different number of rows.
    """
    X_train, y_train, X_val, y_val, X_test, y_test = data_variables

    # The encoded labels are only used for validation here: fitting on train
    # and transforming val/test surfaces label problems early, and the encoded
    # training labels feed the row-count check at the end.
    y_train_encoded, lbl_enc = train_label_encoder(y_train)
    not_train_label_encoder(y_val, lbl_enc=lbl_enc)
    not_train_label_encoder(y_test, lbl_enc=lbl_enc)

    # One-hot + scaling: fit on train, reuse the fitted objects elsewhere.
    X_train_ohe, ohe, std = ohe_train_encode(X_train[_OHE_COLUMNS].astype(str))
    X_val_ohe = ohe_not_train_encode(X_val[_OHE_COLUMNS].astype(str), ohe=ohe, std=std)
    X_test_ohe = ohe_not_train_encode(X_test[_OHE_COLUMNS].astype(str), ohe=ohe, std=std)

    X_train_out = np.concatenate((X_train_ohe, _cyclical_block(X_train)), axis=1)
    X_val_out = np.concatenate((X_val_ohe, _cyclical_block(X_val)), axis=1)
    X_test_out = np.concatenate((X_test_ohe, _cyclical_block(X_test)), axis=1)

    if X_train_out.shape[0] != y_train_encoded.shape[0]:
        raise ValueError('X_train and y_train must have the same number of rows')

    return X_train_out, X_val_out, X_test_out

# Build the encoded feature matrices; `features` is the tuple
# (X_train, X_val, X_test) returned by fprepare_data.
features = fprepare_data(splitted_data)
# Print the shape of the encoded training matrix.
print('Shape of features: ', features[0].shape)

Shape of features:  (272635, 198)


In [4]:
from sklearn.metrics import classification_report
import tensorflow as tf

def prediction(model, X_test, y_test):
    """Evaluate ``model`` on a held-out split, print metrics and save the model.

    The model's outputs are thresholded at 0.5 to obtain 0/1 predictions. A
    full classification report is printed, followed by weighted precision,
    recall and F1 rounded to three decimals. As a side effect the model is
    written to the ``saved_model`` directory.

    Args:
        model: Object exposing ``predict``; also passed to
            ``tf.saved_model.save``.
        X_test: Features to predict on.
        y_test: True labels for ``X_test``.
    """
    scores = model.predict(X_test)
    y_pred = (scores > 0.5) * 1
    print(classification_report(y_test, y_pred))

    # Summary metrics, each weighted by class support.
    from sklearn.metrics import precision_score, recall_score, f1_score
    summary_metrics = (
        ('Precision: ', precision_score),
        ('Recall: ', recall_score),
        ('F1 score: ', f1_score),
    )
    for label, metric_fn in summary_metrics:
        print(label, round(metric_fn(y_test, y_pred.astype(int), average='weighted'), 3))

    # Persist the evaluated model.
    tf.saved_model.save(model, 'saved_model')

In [5]:
import os
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras_tuner.tuners import RandomSearch

def reset_seeds(seed=2):
    """Seed the random number generators used by this notebook.

    Seeds TensorFlow, NumPy's legacy global generator and Python's ``random``
    module, and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to every generator. Defaults to 2.
    """
    # NOTE: the running interpreter fixed its hash seed at startup, so this
    # environment variable only influences processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)


def train_model(X_train, y_train, X_val, y_val):
    """Tune and train a small dense binary classifier.

    A ``RandomSearch`` tuner explores layer widths, dropout rate and learning
    rate for 5 trials of 20 epochs each, optimising validation accuracy. The
    best hyperparameters are then used to build a fresh model, which is
    trained for 50 epochs.

    Args:
        X_train: Training feature matrix; its second dimension fixes the
            model's input size.
        y_train: Training targets.
        X_val: Validation feature matrix.
        y_val: Validation targets.

    Returns:
        The Keras model trained with the best hyperparameters found.
    """
    # Seed every RNG before any weights are initialised or any trial is run.
    # Seeding after training has no effect on the result.
    reset_seeds()

    def build_model(hp):
        """Build and compile one candidate model from the hyperparameters ``hp``."""
        model = keras.Sequential([
            keras.Input(shape=(X_train.shape[1],), name="input"),
            layers.BatchNormalization(),
            layers.Dense(hp.Int('units1', 16, 64, step=16), activation=keras.activations.selu, name="hidden_layer1", kernel_initializer=keras.initializers.LecunNormal),
            keras.layers.Dropout(rate=hp.Float('dropout2', 0, 0.5, step=0.1)),
            layers.BatchNormalization(),
            layers.Dense(hp.Int('units2', 8, 32, step=8), activation=keras.activations.selu, name="hidden_layer2", kernel_initializer=keras.initializers.LecunNormal),
            layers.Dense(1, activation="sigmoid", name="output")
        ])

        model.compile(
            optimizer=keras.optimizers.Adam(hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')),
            loss=tf.keras.losses.BinaryCrossentropy(),
            metrics=['accuracy']
        )

        return model

    # NOTE(review): the tuner is created without ``overwrite``; results already
    # stored under tuner_dir/my_model appear to be reloaded on later runs, so
    # clear that directory when the data or labels change.
    tuner = RandomSearch(
        build_model,
        objective='val_accuracy',
        max_trials=5,
        executions_per_trial=1,
        directory='tuner_dir',
        project_name='my_model'
    )
    tuner.search(X_train, y_train, epochs=20, validation_data=(X_val, y_val))

    # Retrieve best hyperparameters and fit the final model. Re-seed first so
    # the final model's initialisation does not depend on how many trials
    # actually ran in this session.
    best_hyperparameters = tuner.get_best_hyperparameters(1)[0]
    reset_seeds()
    model = tuner.hypermodel.build(best_hyperparameters)
    model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val))

    return model


In [6]:
# Train the model on the label-encoded `state` targets.
# The arrays in `features` contain input columns only (their last column is
# the cosine of the launch hour), so slicing `[:, -1]` out of them does not
# yield the labels. The targets come from the raw split instead:
# splitted_data[1] is y_train and splitted_data[3] is y_val.
y_train, lbl_enc = train_label_encoder(splitted_data[1])
y_val = not_train_label_encoder(splitted_data[3], lbl_enc=lbl_enc)
# NOTE(review): train_model ends in a single sigmoid unit trained with binary
# cross-entropy, which presumes `state` has exactly two classes - confirm
# against the dataset.
model = train_model(features[0], y_train, features[1], y_val)

INFO:tensorflow:Reloading Tuner from tuner_dir\my_model\tuner0.json

Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
64                |64                |units1
0.4               |0.4               |dropout2
8                 |8                 |units2
0.0061613         |0.0061613         |learning_rate

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20

KeyboardInterrupt: 