In [28]:
# pip install pandas numpy scikit-learn tensorflow joblib


In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Conv1D, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import joblib
import os
from tensorflow.keras import regularizers

# Hyper-parameters read as globals by the Keras model builders and fit() calls below
epoch = 1000
l2_penalty = 0.001

# Read the prepared dataset from disk
df = pd.read_csv('./dataset_transformed.csv')

# Discard every row that is missing one of the critical columns
critical_columns = ['status', 'funding_total_usd', 'country_code']
df.dropna(subset=critical_columns, inplace=True)

# Integer-encode the categorical columns; the fitted encoder of each column is kept
categorical_columns = [
    'category_list', 'country_code', 'state_code', 'region',
    'city', 'first_funding_at', 'last_funding_at', 'founded_at',
]
label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column, le in label_encoders.items():
    df[column] = le.fit_transform(df[column].astype(str))

# Binary target: 0 when the status is 'closed', 1 for every other status value
df['status'] = df['status'].ne('closed').astype('int64')

# 'funding_total_usd' uses '-' as a placeholder; treat it as 0 and cast the column to float
funding = df['funding_total_usd'].replace('-', 0)
df['funding_total_usd'] = funding.astype(float)

# Target vector and feature matrix (every column except the target)
y = df['status']
X = df.drop(columns=['status'])

# Standardize the features. The scaler is fitted on all rows, i.e. before any
# train/test split performed further down.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

print("The transformed input-values are:\n\n", X)

def build_model(input_dim, focus='general', model_type='mlp'):
    """Build and compile a binary classifier of the requested architecture.

    Every architecture ends in a single sigmoid unit and is compiled with the
    Adam optimizer, binary cross-entropy loss and the accuracy metric. The L2
    strength is read from the module-level ``l2_penalty``.

    Args:
        input_dim: Number of input features per sample.
        focus: Only used when ``model_type == 'mlp'``. Selects the widths of
            the two hidden layers ('financial', 'market', 'team',
            'innovation'); any other value uses the general-purpose widths.
        model_type: 'mlp', 'lstm' or 'cnn'.

    Returns:
        The compiled Keras ``Sequential`` model.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
            (Previously a compiled model without any layers was returned.)
    """
    supported_types = ('mlp', 'lstm', 'cnn')
    if model_type not in supported_types:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported_types}"
        )

    # (first, second) hidden-layer widths for each MLP focus
    hidden_units_by_focus = {
        'financial': (30, 15),
        'market': (20, 15),
        'team': (14, 9),
        'innovation': (18, 11),
    }
    default_hidden_units = (17, 10)

    model = Sequential()
    # An explicit Input object replaces the `input_dim` / `input_shape` layer
    # arguments, which newer Keras versions warn about.
    model.add(tf.keras.Input(shape=(input_dim,)))

    if model_type == 'mlp':
        for units in hidden_units_by_focus.get(focus, default_hidden_units):
            model.add(Dense(units, activation='relu', kernel_regularizer=regularizers.l2(l2_penalty)))
            model.add(Dropout(0.5))

    elif model_type == 'lstm':
        # Present the flat feature vector as a sequence of `input_dim` steps with one channel
        model.add(tf.keras.layers.Reshape((input_dim, 1)))
        model.add(LSTM(15, activation='relu'))
        model.add(Dropout(0.5))

    else:  # 'cnn'
        # Same (input_dim, 1) layout so Conv1D can slide over the features
        model.add(tf.keras.layers.Reshape((input_dim, 1)))
        model.add(Conv1D(13, 2, activation='relu', kernel_regularizer=regularizers.l2(l2_penalty)))
        model.add(Dropout(0.5))
        model.add(Flatten())

    # Shared output head: probability of the positive class
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

def build_random_forest(n_estimators=20, random_state=42):
    """Create the unfitted random-forest baseline classifier.

    Args:
        n_estimators: Number of trees in the forest. Defaults to 20.
        random_state: Seed handed to the forest. Defaults to 42.

    Returns:
        An unfitted ``RandomForestClassifier``.
    """
    return RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

# Each distinct (label-encoded) 'category_list' value is treated as one market niche
market_niches = df['category_list'].unique()

# niche -> {model name -> model}
niche_models = {}
# niche -> (X_test, y_test) held out when that niche's models were trained, so the
# evaluation below scores exactly those rows instead of regenerating the split
niche_test_sets = {}
for niche in market_niches:
    niche_indices = df['category_list'] == niche
    X_niche = X[niche_indices]
    y_niche = y[niche_indices]

    # Check if there are enough samples to split
    if len(X_niche) < 10:
        print(f"Skipping niche {niche} due to insufficient data.")
        continue

    # Split the data for training and testing
    X_train, X_test, y_train, y_test = train_test_split(X_niche, y_niche, test_size=0.3, random_state=42)
    niche_test_sets[niche] = (X_test, y_test)

    # Initialize models for each focus and type
    niche_models[niche] = {
        'mlp_financial': build_model(X_train.shape[1], focus='financial', model_type='mlp'),
        'mlp_market': build_model(X_train.shape[1], focus='market', model_type='mlp'),
        'lstm': build_model(X_train.shape[1], model_type='lstm'),
        'cnn': build_model(X_train.shape[1], model_type='cnn'),
        'random_forest': build_random_forest()
    }

    # Train each model
    for model_name, model in niche_models[niche].items():
        print(f"Training model {model_name} for niche {niche}...")
        if model_name == 'random_forest':
            model.fit(X_train, y_train)
        else:
            checkpoint_path = f"model_checkpoints/{niche}_{model_name}_checkpoint.weights.h5"
            os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
            checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True, save_weights_only=True)
            early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
            history = model.fit(X_train, y_train, epochs=epoch, batch_size=30, validation_split=0.2, verbose=1, callbacks=[checkpoint, early_stopping])
            # These are the metrics of the last epoch that ran; with restore_best_weights=True
            # the model itself is rolled back to its best epoch.
            print(f"Model {model_name} training complete for niche {niche}.")
            print(f"  Final training loss: {history.history['loss'][-1]}")
            print(f"  Final validation loss: {history.history['val_loss'][-1]}")
            print(f"  Final training accuracy: {history.history['accuracy'][-1]}")
            print(f"  Final validation accuracy: {history.history['val_accuracy'][-1]}")

# Evaluate models on the held-out test set of each niche
niche_performance = {}

for niche, models in niche_models.items():
    X_test, y_test = niche_test_sets[niche]

    niche_performance[niche] = {}

    for model_name, model in models.items():
        if model_name == 'random_forest':
            proba = model.predict_proba(X_test)
            if proba.shape[1] == 2:
                y_pred = proba[:, 1]
            else:
                # The forest saw a single class during fit, so predict_proba has one
                # column; the positive-class probability is then constant (1 or 0).
                y_pred = np.full(len(X_test), float(model.classes_[0] == 1))
            # labels=[0, 1] keeps log_loss defined when y_test contains only one class
            eval_result = [log_loss(y_test, y_pred, labels=[0, 1]), accuracy_score(y_test, model.predict(X_test))]
        else:
            # Returns [loss, accuracy] as configured in build_model's compile() call
            eval_result = model.evaluate(X_test, y_test, verbose=0)

        niche_performance[niche][model_name] = eval_result

# Print performance for each niche
for niche, performance in niche_performance.items():
    print(f"Performance for niche {niche}:")
    for model_name, result in performance.items():
        print(f"  {model_name}: Loss = {result[0]}, Accuracy = {result[1]}")

# Pick, per niche, the model with the lowest test loss.
# NOTE(review): the winner is chosen on the same test rows that are reported above,
# so the reported score of the selected model is optimistic.
# NOTE(review): the Keras loss presumably includes the L2 penalty terms while the
# random-forest value is a plain log-loss - confirm the two are comparable.
best_models = {}

for niche, performance in niche_performance.items():
    best_model_name = min(performance, key=lambda k: performance[k][0])  # lower loss is better
    best_models[niche] = best_model_name

    print(f"Best performing model for niche {niche}: {best_model_name} with performance: {performance[best_model_name]}")

# Save the best models (joblib pickle for the forest, HDF5 for Keras models)
for niche, best_model_name in best_models.items():
    best_model = niche_models[niche][best_model_name]
    if best_model_name == 'random_forest':
        joblib.dump(best_model, f'best_model_{niche}.pkl')
    else:
        best_model.save(f'best_model_{niche}.h5')


The transformed input-values are:

 [[ 0.08915953 -0.18514222 -1.91104785 ... -0.69005976 -1.03768249
  -1.63025802]
 [ 1.41067941 -0.20808567  0.63247597 ...  0.05830927  1.17438633
   0.75892076]
 [-0.57160041 -0.19933445 -2.02163584 ... -1.30314011 -0.5881773
  -1.07881043]
 ...
 [-0.57160041 -0.17726573 -2.39026248 ... -0.69005976  0.60707189
   0.11642018]
 [-1.23236035 -0.19523734 -1.35810788 ...  1.7348614   1.04971441
   1.02310263]
 [-1.56274032 -0.20677441 -1.24751989 ...  1.03101776  0.14841646
   0.5870743 ]]
Training model mlp_financial for niche 5...
Epoch 1/1000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6516 - loss: 0.6898 - val_accuracy: 0.8498 - val_loss: 0.5970
Epoch 2/1000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7710 - loss: 0.6158 - val_accuracy: 0.8541 - val_loss: 0.5312
Epoch 3/1000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7965 - loss: 0.5630 - val_accuracy: 0.8541 - val_loss: 0.4878
Epoch 4/1000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8156 - loss: 0.5292 - val_accuracy: 0.8541 - val_loss: 0.4538
Epoch 5/1000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8148 - loss: 0.5053 - val_accuracy: 0.8541 - val_loss: 0.4328
Epoch 6/1000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8180 - loss: 0.5198 - val_accuracy: 0.8541 - val_loss: 0.4199
Epoch 7/1000
[1m31/31[0m [32m━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7974 - loss: 0.5584 - val_accuracy: 0.9266 - val_loss: 0.3487
Epoch 2/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9126 - loss: 0.3956 - val_accuracy: 0.9266 - val_loss: 0.2881
Epoch 3/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9231 - loss: 0.3149 - val_accuracy: 0.9266 - val_loss: 0.2766
Epoch 4/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9192 - loss: 0.3592 - val_accuracy: 0.9266 - val_loss: 0.2751
Epoch 5/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9312 - loss: 0.3084 - val_accuracy: 0.9266 - val_loss: 0.2757
Epoch 6/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9374 - loss: 0.2884 - val_accuracy: 0.9266 - val_loss: 0.2733
Epoch 7/1000
[1m59/59[0m [32m━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.6524 - loss: 0.6961 - val_accuracy: 0.9807 - val_loss: 0.3661
Epoch 2/1000
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9015 - loss: 0.4256 - val_accuracy: 0.9807 - val_loss: 0.2184
Epoch 3/1000
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9428 - loss: 0.3335 - val_accuracy: 0.9807 - val_loss: 0.1617
Epoch 4/1000
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9505 - loss: 0.3316 - val_accuracy: 0.9807 - val_loss: 0.1439
Epoch 5/1000
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9579 - loss: 0.2728 - val_accuracy: 0.9807 - val_loss: 0.1349
Epoch 6/1000
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9587 - loss: 0.2553 - val_accuracy: 0.9807 - val_loss: 0.1323
Epoch 7/1000
[1m56/56[0m [32m━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.4672 - loss: 0.8561 - val_accuracy: 0.8579 - val_loss: 0.5471
Epoch 2/1000
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7053 - loss: 0.6134 - val_accuracy: 0.9684 - val_loss: 0.3968
Epoch 3/1000
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8292 - loss: 0.4597 - val_accuracy: 0.9684 - val_loss: 0.3072
Epoch 4/1000
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8977 - loss: 0.4046 - val_accuracy: 0.9684 - val_loss: 0.2580
Epoch 5/1000
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9286 - loss: 0.3304 - val_accuracy: 0.9684 - val_loss: 0.2285
Epoch 6/1000
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9392 - loss: 0.3265 - val_accuracy: 0.9684 - val_loss: 0.2134
Epoch 7/1000
[1m26/26[0m [32m━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6234 - loss: 0.6752 - val_accuracy: 0.8920 - val_loss: 0.5513
Epoch 2/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8180 - loss: 0.5256 - val_accuracy: 0.8977 - val_loss: 0.4602
Epoch 3/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8536 - loss: 0.4682 - val_accuracy: 0.8977 - val_loss: 0.3948
Epoch 4/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8790 - loss: 0.4213 - val_accuracy: 0.8977 - val_loss: 0.3580
Epoch 5/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9134 - loss: 0.3677 - val_accuracy: 0.8977 - val_loss: 0.3462
Epoch 6/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9072 - loss: 0.3612 - val_accuracy: 0.8977 - val_loss: 0.3424
Epoch 7/1000
[1m24/24[0m [32m━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5466 - loss: 0.7707 - val_accuracy: 0.8944 - val_loss: 0.5273
Epoch 2/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8017 - loss: 0.5333 - val_accuracy: 0.9120 - val_loss: 0.3921
Epoch 3/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8782 - loss: 0.4328 - val_accuracy: 0.9120 - val_loss: 0.3351
Epoch 4/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8941 - loss: 0.4044 - val_accuracy: 0.9120 - val_loss: 0.3152
Epoch 5/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8936 - loss: 0.3719 - val_accuracy: 0.9120 - val_loss: 0.3061
Epoch 6/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9057 - loss: 0.3774 - val_accuracy: 0.9120 - val_loss: 0.3027
Epoch 7/1000
[1m38/38[0m [32m━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.7372 - loss: 0.5615 - val_accuracy: 0.9296 - val_loss: 0.4331
Epoch 2/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8584 - loss: 0.4499 - val_accuracy: 0.9296 - val_loss: 0.3664
Epoch 3/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9106 - loss: 0.3659 - val_accuracy: 0.9296 - val_loss: 0.3367
Epoch 4/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9088 - loss: 0.3512 - val_accuracy: 0.9296 - val_loss: 0.3258
Epoch 5/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9249 - loss: 0.3652 - val_accuracy: 0.9296 - val_loss: 0.3230
Epoch 6/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9322 - loss: 0.3301 - val_accuracy: 0.9296 - val_loss: 0.3219
Epoch 7/1000
[1m19/19[0m [32m━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6759 - loss: 0.6557 - val_accuracy: 0.9055 - val_loss: 0.3938
Epoch 2/1000
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8218 - loss: 0.5089 - val_accuracy: 0.9055 - val_loss: 0.3499
Epoch 3/1000
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8740 - loss: 0.4235 - val_accuracy: 0.9055 - val_loss: 0.3332
Epoch 4/1000
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8842 - loss: 0.3740 - val_accuracy: 0.9055 - val_loss: 0.3279
Epoch 5/1000
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9004 - loss: 0.3855 - val_accuracy: 0.9055 - val_loss: 0.3252
Epoch 6/1000
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9067 - loss: 0.3748 - val_accuracy: 0.9055 - val_loss: 0.3229
Epoch 7/1000
[1m37/37[0m [32m━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.5825 - loss: 0.7042 - val_accuracy: 0.9342 - val_loss: 0.5142
Epoch 2/1000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7948 - loss: 0.5174 - val_accuracy: 0.9539 - val_loss: 0.4138
Epoch 3/1000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8528 - loss: 0.4783 - val_accuracy: 0.9539 - val_loss: 0.3502
Epoch 4/1000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8808 - loss: 0.4269 - val_accuracy: 0.9539 - val_loss: 0.3064
Epoch 5/1000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9187 - loss: 0.3786 - val_accuracy: 0.9539 - val_loss: 0.2775
Epoch 6/1000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9257 - loss: 0.3459 - val_accuracy: 0.9539 - val_loss: 0.2540
Epoch 7/1000
[1m21/21[0m [32m━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.7847 - loss: 0.5890 - val_accuracy: 0.9375 - val_loss: 0.5092
Epoch 2/1000
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8832 - loss: 0.4969 - val_accuracy: 0.9375 - val_loss: 0.4343
Epoch 3/1000
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8863 - loss: 0.4306 - val_accuracy: 0.9375 - val_loss: 0.3756
Epoch 4/1000
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9049 - loss: 0.3924 - val_accuracy: 0.9375 - val_loss: 0.3309
Epoch 5/1000
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9303 - loss: 0.3675 - val_accuracy: 0.9375 - val_loss: 0.3014
Epoch 6/1000
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9329 - loss: 0.3474 - val_accuracy: 0.9375 - val_loss: 0.2835
Epoch 7/1000
[1m17/17[0m [32m━━━━━━━━



Performance for niche 5:
  mlp_financial: Loss = 0.39850395917892456, Accuracy = 0.8336673378944397
  mlp_market: Loss = 0.39497047662734985, Accuracy = 0.8336673378944397
  lstm: Loss = 0.38005658984184265, Accuracy = 0.8336673378944397
  cnn: Loss = 0.3774825930595398, Accuracy = 0.8336673378944397
  random_forest: Loss = 0.9972323522438671, Accuracy = 0.811623246492986
Performance for niche 9:
  mlp_financial: Loss = 0.22482499480247498, Accuracy = 0.936898410320282
  mlp_market: Loss = 0.22250530123710632, Accuracy = 0.936898410320282
  lstm: Loss = 0.21920959651470184, Accuracy = 0.936898410320282
  cnn: Loss = 0.2192772626876831, Accuracy = 0.936898410320282
  random_forest: Loss = 0.5052295267083335, Accuracy = 0.9251336898395722
Performance for niche 3:
  mlp_financial: Loss = 0.19625836610794067, Accuracy = 0.9527027010917664
  mlp_market: Loss = 0.19972948729991913, Accuracy = 0.9527027010917664
  lstm: Loss = 0.19154340028762817, Accuracy = 0.9527027010917664
  cnn: Loss = 0

In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import os

# Read the prepared dataset again for the general (all-niche) model
df = pd.read_csv('./dataset_transformed.csv')

# Discard every row that is missing one of the critical columns
critical_columns = ['status', 'funding_total_usd', 'country_code']
df.dropna(subset=critical_columns, inplace=True)

# Integer-encode the categorical columns; the fitted encoder of each column is kept
categorical_columns = [
    'category_list', 'country_code', 'state_code', 'region',
    'city', 'first_funding_at', 'last_funding_at', 'founded_at',
]
label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column, le in label_encoders.items():
    df[column] = le.fit_transform(df[column].astype(str))

# Binary target: 0 when the status is 'closed', 1 for every other status value
df['status'] = df['status'].ne('closed').astype('int64')

# 'funding_total_usd' uses '-' as a placeholder; treat it as 0 and cast the column to float
funding = df['funding_total_usd'].replace('-', 0)
df['funding_total_usd'] = funding.astype(float)

# Target vector and feature matrix (every column except the target)
y = df['status']
X = df.drop(columns=['status'])

# Standardize the features; the scaler is fitted on all rows
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# Persist the fitted preprocessing objects for the evaluation cell.
# `joblib` is not imported in this cell; it comes from the import cell at the top of the notebook.
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')

# Define the MLP model
def build_mlp_model(input_dim):
    """Build and compile the general-purpose MLP binary classifier.

    Architecture: Dense(20, relu) -> Dropout(0.5) -> Dense(25, relu) ->
    Dropout(0.5) -> Dense(1, sigmoid). Both hidden layers carry an L2 kernel
    penalty whose strength is read from the module-level ``l2_penalty``.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The Keras ``Sequential`` model, compiled with Adam, binary
        cross-entropy loss and the accuracy metric.
    """
    # Reach the regularizer through `tf`, which this cell imports itself
    l2 = tf.keras.regularizers.l2

    model = Sequential()
    # An explicit Input object replaces the `input_dim` layer argument,
    # which newer Keras versions warn about.
    model.add(tf.keras.Input(shape=(input_dim,)))
    model.add(Dense(20, activation='relu', kernel_regularizer=l2(l2_penalty)))
    model.add(Dropout(0.5))
    model.add(Dense(25, activation='relu', kernel_regularizer=l2(l2_penalty)))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Hold out 30% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# The network's input width equals the number of feature columns
mlp_model = build_mlp_model(X_train.shape[1])

# Callbacks: keep the best weights (by validation loss) on disk and stop once
# validation loss has not improved for 5 epochs
checkpoint_path = "./mlp_model_checkpoint.weights.h5"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
checkpoint = ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Fit; 20% of the training rows are set aside by Keras for validation.
# `epoch` is the global epoch limit defined in the earlier niche-model cell.
history = mlp_model.fit(
    X_train,
    y_train,
    epochs=epoch,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    callbacks=[checkpoint, early_stopping],
)

# Score on the held-out test rows; evaluate() yields [loss, accuracy]
test_loss, test_accuracy = mlp_model.evaluate(X_test, y_test, verbose=0)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

# Persist the trained model in HDF5 format
mlp_model.save('mlp_model.h5')

print("MLP model training complete and saved.")


Epoch 1/1000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8724 - loss: 0.4344 - val_accuracy: 0.9189 - val_loss: 0.2959
Epoch 2/1000
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9256 - loss: 0.3079 - val_accuracy: 0.9189 - val_loss: 0.2849
Epoch 3/1000
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9256 - loss: 0.2949 - val_accuracy: 0.9189 - val_loss: 0.2800
Epoch 4/1000
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9258 - loss: 0.2854 - val_accuracy: 0.9189 - val_loss: 0.2765
Epoch 5/1000
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9278 - loss: 0.2746 - val_accuracy: 0.9189 - val_loss: 0.2730
Epoch 6/1000
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9289 - loss: 0.2668 - val_accuracy: 0.9189 - val_loss: 0.2706
Epoch 7/1000
[1m304/304[0m 



Test Loss: 0.23735898733139038
Test Accuracy: 0.9277201294898987
MLP model training complete and saved.


In [21]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import os
import joblib


# Reload the raw (un-encoded) dataset; the evaluation cell applies the saved
# encoders and scaler itself.
df = pd.read_csv("./dataset_transformed.csv")


# Disabled category filter - with this line commented out, ALL categories are used below.
# df_software = df[df['category_list'] == label_encoders['category_list'].transform(['Apps'])[0]]

# Randomly split the whole dataset; test_size=0.7 puts 70% of the rows into the
# "test" part. Despite the *_software names, no "Software" filtering happens here.
# NOTE(review): this split (random_state=56, on the raw frame) is independent of the
# splits used for training (random_state=42, after dropna), so these test rows
# overlap with rows the models were trained on - confirm this is intended.
df_train_software, df_test_software = train_test_split(df, test_size=0.7, random_state=56)

# Save the test part to a CSV file (without the index column)
test_software_file_path = './test_dataset.csv'
df_test_software.to_csv(test_software_file_path, index=False)

In [23]:
print("Evaluating :")
# Load the general MLP and one saved niche-specific model
model_1 = tf.keras.models.load_model('./mlp_model.h5')
model_2 = tf.keras.models.load_model('./best_model_1.h5')

# Load the fitted label encoders and scaler written by the training cell.
# These are pickles: only load files produced by this notebook.
label_encoders = joblib.load('./label_encoders.pkl')
scaler = joblib.load('./scaler.pkl')

# Load test dataset
df_test = pd.read_csv('./test_dataset.csv')

# Drop rows with missing values in critical columns
df_test.dropna(subset=['status', 'funding_total_usd', 'country_code'], inplace=True)

# Encode categorical columns using the existing label encoders.
# A LabelEncoder's code for a class is its position in `classes_`, so the lookup
# table is built once per column instead of calling `le.transform` (and scanning
# `le.classes_`) for every row. Values never seen during fitting become -1.
for column in ['category_list', 'country_code', 'state_code', 'region', 'city', 'first_funding_at', 'last_funding_at', 'founded_at']:
    le = label_encoders[column]
    class_to_code = {cls: code for code, cls in enumerate(le.classes_)}
    df_test[column] = df_test[column].astype(str)
    df_test[column] = df_test[column].map(class_to_code).fillna(-1).astype('int64')

# Binary target: 0 when the status is 'closed', 1 for every other status value
df_test['status'] = df_test['status'].apply(lambda x: 0 if x == 'closed' else 1)

# Handle 'funding_total_usd' column (convert '-' to 0 and convert to float)
df_test['funding_total_usd'] = df_test['funding_total_usd'].replace('-', 0).astype(float)

# Features and target
X_test = df_test.drop(columns=['status'])
y_test = df_test['status']

# Standardize the features with the scaler fitted at training time
X_test = scaler.transform(X_test)

# Evaluate model_1
loss_1, accuracy_1 = model_1.evaluate(X_test, y_test, verbose=0)
print(f"General - Test Loss: {loss_1}, Test Accuracy: {accuracy_1}")

# Evaluate model_2.
# NOTE(review): model_2 was saved for a single niche but is scored here on rows
# from every category - confirm that is the intended comparison.
loss_2, accuracy_2 = model_2.evaluate(X_test, y_test, verbose=0)
print(f"Specialized - Test Loss: {loss_2}, Test Accuracy: {accuracy_2}")




Evaluating :
General - Test Loss: 0.24369636178016663, Test Accuracy: 0.9240339398384094
Specialized - Test Loss: 0.26028895378112793, Test Accuracy: 0.9240339398384094


In [18]:
# # Visualizing the model
# import visualkeras
# model_1 = tf.keras.models.load_model('./mlp_model.h5')
# model_2 = tf.keras.models.load_model('./best_model_0.h5')
# visualkeras.layered_view(model_2, type_ignore=[ Dropout])
# visualkeras.layered_view(model_2, to_file='model_specilized.png') # write to disk
# visualkeras.layered_view(model_2, to_file='model_specilized.png').show() # write and show

In [9]:

from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score


def print_classification_metrics(label, y_true, y_pred):
    """Print confusion matrix, classification report, precision, recall and F1.

    Args:
        label: Prefix for every printed heading (e.g. "Model 1").
        y_true: Ground-truth binary labels.
        y_pred: Predicted binary labels.

    ``zero_division=0`` reports an undefined metric (e.g. precision of a class
    that is never predicted) as 0 without emitting a warning.
    """
    print(f"{label} Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print(f"{label} Classification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))
    print(f"{label} Precision:", precision_score(y_true, y_pred, zero_division=0))
    print(f"{label} Recall:", recall_score(y_true, y_pred, zero_division=0))
    print(f"{label} F1 Score:", f1_score(y_true, y_pred, zero_division=0))


# This cell relies on model_1, model_2, X_test and y_test created by the
# evaluation cell that loads the saved models.

# Evaluate model_1
loss_1, accuracy_1 = model_1.evaluate(X_test, y_test, verbose=0)
print(f"General - Test Loss: {loss_1}, Test Accuracy: {accuracy_1}")

# Evaluate model_2
loss_2, accuracy_2 = model_2.evaluate(X_test, y_test, verbose=0)
print(f"Specialized - Test Loss: {loss_2}, Test Accuracy: {accuracy_2}")

# Turn the sigmoid outputs into hard 0/1 predictions with a 0.5 threshold
y_pred_1 = (model_1.predict(X_test) > 0.5).astype("int32")
y_pred_2 = (model_2.predict(X_test) > 0.5).astype("int32")

# Same report for both models
print_classification_metrics("Model 1", y_test, y_pred_1)
print_classification_metrics("Model 2", y_test, y_pred_2)

General - Test Loss: 0.248529314994812, Test Accuracy: 0.9225298166275024
Specialized - Test Loss: 0.5699695944786072, Test Accuracy: 0.9225298166275024
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Model 1 Confusion Matrix:
[[   0  403]
 [   0 4799]]
Model 1 Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       403
           1       0.92      1.00      0.96      4799

    accuracy                           0.92      5202
   macro avg       0.46      0.50      0.48      5202
weighted avg       0.85      0.92      0.89      5202

Model 1 Precision: 0.9225297962322184
Model 1 Recall: 1.0
Model 1 F1 Score: 0.9597040295970403
Model 2 Confusion Matrix:
[[   0  403]
 [   0 4799]]
Model 2 Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       40

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
