In [3]:
# pip install pandas numpy scikit-learn tensorflow joblib

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Conv1D, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import joblib
import os
from tensorflow.keras import regularizers


In [4]:

# Load dataset
df = pd.read_csv('./dataset_transformed.csv')
epoch = 1000
l2_penalty = 0.001

# Drop rows with missing values in critical columns
df.dropna(subset=['status', 'funding_total_usd', 'country_code'], inplace=True)

# Encode categorical columns
label_encoders = {}
for column in ['category_list', 'country_code', 'state_code', 'region', 'city', 'first_funding_at', 'last_funding_at', 'founded_at']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column].astype(str))
    label_encoders[column] = le

# Convert 'status' to binary classification (1 for operating, 0 for closed)
df['status'] = df['status'].apply(lambda x: 0 if x == 'closed' else 1)

# Handle 'funding_total_usd' column (convert '-' to 0 and convert to float)
df['funding_total_usd'] = df['funding_total_usd'].replace('-', 0).astype(float)

# Features and target
X = df.drop(columns=['status'])
y = df['status']

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

print("The transformed input-values are:\n\n", X)

def build_model(input_dim, focus='general', model_type='mlp'):
    model = Sequential()

    if model_type == 'mlp':
        if focus == 'financial':
            model.add(Dense(30, activation='relu', input_dim=input_dim, kernel_regularizer=regularizers.l2(l2_penalty)))
            model.add(Dropout(0.5))
            model.add(Dense(15, activation='relu', kernel_regularizer=regularizers.l2(l2_penalty)))
            model.add(Dropout(0.5))
        elif focus == 'market':
            model.add(Dense(20, activation='relu', input_dim=input_dim, kernel_regularizer=regularizers.l2(l2_penalty)))
            model.add(Dropout(0.5))
            model.add(Dense(15, activation='relu', kernel_regularizer=regularizers.l2(l2_penalty)))
            model.add(Dropout(0.5))
        elif focus == 'team':
            model.add(Dense(14, activation='relu', input_dim=input_dim, kernel_regularizer=regularizers.l2(l2_penalty)))
            model.add(Dropout(0.5))
            model.add(Dense(9, activation='relu', kernel_regularizer=regularizers.l2(l2_penalty)))
            model.add(Dropout(0.5))
        elif focus == 'innovation':
            model.add(Dense(18, activation='relu', input_dim=input_dim, kernel_regularizer=regularizers.l2(l2_penalty)))
            model.add(Dropout(0.5))
            model.add(Dense(11, activation='relu', kernel_regularizer=regularizers.l2(l2_penalty)))
            model.add(Dropout(0.5))
        else:
            model.add(Dense(17, activation='relu', input_dim=input_dim, kernel_regularizer=regularizers.l2(l2_penalty)))
            model.add(Dropout(0.5))
            model.add(Dense(10, activation='relu', kernel_regularizer=regularizers.l2(l2_penalty)))
            model.add(Dropout(0.5))
        model.add(Dense(1, activation='sigmoid'))

    elif model_type == 'lstm':
        model.add(tf.keras.layers.Reshape((input_dim, 1), input_shape=(input_dim,)))
        model.add(LSTM(15, activation='relu', input_shape=(input_dim, 1)))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='sigmoid'))

    elif model_type == 'cnn':
        model.add(tf.keras.layers.Reshape((input_dim, 1), input_shape=(input_dim,)))
        model.add(Conv1D(13, 2, activation='relu', input_shape=(input_dim, 1), kernel_regularizer=regularizers.l2(l2_penalty)))
        model.add(Dropout(0.5))
        model.add(Flatten())
        model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

def build_random_forest():
    return RandomForestClassifier(n_estimators=20, random_state=42)

# Define different market niches (e.g., by category_list)
market_niches = df['category_list'].unique()

# Prepare a dictionary to store models for each market niche
niche_models = {}
for niche in market_niches:
    niche_indices = df['category_list'] == niche
    X_niche = X[niche_indices]
    y_niche = y[niche_indices]

    # Check if there are enough samples to split
    if len(X_niche) < 10:
        print(f"Skipping niche {niche} due to insufficient data.")
        continue

    # Split the data for training and testing
    X_train, X_test, y_train, y_test = train_test_split(X_niche, y_niche, test_size=0.3, random_state=42)

    # Initialize models for each focus and type
    niche_models[niche] = {
        'mlp_financial': build_model(X_train.shape[1], focus='financial', model_type='mlp'),
        'mlp_market': build_model(X_train.shape[1], focus='market', model_type='mlp'),
        'lstm': build_model(X_train.shape[1], model_type='lstm'),
        'cnn': build_model(X_train.shape[1], model_type='cnn'),
        'random_forest': build_random_forest()
    }

    # Train each model
    for model_name, model in niche_models[niche].items():
        print(f"Training model {model_name} for niche {niche}...")
        if model_name == 'random_forest':
            model.fit(X_train, y_train)
        else:
            checkpoint_path = f"model_checkpoints/{niche}_{model_name}_checkpoint.weights.h5"
            os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
            checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True, save_weights_only=True)
            early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
            history = model.fit(X_train, y_train, epochs=epoch, batch_size=30, validation_split=0.2, verbose=1, callbacks=[checkpoint, early_stopping])
            print(f"Model {model_name} training complete for niche {niche}.")
            print(f"  Final training loss: {history.history['loss'][-1]}")
            print(f"  Final validation loss: {history.history['val_loss'][-1]}")
            print(f"  Final training accuracy: {history.history['accuracy'][-1]}")
            print(f"  Final validation accuracy: {history.history['val_accuracy'][-1]}")

# Evaluate models on the test set for each niche
niche_performance = {}

for niche, models in niche_models.items():
    niche_indices = df['category_list'] == niche
    X_niche = X[niche_indices]
    y_niche = y[niche_indices]

    if len(X_niche) < 10:
        continue

    X_train, X_test, y_train, y_test = train_test_split(X_niche, y_niche, test_size=0.3, random_state=42)

    niche_performance[niche] = {}

    for model_name, model in models.items():
        if model_name == 'random_forest':
            y_pred = model.predict_proba(X_test)[:, 1]
            eval_result = [log_loss(y_test, y_pred), accuracy_score(y_test, model.predict(X_test))]
        else:
            eval_result = model.evaluate(X_test, y_test, verbose=0)

        niche_performance[niche][model_name] = eval_result

# Print performance for each niche
for niche, performance in niche_performance.items():
    print(f"Performance for niche {niche}:")
    for model_name, result in performance.items():
        print(f"  {model_name}: Loss = {result[0]}, Accuracy = {result[1]}")

# Compare specialized models with the baseline for each niche
best_models = {}

for niche, performance in niche_performance.items():
    best_model_name = min(performance, key=lambda k: performance[k][0])  # assuming lower loss is better
    best_models[niche] = best_model_name

    print(f"Best performing model for niche {niche}: {best_model_name} with performance: {performance[best_model_name]}")

# Save the best models
for niche, best_model_name in best_models.items():
    best_model = niche_models[niche][best_model_name]
    if best_model_name == 'random_forest':
        joblib.dump(best_model, f'best_model_{niche}.pkl')
    else:
        best_model.save(f'best_model_{niche}.h5')


The transformed input-values are:

 [[ 0.08915953 -0.18514222 -1.91104785 ... -0.69005976 -1.03768249
  -1.63025802]
 [ 1.41067941 -0.20808567  0.63247597 ...  0.05830927  1.17438633
   0.75892076]
 [-0.57160041 -0.19933445 -2.02163584 ... -1.30314011 -0.5881773
  -1.07881043]
 ...
 [-0.57160041 -0.17726573 -2.39026248 ... -0.69005976  0.60707189
   0.11642018]
 [-1.23236035 -0.19523734 -1.35810788 ...  1.7348614   1.04971441
   1.02310263]
 [-1.56274032 -0.20677441 -1.24751989 ...  1.03101776  0.14841646
   0.5870743 ]]


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training model mlp_financial for niche 5...
Epoch 1/1000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.5563 - loss: 0.7568 - val_accuracy: 0.8455 - val_loss: 0.5754
Epoch 2/1000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7177 - loss: 0.6197 - val_accuracy: 0.8498 - val_loss: 0.4921
Epoch 3/1000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7809 - loss: 0.5472 - val_accuracy: 0.8541 - val_loss: 0.4516
Epoch 4/1000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8080 - loss: 0.5051 - val_accuracy: 0.8541 - val_loss: 0.4329
Epoch 5/1000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7762 - loss: 0.5430 - val_accuracy: 0.8541 - val_loss: 0.4230
Epoch 6/1000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8172 - loss: 0.5225 - val_accuracy: 0.8541 - 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.4633 - loss: 0.8790 - val_accuracy: 0.9266 - val_loss: 0.4517
Epoch 2/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8537 - loss: 0.4796 - val_accuracy: 0.9266 - val_loss: 0.3273
Epoch 3/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9081 - loss: 0.3752 - val_accuracy: 0.9266 - val_loss: 0.2939
Epoch 4/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9220 - loss: 0.3631 - val_accuracy: 0.9266 - val_loss: 0.2821
Epoch 5/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9184 - loss: 0.3528 - val_accuracy: 0.9266 - val_loss: 0.2780
Epoch 6/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9265 - loss: 0.3315 - val_accuracy: 0.9266 - val_loss: 0.2745
Epoch 7/1000
[1m59/59[0m [32m━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6820 - loss: 0.6565 - val_accuracy: 0.9807 - val_loss: 0.3411
Epoch 2/1000
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8968 - loss: 0.3881 - val_accuracy: 0.9807 - val_loss: 0.1978
Epoch 3/1000
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9425 - loss: 0.2988 - val_accuracy: 0.9807 - val_loss: 0.1527
Epoch 4/1000
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9494 - loss: 0.2874 - val_accuracy: 0.9807 - val_loss: 0.1387
Epoch 5/1000
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9513 - loss: 0.2709 - val_accuracy: 0.9807 - val_loss: 0.1366
Epoch 6/1000
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9463 - loss: 0.2880 - val_accuracy: 0.9807 - val_loss: 0.1321
Epoch 7/1000
[1m56/56[0m [32m━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training model mlp_financial for niche 1...
Epoch 1/1000
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.7679 - loss: 0.5801 - val_accuracy: 0.9684 - val_loss: 0.4067
Epoch 2/1000
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9049 - loss: 0.4309 - val_accuracy: 0.9684 - val_loss: 0.2888
Epoch 3/1000
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9298 - loss: 0.3663 - val_accuracy: 0.9684 - val_loss: 0.2277
Epoch 4/1000
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9423 - loss: 0.2941 - val_accuracy: 0.9684 - val_loss: 0.1970
Epoch 5/1000
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9532 - loss: 0.2866 - val_accuracy: 0.9684 - val_loss: 0.1834
Epoch 6/1000
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9477 - loss: 0.3094 - val_accuracy: 0.9684 - 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training model mlp_financial for niche 8...
Epoch 1/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.6157 - loss: 0.7482 - val_accuracy: 0.8580 - val_loss: 0.5825
Epoch 2/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7571 - loss: 0.5819 - val_accuracy: 0.8977 - val_loss: 0.4720
Epoch 3/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8291 - loss: 0.4993 - val_accuracy: 0.8977 - val_loss: 0.4143
Epoch 4/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8499 - loss: 0.4426 - val_accuracy: 0.8977 - val_loss: 0.3833
Epoch 5/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9094 - loss: 0.3802 - val_accuracy: 0.8977 - val_loss: 0.3680
Epoch 6/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8828 - loss: 0.4066 - val_accuracy: 0.8977 - 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training model mlp_financial for niche 6...
Epoch 1/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.5337 - loss: 0.8000 - val_accuracy: 0.9085 - val_loss: 0.4897
Epoch 2/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8066 - loss: 0.5241 - val_accuracy: 0.9120 - val_loss: 0.3747
Epoch 3/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8963 - loss: 0.4211 - val_accuracy: 0.9120 - val_loss: 0.3370
Epoch 4/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8790 - loss: 0.4136 - val_accuracy: 0.9120 - val_loss: 0.3270
Epoch 5/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9023 - loss: 0.3724 - val_accuracy: 0.9120 - val_loss: 0.3241
Epoch 6/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9200 - loss: 0.3746 - val_accuracy: 0.9120 - 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training model mlp_financial for niche 2...
Epoch 1/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.2236 - loss: 1.3119 - val_accuracy: 0.2535 - val_loss: 0.8401
Epoch 2/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.3733 - loss: 1.0211 - val_accuracy: 0.8169 - val_loss: 0.6552
Epoch 3/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6585 - loss: 0.7166 - val_accuracy: 0.9296 - val_loss: 0.5492
Epoch 4/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7383 - loss: 0.6228 - val_accuracy: 0.9296 - val_loss: 0.4681
Epoch 5/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8021 - loss: 0.5125 - val_accuracy: 0.9296 - val_loss: 0.4114
Epoch 6/1000
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8630 - loss: 0.4491 - val_accuracy: 0.9296 - 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training model mlp_financial for niche 0...
Epoch 1/1000
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.5194 - loss: 0.8110 - val_accuracy: 0.9018 - val_loss: 0.5114
Epoch 2/1000
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7588 - loss: 0.5756 - val_accuracy: 0.9055 - val_loss: 0.3928
Epoch 3/1000
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8640 - loss: 0.4637 - val_accuracy: 0.9055 - val_loss: 0.3487
Epoch 4/1000
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8745 - loss: 0.4329 - val_accuracy: 0.9055 - val_loss: 0.3347
Epoch 5/1000
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8957 - loss: 0.4199 - val_accuracy: 0.9055 - val_loss: 0.3260
Epoch 6/1000
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8823 - loss: 0.4421 - val_accuracy: 0.9055 - 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8543 - loss: 0.4717 - val_accuracy: 0.9539 - val_loss: 0.3509
Epoch 2/1000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9024 - loss: 0.4022 - val_accuracy: 0.9539 - val_loss: 0.2910
Epoch 3/1000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9272 - loss: 0.3455 - val_accuracy: 0.9539 - val_loss: 0.2571
Epoch 4/1000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9143 - loss: 0.3441 - val_accuracy: 0.9539 - val_loss: 0.2416
Epoch 5/1000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9112 - loss: 0.3330 - val_accuracy: 0.9539 - val_loss: 0.2318
Epoch 6/1000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9473 - loss: 0.2820 - val_accuracy: 0.9539 - val_loss: 0.2249
Epoch 7/1000
[1m21/21[0m [32m━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.4230 - loss: 0.8937 - val_accuracy: 0.6328 - val_loss: 0.7063
Epoch 2/1000
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4834 - loss: 0.8271 - val_accuracy: 0.8750 - val_loss: 0.5959
Epoch 3/1000
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6403 - loss: 0.6667 - val_accuracy: 0.9375 - val_loss: 0.5181
Epoch 4/1000
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7514 - loss: 0.5892 - val_accuracy: 0.9375 - val_loss: 0.4624
Epoch 5/1000
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8083 - loss: 0.5349 - val_accuracy: 0.9375 - val_loss: 0.4173
Epoch 6/1000
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8552 - loss: 0.5013 - val_accuracy: 0.9375 - val_loss: 0.3787
Epoch 7/1000
[1m17/17[0m [32m━━━━━━━━



Performance for niche 5:
  mlp_financial: Loss = 0.3920454978942871, Accuracy = 0.8336673378944397
  mlp_market: Loss = 0.39028647541999817, Accuracy = 0.8336673378944397
  lstm: Loss = 0.38179877400398254, Accuracy = 0.8336673378944397
  cnn: Loss = 0.3797968626022339, Accuracy = 0.8336673378944397
  random_forest: Loss = 0.9972323522438671, Accuracy = 0.811623246492986
Performance for niche 9:
  mlp_financial: Loss = 0.2208152413368225, Accuracy = 0.936898410320282
  mlp_market: Loss = 0.22084541618824005, Accuracy = 0.936898410320282
  lstm: Loss = 0.21994584798812866, Accuracy = 0.936898410320282
  cnn: Loss = 0.2186799943447113, Accuracy = 0.936898410320282
  random_forest: Loss = 0.5052295267083335, Accuracy = 0.9251336898395722
Performance for niche 3:
  mlp_financial: Loss = 0.19348062574863434, Accuracy = 0.9527027010917664
  mlp_market: Loss = 0.19330738484859467, Accuracy = 0.9527027010917664
  lstm: Loss = 0.1951143741607666, Accuracy = 0.9527027010917664
  cnn: Loss = 0.18



In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import os

# Load dataset
df = pd.read_csv('./dataset_transformed.csv')

# Drop rows with missing values in critical columns
df.dropna(subset=['status', 'funding_total_usd', 'country_code'], inplace=True)

# Encode categorical columns
label_encoders = {}
for column in ['category_list', 'country_code', 'state_code', 'region', 'city', 'first_funding_at', 'last_funding_at', 'founded_at']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column].astype(str))
    label_encoders[column] = le

# Convert 'status' to binary classification (1 for operating, 0 for closed)
df['status'] = df['status'].apply(lambda x: 0 if x == 'closed' else 1)

# Handle 'funding_total_usd' column (convert '-' to 0 and convert to float)
df['funding_total_usd'] = df['funding_total_usd'].replace('-', 0).astype(float)

# Features and target
X = df.drop(columns=['status'])
y = df['status']

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')

# Define the MLP model
def build_mlp_model(input_dim):
    model = Sequential()
    model.add(Dense(20, activation='relu', input_dim=input_dim , kernel_regularizer=regularizers.l2(l2_penalty)))
    model.add(Dropout(0.5))
    model.add(Dense(25, activation='relu' , kernel_regularizer=regularizers.l2(l2_penalty)))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Split the data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Build and train the MLP model
mlp_model = build_mlp_model(X_train.shape[1])

# Define callbacks for early stopping and model checkpointing
checkpoint_path = "./mlp_model_checkpoint.weights.h5"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True, save_weights_only=True)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = mlp_model.fit(X_train, y_train, epochs=epoch, batch_size=32, validation_split=0.2, verbose=1, callbacks=[checkpoint, early_stopping])

# Evaluate the model on the test set
test_loss, test_accuracy = mlp_model.evaluate(X_test, y_test, verbose=0)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

# Save the final model
mlp_model.save('mlp_model.h5')

print("MLP model training complete and saved.")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/1000
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7354 - loss: 0.5642 - val_accuracy: 0.9189 - val_loss: 0.2941
Epoch 2/1000
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9197 - loss: 0.3242 - val_accuracy: 0.9189 - val_loss: 0.2862
Epoch 3/1000
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9221 - loss: 0.3092 - val_accuracy: 0.9189 - val_loss: 0.2795
Epoch 4/1000
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9247 - loss: 0.2865 - val_accuracy: 0.9189 - val_loss: 0.2748
Epoch 5/1000
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9216 - loss: 0.2862 - val_accuracy: 0.9189 - val_loss: 0.2709
Epoch 6/1000
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9259 - loss: 0.2684 - val_accuracy: 0.9189 - val_loss: 0.2687
Epoch 7/1000
[1



Test Loss: 0.23782697319984436
Test Accuracy: 0.9277201294898987
MLP model training complete and saved.


In [6]:
df_software = df[df['category_list'] == label_encoders['category_list'].transform(['Apps'])[0]]

# Split the "Software" category data into training and test sets
df_train_software, df_test_software = train_test_split(df_software, test_size=0.3, random_state=56)

# Save the test dataset to a CSV file
test_software_file_path = './test_software_dataset.csv'
df_test_software.to_csv(test_software_file_path, index=False)

In [7]:
print("Evaluating :")
# Load the models
model_1 = tf.keras.models.load_model('./mlp_model.h5')
model_2 = tf.keras.models.load_model('./best_model_1.h5')

# Load label encoders and scaler
label_encoders = joblib.load('./label_encoders.pkl')
scaler = joblib.load('./scaler.pkl')

# Load test dataset
df_test = pd.read_csv('./test_software_dataset.csv')

# Drop rows with missing values in critical columns
df_test.dropna(subset=['status', 'funding_total_usd', 'country_code'], inplace=True)

# Encode categorical columns using the existing label encoders
for column in ['category_list', 'country_code', 'state_code', 'region', 'city', 'first_funding_at', 'last_funding_at', 'founded_at']:
    le = label_encoders[column]
    df_test[column] = df_test[column].astype(str)
    df_test[column] = df_test[column].apply(lambda x: le.transform([x])[0] if x in le.classes_ else -1)

# Convert 'status' to binary classification (1 for operating, 0 for closed)
df_test['status'] = df_test['status'].apply(lambda x: 0 if x == 'closed' else 1)

# Handle 'funding_total_usd' column (convert '-' to 0 and convert to float)
df_test['funding_total_usd'] = df_test['funding_total_usd'].replace('-', 0).astype(float)

# Features and target
X_test = df_test.drop(columns=['status'])
y_test = df_test['status']

# Standardize the features
X_test = scaler.transform(X_test)

# Evaluate model_1
loss_1, accuracy_1 = model_1.evaluate(X_test, y_test, verbose=0)
print(f"General - Test Loss: {loss_1}, Test Accuracy: { 1 - loss_1}")

# Evaluate model_2
loss_2, accuracy_2 = model_2.evaluate(X_test, y_test, verbose=0)
print(f"Specialized - Test Loss: {loss_2}, Test Accuracy: {1 - loss_2}")




Evaluating :
General - Test Loss: 0.11649259924888611, Test Accuracy: 0.8835074007511139
Specialized - Test Loss: 0.02478424645960331, Test Accuracy: 0.9752157535403967
