In [14]:
# Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib
from sklearn.utils import resample

In [15]:
# Load the dataset
df = pd.read_csv('supercleaned_dataset.csv')

In [16]:
# Step 1: Preprocessing
# Handle categorical features (e.g., 'country_code', 'city', 'market')
label_encoder = LabelEncoder()

categorical_cols = ['country_code', 'city', 'market']
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

In [17]:
# Handle class imbalance (90% operating, 10% closed/acquired)
df_operating = df[df['status'] == 'operating']
df_other = df[df['status'] != 'operating']

df_operating_downsampled = resample(df_operating, 
                                    replace=False, 
                                    n_samples=len(df_other), 
                                    random_state=42)

df_balanced = pd.concat([df_operating_downsampled, df_other])

In [43]:
# Encode the target variable 'status'
df_balanced['status'] = label_encoder.fit_transform(df_balanced['status'])

# Define features (X) and target (y)
X = df_balanced.drop(columns=['status', 'name', 'founded_at', 'first_funding_at', 'last_funding_at', 'founded_month', 'founded_year', 'round_c', 'round_d', 'round_e', 'round_f', 'round_g', 'round_h','country_code'])
y = df_balanced['status']

In [44]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [69]:
# Step 2: Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Logistic Regression Model
log_reg = LogisticRegression(multi_class='ovr', solver='liblinear')

# Fit logistic regression to the data
log_reg.fit(X_train_scaled, y_train)

In [59]:
# Get feature importance based on coefficients
importance = np.abs(log_reg.coef_[0])

# Create a dataframe to show feature names with their importance
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importance
})

In [70]:
# Sort the features by their importance (highest first)
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Ensure 'funding_total_usd' is always included in the top features
if 'funding_total_usd' not in feature_importance_df['Feature'].values:
    funding_total_importance = feature_importance_df[feature_importance_df['Feature'] == 'funding_total_usd']['Importance'].values[0]
    feature_importance_df = feature_importance_df.append({
        'Feature': 'funding_total_usd', 
        'Importance': funding_total_importance
    }, ignore_index=True)

# Select top 9 most important features excluding 'funding_total_usd'
top_14_features = feature_importance_df[feature_importance_df['Feature'] != 'funding_total_usd'].head(14)
top_15_features = pd.concat([top_14_features, feature_importance_df[feature_importance_df['Feature'] == 'funding_total_usd']])

print("Top 15 most relevant features (including 'funding_total_usd'):\n", top_15_features)


Top 15 most relevant features (including 'funding_total_usd'):
                  Feature  Importance
5                venture    0.453442
0                 market    0.125524
17               round_a    0.125261
16  product_crowdfunding    0.099873
13       post_ipo_equity    0.092927
18               round_b    0.080203
15      secondary_market    0.059609
14         post_ipo_debt    0.053819
7            undisclosed    0.051534
6    equity_crowdfunding    0.048931
11                 grant    0.045236
12        private_equity    0.041053
8       convertible_note    0.031690
3         funding_rounds    0.030654
1      funding_total_usd    0.001453


In [71]:
# Filter X_train and X_test to use only the top 15 most relevant features
top_15_feature_names = top_15_features['Feature'].values
X_train_top_15 = X_train_scaled[:, [X.columns.get_loc(col) for col in top_15_feature_names]]
X_test_top_15 = X_test_scaled[:, [X.columns.get_loc(col) for col in top_15_feature_names]]

In [72]:
# Step 4: Limited Hyperparameter Tuning
# Only tune regularization type and strength
param_dist = {
    'penalty': ['l1', 'l2'],   # L1 and L2 regularization
    'C': np.logspace(-3, 3, 7)  # Regularization strength between 0.001 and 100
}

random_search = RandomizedSearchCV(log_reg, param_distributions=param_dist, n_iter=10, scoring='accuracy', cv=5, verbose=1, random_state=42)
random_search.fit(X_train_top_15, y_train)

# Best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




Best Hyperparameters: {'penalty': 'l2', 'C': 10.0}


In [73]:
from sklearn.model_selection import cross_val_score


# Step 5: Cross-validation score with top 10 features
cv_scores = cross_val_score(random_search.best_estimator_, X_train_top_15, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation Accuracy: {np.mean(cv_scores):.4f}")

Cross-validation Accuracy: 0.5279


In [74]:
# Step 6: Evaluate on test set with top 15 features
y_pred = random_search.best_estimator_.predict(X_test_top_15)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")

Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.22      0.31       584
           1       0.00      0.00      0.00       481
           2       0.53      0.93      0.67      1076

    accuracy                           0.53      2141
   macro avg       0.35      0.38      0.33      2141
weighted avg       0.41      0.53      0.42      2141

Test Accuracy: 0.5273
Test Precision: 0.4087
Test Recall: 0.5273
Test F1 Score: 0.4227


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [76]:
df.columns

Index(['name', 'market', 'funding_total_usd', 'status', 'country_code', 'city',
       'funding_rounds', 'founded_at', 'founded_month', 'founded_year',
       'first_funding_at', 'last_funding_at', 'seed', 'venture',
       'equity_crowdfunding', 'undisclosed', 'convertible_note',
       'debt_financing', 'angel', 'grant', 'private_equity', 'post_ipo_equity',
       'post_ipo_debt', 'secondary_market', 'product_crowdfunding', 'round_a',
       'round_b', 'round_c', 'round_d', 'round_e', 'round_f', 'round_g',
       'round_h'],
      dtype='object')