In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib

# Read the source CSV into a DataFrame. The path is relative, so it is resolved
# against the notebook's current working directory.
DATASET_PATH = 'supercleaned_dataset_ipo.csv'
df = pd.read_csv(DATASET_PATH)

In [2]:
# Columns retained for modelling; 'ipo_status' is the prediction target and
# every other column is used as a feature.
relevant_columns = [
    'market',
    'funding_total_usd',
    'funding_rounds',
    'round_a',
    'round_b',
    'round_c',
    'round_d',
    'round_e',
    'round_f',
    'round_g',
    'round_h',
    'venture',
    'private_equity',
    'post_ipo_equity',
    'post_ipo_debt',
    'ipo_status'
]

# NOTE(review): 'post_ipo_equity' and 'post_ipo_debt' look like they can only be
# known once a company has already gone public. If so, they leak the target
# into the features, which would explain the near-perfect scores this notebook
# reports. TODO: confirm against the dataset description and consider dropping them.

# Keep only the relevant columns. The explicit .copy() makes the result an
# independent DataFrame, so the column assignments done during preprocessing
# (df[col] = ...) do not raise pandas' SettingWithCopyWarning.
df = df[relevant_columns].copy()

In [3]:
# Preprocessing: replace each categorical column with integer codes.
# Every fitted encoder is stored under its column name so the codes can be
# mapped back to the original labels later on.
categorical_cols = ['market', 'ipo_status']
label_encoders = {}

for col in categorical_cols:
    # Fit on the full column first, then overwrite the column with its codes
    le = LabelEncoder().fit(df[col])
    label_encoders[col] = le
    df[col] = le.transform(df[col])

# Target is the encoded 'ipo_status'; the features are all remaining columns
y = df['ipo_status']
X = df.drop('ipo_status', axis=1)



In [4]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Base estimator; the fixed seed keeps tie-breaking between splits reproducible
dt_model = DecisionTreeClassifier(random_state=42)

# Search space: 2 criteria x 5 depths x 3 split sizes x 3 leaf sizes = 90 candidates
param_grid = dict(
    criterion=['gini', 'entropy'],      # impurity measure used to pick splits
    max_depth=[None, 5, 10, 15, 20],    # None lets the tree grow without a depth cap
    min_samples_split=[2, 5, 10],       # samples a node needs before it may be split
    min_samples_leaf=[1, 2, 4],         # samples every leaf must keep
)

# Exhaustive 5-fold search scored by macro-averaged F1, using all CPU cores
grid_search = GridSearchCV(
    dt_model,
    param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration
print(f"Best Parameters: {grid_search.best_params_}")

Best Parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [5]:
# Evaluate the tuned tree.
# GridSearchCV refits the best configuration on all of X_train by default
# (refit=True), so best_estimator_ has already seen every training row and the
# training accuracy below is an optimistic figure.
best_model = grid_search.best_estimator_

# 5-fold cross-validated macro-F1 on the training set. cross_val_score fits
# clones of the estimator, so best_model itself is left untouched.
validation_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='f1_macro')
validation_score_mean = validation_scores.mean()

# Predict on training set and test set
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Evaluation Metrics for training set
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.4f}")

# Evaluation Metrics for test set
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Precision, Recall, F1 Score for test set (support-weighted averages, so the
# larger classes dominate these three numbers)
report = classification_report(y_test, y_test_pred, output_dict=True)
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1 = report['weighted avg']['f1-score']

# Macro-averaged F1 on the test set. This is the same averaging used for model
# selection and for the cross-validated score, so it is the test figure that is
# directly comparable with validation_score_mean.
test_f1_macro = report['macro avg']['f1-score']

print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Test F1 Score (macro): {test_f1_macro:.4f}")

# Print Validation Score
print(f"Validation Score (Cross-Validated F1 Score): {validation_score_mean:.4f}")

Training Accuracy: 1.0000
Test Accuracy: 0.9999
Test Precision: 0.9999
Test Recall: 0.9999
Test F1 Score: 0.9999
Validation Score (Cross-Validated F1 Score): 0.9983
