In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import joblib

# Load the dataset
df = pd.read_csv('supercleaned_dataset_ipo.csv')
df.head()

Unnamed: 0,name,market,funding_total_usd,status,country_code,city,funding_rounds,founded_at,founded_month,founded_year,...,product_crowdfunding,round_a,round_b,round_c,round_d,round_e,round_f,round_g,round_h,ipo_status
0,waywire,News,1750000.0,acquired,USA,New York,1.0,2012-06-01,6,2012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pre_ipo
1,&TV Communications,Games,4000000.0,operating,USA,Los Angeles,2.0,2007-04-24,3,2007,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,likely_pre_ipo
2,'Rock' Your Paper,Publishing,40000.0,operating,EST,Tallinn,1.0,2012-10-26,10,2012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pre_ipo
3,(In)Touch Network,Electronics,1500000.0,operating,GBR,London,1.0,2011-04-01,4,2011,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pre_ipo
4,-R- Ranch and Mine,Tourism,60000.0,operating,USA,Fort Worth,2.0,2014-01-01,1,2014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pre_ipo


In [26]:
# Keep only relevant columns including funding_rounds
relevant_columns = [
    'market', 
    'funding_total_usd', 
    'funding_rounds',  # Added funding_rounds column
    'round_a', 
    'round_b', 
    'round_c', 
    'round_d', 
    'round_e', 
    'round_f', 
    'round_g', 
    'round_h', 
    'venture', 
    'private_equity', 
    'post_ipo_equity', 
    'post_ipo_debt', 
    'ipo_status'
]

# Filter the DataFrame to include only relevant columns
df = df[relevant_columns]

In [27]:
# Preprocessing
# Encoding categorical features
label_encoders = {}
categorical_cols = ['market', 'ipo_status']

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
    # Save the label encoder
    joblib.dump(le, f'label_encoder_{col}.pkl')

# Features and Target Variable
X = df.drop(columns=['ipo_status'])
y = df['ipo_status']

In [28]:
# Features and Target Variable
X = df.drop(columns=['ipo_status'])
y = df['ipo_status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define the model
rf_model = RandomForestClassifier(random_state=42)

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [50, 100],  
    'max_depth': [None, 5, 10, 15],  
    'min_samples_split': [2, 5, 10],  
    'min_samples_leaf': [1, 2, 4],  
    'max_features': ['auto', 'sqrt']  
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Save the best model
best_model = grid_search.best_estimator_
# Print the best hyperparameters found by GridSearchCV
best_hyperparameters = grid_search.best_params_
print(f"Best Hyperparameters: {best_hyperparameters}")
joblib.dump(best_model, 'random_forest_model.pkl')

360 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
144 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Ankit\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Ankit\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\Ankit\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Ankit\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParame

Best Hyperparameters: {'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


['random_forest_model.pkl']

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate the model
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate metrics
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred, average='macro')
recall = recall_score(y_test, y_test_pred, average='macro')
f1 = f1_score(y_test, y_test_pred, average='macro')

# Print evaluation metrics
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Optional: Validation score from cross-validation
validation_score = grid_search.best_score_
print(f"Validation Score (Cross-Validation F1 Score): {validation_score:.4f}")

Training Accuracy: 1.0000
Test Accuracy: 0.9998
Precision: 0.9998
Recall: 0.9867
F1 Score: 0.9931
Validation Score (Cross-Validation F1 Score): 0.9911


In [29]:
# Evaluate the model
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate metrics
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred, average='macro')
recall = recall_score(y_test, y_test_pred, average='macro')
f1 = f1_score(y_test, y_test_pred, average='macro')

# Print evaluation metrics
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Optional: Validation score from cross-validation
validation_score = grid_search.best_score_
print(f"Validation Score (Cross-Validation F1 Score): {validation_score:.4f}")

Training Accuracy: 1.0000
Test Accuracy: 0.9998
Precision: 0.9998
Recall: 0.9867
F1 Score: 0.9931
Validation Score (Cross-Validation F1 Score): 0.9911


In [30]:
print(df['ipo_status'].value_counts())


ipo_status
2    23750
0    18750
1      247
Name: count, dtype: int64
