In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
import joblib

# Load your dataset
df = pd.read_csv('supercleaned_dataset.csv')

# Define features and target
features = ['funding_total_usd', 'funding_rounds', 'seed', 'venture', 
            'equity_crowdfunding', 'undisclosed', 'convertible_note', 
            'debt_financing', 'angel', 'grant', 'private_equity', 
            'post_ipo_equity', 'post_ipo_debt', 'secondary_market', 
            'product_crowdfunding', 'round_a', 'round_b', 'round_c', 
            'round_d', 'round_e', 'round_f', 'round_g', 'round_h']

X = df[features]
y = df['status']  # Target column 'status'


In [2]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [3]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize the data
    ('pca', PCA(n_components=15)), # Apply PCA (top 15 components)
    ('svm', SVC(probability=True)) # SVM classifier with probability outputs
])

# Define a more focused hyperparameter grid
param_grid = {
    'svm__C': [0.1, 1, 10],          # Regularization parameter
    'svm__gamma': ['scale', 'auto'],  # Kernel coefficient for 'rbf' kernel
    # Removed 'linear' kernel to focus only on 'rbf' for non-linear relationships
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best hyperparameters
print(f"Best Parameters: {grid_search.best_params_}")


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters: {'svm__C': 0.1, 'svm__gamma': 'scale'}


In [4]:
# Evaluate the model on the test set
y_pred = grid_search.predict(X_test)  # Use grid_search instead of random_search

# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Precision, Recall, and F1 Score
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Classification Report
print(classification_report(y_test, y_pred))

# Save the model using joblib
joblib.dump(grid_search.best_estimator_, 'svm_pca_model_grid_search.pkl')  # Use grid_search instead of random_search
print("Model saved successfully!")

Accuracy: 0.8749
Precision: 0.7654
Recall: 0.8749
F1 Score: 0.8165


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    acquired       0.00      0.00      0.00       591
      closed       0.00      0.00      0.00       479
   operating       0.87      1.00      0.93      7480

    accuracy                           0.87      8550
   macro avg       0.29      0.33      0.31      8550
weighted avg       0.77      0.87      0.82      8550

Model saved successfully!


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
