In [10]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Load your dataset
df = pd.read_csv('df_sorted.csv')  # Change this to the path of your dataset

# Drop date columns
df.drop(columns=['founded_at', 'first_funding_at', 'last_funding_at', 'founded_year'], inplace=True)

# Select features for PCA (excluding target variable 'status', 'name', 'market', 'country_code', and 'city')
features = df.drop(columns=['status', 'name', 'market', 'country_code', 'city'])
X = features.values
y = df['status']

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Create a DataFrame for PCA results and get variance ratios
pca_df = pd.DataFrame(data=X_pca)
variance_ratios = pca.explained_variance_ratio_

# Print explained variance ratios
print("Explained Variance Ratios for each component:")
for i, ratio in enumerate(variance_ratios):
    print(f"Component {i+1}: {ratio:.4f}")

# Selecting top 15 features based on explained variance
X_top15 = pca_df.iloc[:, :15]

Explained Variance Ratios for each component:
Component 1: 0.1444
Component 2: 0.0853
Component 3: 0.0800
Component 4: 0.0552
Component 5: 0.0485
Component 6: 0.0481
Component 7: 0.0453
Component 8: 0.0442
Component 9: 0.0437
Component 10: 0.0436
Component 11: 0.0435
Component 12: 0.0430
Component 13: 0.0422
Component 14: 0.0412
Component 15: 0.0383
Component 16: 0.0347
Component 17: 0.0321
Component 18: 0.0295
Component 19: 0.0253
Component 20: 0.0236
Component 21: 0.0053
Component 22: 0.0030
Component 23: 0.0000


In [12]:
#  Split the data into training and testing sets
from sklearn.model_selection import RandomizedSearchCV
X_train, X_test, y_train, y_test = train_test_split(X_top15, y, test_size=0.2, random_state=42)

# Set up a reduced hyperparameter grid for Random Forest
param_dist = {
    'n_estimators': [100, 200],  # Fewer options
    'max_depth': [None, 10],      # Fewer options
    'min_samples_split': [2, 5],  # Fewer options
}

# Initialize Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Hyperparameter tuning using RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, 
                                   n_iter=10, scoring='f1', cv=3, verbose=2, n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

# Best parameters
best_params = random_search.best_params_
print("Best Parameters: ", best_params)



Fitting 3 folds for each of 8 candidates, totalling 24 fits




Best Parameters:  {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': None}


In [13]:
# Train the model with best parameters
best_rf = random_search.best_estimator_

# Cross-validation
cv_scores = cross_val_score(best_rf, X_train, y_train, cv=3)

# Train on the entire training set
best_rf.fit(X_train, y_train)

# Predictions
y_pred = best_rf.predict(X_test)

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

# Print evaluation metrics including test accuracy
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Cross-Validation Score: {cv_scores.mean():.4f}")
print(f"Test Accuracy: {accuracy:.4f}")  # Test accuracy
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Cross-Validation Scores: [0.93764151 0.93773585 0.93632075]
Mean Cross-Validation Score: 0.9372
Test Accuracy: 0.9357
F1 Score: 0.9149
Precision: 0.8960
Recall: 0.9357
