In [34]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance

# Directory holding the feature matrix (x.csv) and the label file (y.csv).
# Change the trailing dataset id (e.g. '.../data/9') to train on another dataset
# instead of commenting/uncommenting duplicate read_csv lines.
DATA_DIR = '/Users/brianmewhinney/dev/tradewinds-python/data/8'

# NOTE(review): read_csv treats the first row of each file as a header by
# default -- confirm both files actually start with a header row.
X = pd.read_csv(f'{DATA_DIR}/x.csv').values
y = pd.read_csv(f'{DATA_DIR}/y.csv').values.flatten()  # Flatten to 1D array

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Learn the scaling statistics from the training split only, so nothing from
# the held-out split influences the transformation.
# (MinMaxScaler is imported as a drop-in alternative.)
scaler = StandardScaler().fit(X_train)

# Despite its name, X_resampled is not resampled: it is simply the scaled
# training feature matrix.
X_resampled = scaler.transform(X_train)

# Candidate hyper-parameters for the random forest.
# 4 * 3 * 4 * 3 * 2 * 2 = 576 combinations, each evaluated with 5-fold CV.
param_grid = {
    'n_estimators': [500, 750, 1000, 1500],  # number of trees
    'max_depth': [5, 10, 20],  # depth limit per tree
    'min_samples_split': [2, 5, 10, 15],  # samples required to split a node
    'min_samples_leaf': [1, 5, 10],  # samples required at a leaf
    'max_features': ['sqrt', 'log2'],  # features considered at each split
    'class_weight': ['balanced', None],  # optional class re-weighting
}

# Seeded base estimator so every candidate forest is reproducible.
forest = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid, using all available cores.
grid_search = GridSearchCV(
    estimator=forest,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

# Scaled training features, with the untouched training labels.
grid_search.fit(X_resampled, y_train)

# Report the winning configuration and its mean cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print(f'Best Score: {grid_search.best_score_:.6f}')

# GridSearchCV refits the best configuration on the whole training set
# (refit=True is the default), so best_estimator_ is already trained.
# Calling fit() on it again would only rebuild the same seeded forest.
best_model = grid_search.best_estimator_

# Scale the held-out split with the statistics learned from the training
# split, then score the model on it.
X_test_scaled = scaler.transform(X_test)
y_pred = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy:.5f}')

# Importances reported by the fitted forest, one value per feature column.
importances = best_model.feature_importances_

# Column indices ordered from most to least important.
indices = np.flip(np.argsort(importances))

print("Feature ranking:")
for rank, feature in enumerate(indices, start=1):
    print(f"{rank}. Feature {feature} ({importances[feature]:.4f})")

# Permutation importance on the held-out split: each feature is shuffled
# 30 times and the resulting change in the model's score is recorded.
result = permutation_importance(
    best_model,
    scaler.transform(X_test),
    y_test,
    n_repeats=30,
    random_state=42,
    n_jobs=-1,
)

# Display permutation importance, largest mean effect first
sorted_idx = np.argsort(result.importances_mean)[::-1]
print("Permutation feature importance:")
for idx, mean_importance in zip(sorted_idx, result.importances_mean[sorted_idx]):
    print(f"Feature {idx}: {mean_importance:.4f}")

Training set size: 1356
Test set size: 339
Best Parameters: {'class_weight': None, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 1000}
Best Score: 0.561200
Test Accuracy: 0.58407
Feature ranking:
1. Feature 0 (0.1107)
2. Feature 1 (0.1084)
3. Feature 2 (0.0402)
4. Feature 17 (0.0357)
5. Feature 19 (0.0347)
6. Feature 27 (0.0275)
7. Feature 28 (0.0265)
8. Feature 23 (0.0250)
9. Feature 24 (0.0232)
10. Feature 25 (0.0231)
11. Feature 13 (0.0224)
12. Feature 15 (0.0223)
13. Feature 22 (0.0215)
14. Feature 26 (0.0214)
15. Feature 18 (0.0213)
16. Feature 45 (0.0209)
17. Feature 21 (0.0208)
18. Feature 20 (0.0201)
19. Feature 37 (0.0201)
20. Feature 14 (0.0186)
21. Feature 43 (0.0182)
22. Feature 47 (0.0170)
23. Feature 39 (0.0168)
24. Feature 41 (0.0167)
25. Feature 12 (0.0165)
26. Feature 16 (0.0159)
27. Feature 11 (0.0152)
28. Feature 33 (0.0147)
29. Feature 42 (0.0140)
30. Feature 35 (0.0135)
31. Feature 31 (0.0127)
32. Feature 4