# Random Forest

In [46]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt

In [47]:
data = pd.read_csv('aoe_data_clean.csv')
data.head()


Unnamed: 0,map,duration,elo,p1_civ,p2_civ,winner
0,3,3445,1104.0,38,25,0
1,5,2932,884.5,3,13,0
2,5,2573,905.5,9,24,0
3,3,851,1080.0,25,22,1
4,3,4737,1050.0,1,32,1


In [48]:
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)


In [50]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [51]:
param_grid = {
    'n_estimators': [10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

classifier = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)


In [52]:
# Make predictions on the test set
y_pred = classifier.predict(X_test)
confusion_mat = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Best parameters:", grid_search.best_params_)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
# Get feature importances
importance_scores = classifier.feature_importances_
sorted_indices = importance_scores.argsort()[::-1]
top_five_indices = sorted_indices[:5]
top_five_variables = data.columns[top_five_indices]
print("Top five indices:\n", sorted_indices)
print("Top five variables:\n", top_five_variables)

In [None]:
# Plot feature importances
plt.figure(figsize=(10, 6))
plt.bar(range(len(importance_scores)), importance_scores[sorted_indices])
plt.xticks(range(len(importance_scores)), data.columns[sorted_indices], rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance Scores')
plt.title('Feature Importances')
plt.tight_layout()
plt.show()