In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
import seaborn as sns
import matplotlib as plt

In [None]:
# Load the labeled Anatel dataset; the relative path is resolved against the
# notebook's working directory.
anatel_file_path = '../../data/labeled_csv_files/Anatel_labeled.csv'
anatel = pd.read_csv(filepath_or_buffer=anatel_file_path)

### Encoding

In [None]:
# One-Hot Encoding
# Both categorical columns are expanded in one call. Each group of indicator
# columns keeps its source column name as prefix, and the groups are appended
# in the listed order (Polarization first, then BasicFeatures).
categorical_columns = ['Polarization', 'BasicFeatures']
anatel = pd.get_dummies(
    anatel,
    columns=categorical_columns,
    prefix={column: column for column in categorical_columns},
)
# No further encoding is applied to the indicator columns: decision trees and
# random forests can handle boolean variables as-is, since they naturally make
# binary decisions based on the values of the features.

### Removing highly correlated columns

In [None]:
# Pairwise correlations between the features (the SiteType target is excluded).
# The matrix is kept in a variable and shown as the cell's result below; a bare
# `.corr()` call in the middle of a cell is computed and then thrown away.
anatel_numeric = anatel.drop(columns="SiteType")
correlation_matrix = anatel_numeric.corr()

# Columns removed in this "highly correlated" step.
# NOTE(review): the list is hard-coded; presumably it was chosen by reading the
# matrix above. Confirm it still matches if the dataset changes.
highly_correlated_columns = ["Polarization_V", "MinTxFreq", "MaxRxFreq", "BasicFeatures_G7W"]
anatel = anatel.drop(columns=highly_correlated_columns)

# Last expression of the cell, so the notebook renders the correlation matrix
# (computed before the columns were dropped).
correlation_matrix

### Removing columns with low feature importance

In [None]:
# Drop features whose importance in a previously fitted forest is below the
# threshold.
#
# `feature_importances` and `feature_names` are produced by cells further down
# the notebook, so on a fresh kernel (Restart & Run All) they do not exist yet.
# In that case this cell does nothing instead of raising NameError: fit the
# model once, then re-run this cell (and the cells after it) to prune.
importance_threshold = 0.1

if "feature_importances" in globals() and "feature_names" in globals():
    features_of_little_importance = [
        name
        for name, importance in zip(feature_names, feature_importances)
        # Names already removed by an earlier run are skipped so that
        # re-running the cell does not raise KeyError.
        if importance < importance_threshold and name in anatel.columns
    ]
    anatel = anatel.drop(columns=features_of_little_importance)
else:
    features_of_little_importance = []
    print("feature_importances / feature_names are not defined yet; "
          "skipping importance-based column removal.")

### Running Random Forest

In [None]:
# Reproducibility and evaluation split
random_state = 42   # seed shared by the train/test split and the forest
test_size = 0.2     # fraction of rows held out by train_test_split

# Forest configuration
n_estimators = 100  # the number of trees in the forest
criterion = 'gini'  # split-quality measure

# Per-tree growth limits
max_depth = 3
min_samples_split = 2
min_samples_leaf = 1

In [None]:
# Separate the feature matrix from the SiteType target.
X = anatel.drop(columns='SiteType')
y = anatel['SiteType']

# Hold out `test_size` of the rows for evaluation; the fixed seed keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

# Assemble the forest from the hyperparameters defined in the configuration cell.
forest_params = {
    'n_estimators': n_estimators,
    'random_state': random_state,
    'criterion': criterion,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}
rf_classifier = RandomForestClassifier(**forest_params)

# Train on the training portion only, then predict the held-out rows.
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

In [None]:
# Score the held-out predictions: confusion matrix, per-class text report,
# and overall accuracy. The three computations are independent of each other.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cr = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)


In [None]:
# Importance scores read from the fitted forest. The reporting cell later pairs
# them positionally with X.columns, i.e. one score per feature column.
feature_importances = rf_classifier.feature_importances_

In [21]:
# Cross-validate the configured forest on the full feature matrix and target.
# No `cv` argument is passed, so scikit-learn's default splitting applies
# (the recorded output of this notebook shows five fold scores).
scores = cross_val_score(estimator=rf_classifier, X=X, y=y)

### Visualizing Results

In [None]:
# Headline accuracy (two decimals) followed by the per-class report,
# each starting on its own line.
print(
    f"Accuracy: {accuracy:.2f}",
    f"Classification Report:\n{cr}",
    sep="\n",
)

In [None]:
# Confusion Matrix
# pyplot is imported here because the notebook's import cell binds `plt` to the
# top-level `matplotlib` package (`import matplotlib as plt`), which has no
# xlabel/ylabel/title/show functions, so the plotting calls raised AttributeError.
import matplotlib.pyplot as plt

# Tick labels come from the fitted classifier's class list.
class_names = rf_classifier.classes_

# Draw on an explicit Axes so labels and title are attached to this figure only.
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,   # write the count inside each cell
    fmt='d',      # counts are integers
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')
plt.show()

In [None]:
# List every feature next to its importance score (four decimals), in the
# column order of X.
print("Feature Importances:")
feature_names = X.columns
for position, score in enumerate(feature_importances):
    column = feature_names[position]
    print(f"{column}: {score:.4f}")

In [22]:
# Summarise the cross-validation run: per-fold scores, their mean, and their spread.
mean_accuracy = scores.mean()
accuracy_std = scores.std()

print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", mean_accuracy)
print("Standard Deviation of Accuracy:", accuracy_std)

Cross-Validation Scores: [0.79112889 0.80365421 0.7916553  0.78474684 0.78947368]
Mean Accuracy: 0.7921317856761085
Standard Deviation of Accuracy: 0.006254631704901671
