In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Path to the labeled Anatel CSV, relative to the notebook's working directory.
anatel_file_path = '../../data/labeled_csv_files/Anatel_labeled.csv'

# Read the labeled records into the DataFrame used by every later cell.
anatel = pd.read_csv(filepath_or_buffer=anatel_file_path)

### Encoding

In [None]:
# One-hot encode both categorical columns in a single pass. Each source column
# is replaced by indicator columns named "<column>_<value>", appended after the
# untouched columns in the order listed here (Polarization first, then
# BasicFeatures).
# The resulting boolean indicators need no further encoding: decision trees and
# random forests split on feature values directly, so binary columns are
# handled naturally.
anatel = pd.get_dummies(
    anatel,
    columns=['Polarization', 'BasicFeatures'],
    prefix=['Polarization', 'BasicFeatures'],
)

### Removing highly correlated columns

In [None]:
# Correlation is inspected on the feature columns only, so the SiteType target
# is set aside before computing it.
anatel_numeric = anatel.drop(columns="SiteType")

# Remove the columns selected for removal in this "highly correlated" step.
anatel = anatel.drop(columns=["Polarization_V", "MinTxFreq", "MaxRxFreq", "BasicFeatures_G7W"])

# Kept as the cell's last expression so the notebook actually renders the
# matrix (a bare expression in the middle of a cell is evaluated and discarded).
# `anatel_numeric` was captured before the drop above, so the matrix still
# includes the columns that were just removed.
anatel_numeric.corr()

### Running Random Forest

In [None]:
# --- Reproducibility / data split -------------------------------------------
random_state = 42        # seed shared by the train/test split and the forest
test_size = 0.2          # fraction of rows held out for evaluation

# --- Random forest hyperparameters ------------------------------------------
n_estimators = 100       # number of trees in the forest
criterion = 'gini'       # split-quality measure passed to the classifier
max_depth = None         # None: no explicit cap on tree depth
min_samples_split = 2    # forwarded to the classifier's min_samples_split
min_samples_leaf = 1     # forwarded to the classifier's min_samples_leaf

In [None]:
# Separate the feature matrix from the SiteType target. The DataFrame is bound
# to the lowercase name `anatel` when the CSV is loaded; names are
# case-sensitive, so that exact spelling must be used here.
X = anatel.drop('SiteType', axis=1)
y = anatel['SiteType']

# Hold out `test_size` of the rows for evaluation; the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

# Build the forest from the hyperparameters defined in the configuration cell.
rf_classifier = RandomForestClassifier(n_estimators = n_estimators, random_state = random_state, criterion = criterion,
                                       max_depth = max_depth, min_samples_split = min_samples_split, 
                                       min_samples_leaf = min_samples_leaf)

# Fit on the training portion only.
rf_classifier.fit(X_train, y_train)

# Predicted SiteType labels for the held-out rows, scored in the next cell.
y_pred = rf_classifier.predict(X_test)

In [None]:
# Score the held-out predictions against the true labels.
# Overall fraction of test rows classified correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Text report produced by scikit-learn's classification_report.
cr = classification_report(y_true=y_test, y_pred=y_pred)

# Confusion matrix of true labels versus predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)


### Visualizing Results

In [None]:
# Emit the accuracy (two decimals) followed by the classification report,
# each starting on its own line.
print(
    f"Accuracy: {accuracy:.2f}",
    f"Classification Report:\n{cr}",
    sep="\n",
)

In [None]:
# Start the matrix on its own line: it prints as several rows, and placing the
# first row right after the label would shift it out of alignment with the
# rest. This also matches how the classification report is printed.
print(f"Confusion matrix:\n{cm}")