In [1]:
from decision_tree import *
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Relative path to the labeled Anatel CSV file.
anatel_file_path = '../../data/labeled_csv_files/Anatel_labeled.csv'
# Load the file into the DataFrame that the following cells encode and model.
anatel = pd.read_csv(filepath_or_buffer=anatel_file_path)

### Encoding

In [None]:
# One-hot encode both categorical columns in a single call. Each source column
# is replaced by indicator columns appended after the remaining columns, in the
# order listed here (Polarization_* first, then BasicFeatures_*).
categorical_columns = ['Polarization', 'BasicFeatures']
anatel = pd.get_dummies(anatel, columns=categorical_columns, prefix=categorical_columns)
# Boolean columns are left untouched: decision trees and random forests make
# binary decisions on feature values, so they need no additional encoding.

### Visualizing Correlation

In [12]:
# Correlation matrix of the feature columns; the "SiteType" column is left out.
anatel_numeric = anatel.drop(columns="SiteType")
anatel_numeric.corr()

Unnamed: 0,MinTxFreq,MaxTxFreq,MinRxFreq,MaxRxFreq,AntennaCode,AntennaGain,FrontBackAntennaRation,AnguloMeiaPotenciaAntena_max,ElevationAngle,AntennaHeight,...,BasicFeatures_0G9,BasicFeatures_7W,BasicFeatures_D7D,BasicFeatures_D7W,BasicFeatures_D9W,BasicFeatures_F8W,BasicFeatures_G7E,BasicFeatures_G7W,BasicFeatures_G9W,BasicFeatures_M7W
MinTxFreq,1.0,-0.077319,0.988518,-0.073402,-0.311026,-0.063921,-0.148236,-0.025445,0.135918,-0.172737,...,0.058076,0.008387,0.008486,0.051899,-0.001604,-0.001747,0.009729,0.037864,-0.082282,0.021224
MaxTxFreq,-0.077319,1.0,-0.101245,0.99426,0.162209,0.104266,-0.025282,0.019249,-0.423551,-0.14372,...,0.001593,-0.005595,-0.005508,-0.000809,-0.007237,0.000406,-0.009481,0.008442,0.004685,-0.059934
MinRxFreq,0.988518,-0.101245,1.0,-0.093342,-0.308471,-0.052718,-0.12629,-0.034758,0.16574,-0.159426,...,0.060202,0.008042,0.009065,0.046525,-0.001772,-0.001785,0.010544,0.041391,-0.084424,0.026107
MaxRxFreq,-0.073402,0.99426,-0.093342,1.0,0.150718,0.104164,-0.019031,0.020159,-0.409414,-0.14268,...,0.002965,-0.005505,-0.004807,-0.0097,-0.010056,9.9e-05,-0.008316,0.022995,-0.007824,-0.05389
AntennaCode,-0.311026,0.162209,-0.308471,0.150718,1.0,0.351112,0.410276,-0.296524,-0.114606,0.200584,...,0.003528,-0.023594,-0.014144,0.018211,0.006049,0.003305,-0.015788,-0.048247,0.050199,-0.023521
AntennaGain,-0.063921,0.104266,-0.052718,0.104164,0.351112,1.0,0.582658,-0.228428,0.013753,0.27615,...,-0.009328,-0.017616,0.004253,-0.015879,0.000228,0.002594,0.002578,-0.055939,0.073099,0.011378
FrontBackAntennaRation,-0.148236,-0.025282,-0.12629,-0.019031,0.410276,0.582658,1.0,-0.199775,0.056262,0.344889,...,0.001458,-0.021523,0.00232,0.011824,0.01142,0.00073,-0.00057,-0.049757,0.04081,0.041138
AnguloMeiaPotenciaAntena_max,-0.025445,0.019249,-0.034758,0.020159,-0.296524,-0.228428,-0.199775,1.0,-0.023335,-0.091085,...,-0.002934,-0.009784,-0.000829,0.012994,-0.004154,-0.000562,0.000339,-0.021259,0.018318,0.000266
ElevationAngle,0.135918,-0.423551,0.16574,-0.409414,-0.114606,0.013753,0.056262,-0.023335,1.0,0.079263,...,0.023519,-0.000713,0.014563,0.00323,0.006182,0.001394,0.011538,-0.118603,0.122981,0.052012
AntennaHeight,-0.172737,-0.14372,-0.159426,-0.14268,0.200584,0.27615,0.344889,-0.091085,0.079263,1.0,...,0.007438,-0.010459,0.005521,-0.017316,-0.008019,0.004506,-0.000835,-0.067909,0.084354,0.035447


In [None]:
# --- Train/test split -------------------------------------------------------
# Fraction of the dataset reserved for the test split.
test_size = 0.2
# Seed for the shuffle applied before splitting; a fixed int gives the same
# output on every call.
random_state = 42

# --- Decision tree hyperparameters ------------------------------------------
# Split-quality measure: "gini" for Gini impurity, or "log_loss" / "entropy"
# for Shannon information gain.
criterion = 'gini'
# Maximum tree depth. If None, nodes are expanded until all leaves are pure or
# hold fewer than min_samples_split samples.
max_depth = None
# Minimum number of samples an internal node needs before it may be split.
min_samples_split = 2
# Minimum number of samples required at a leaf. A split is only considered when
# it leaves at least this many training samples in both the left and right
# branches.
min_samples_leaf = 1

In [3]:
# Target is the "SiteType" column; every other column is used as a feature.
y = anatel["SiteType"]
X = anatel.drop(columns="SiteType")

# Hold out `test_size` of the rows for evaluation; `random_state` fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

# Configure the decision tree from the hyperparameter variables and fit it on
# the training rows only.
tree_model = DecisionTreeClassifier(
    criterion=criterion,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    random_state=random_state,
)
tree_model.fit(X_train, y_train)

# Predictions for the held-out rows, used by the evaluation cells below.
y_pred = tree_model.predict(X_test)

In [4]:
# Score the test-set predictions: a single accuracy value and a classification
# report, both printed in the results section.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
cr = classification_report(y_true=y_test, y_pred=y_pred)

In [5]:
# Pin the row/column order to the classes the model was trained on. Without
# `labels`, confusion_matrix only covers classes that appear in y_test or
# y_pred, so a class absent from the test split would produce a smaller matrix
# than the tree_model.classes_ tick labels applied when the matrix is plotted.
cm = confusion_matrix(y_test, y_pred, labels=tree_model.classes_)

In [6]:
# Per-feature importance values exposed by the fitted tree; a later cell prints
# them next to the matching X.columns names.
feature_importances = tree_model.feature_importances_

In [None]:
# 5-fold cross-validation of the tree configuration over the full dataset (X, y).
scores = cross_val_score(estimator=tree_model, X=X, y=y, cv=5)
print("Cross-Validation Scores:", scores)


In [None]:
# Candidate values for each hyperparameter; the search evaluates every
# combination with 5-fold cross-validation.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Tune on the training split only. Fitting the search on the full X, y would
# let the held-out test rows influence hyperparameter selection, so any later
# test-set score for the tuned model would be optimistically biased.
grid_search = GridSearchCV(tree_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)


### Visualizing Results

In [None]:
# Text summary of the test-set evaluation.
print(f"Accuracy: {accuracy}")

print(f"\nClassification Report: \n {cr}")

# Confusion-matrix heatmap with the model's class labels on both axes.
class_names = tree_model.classes_
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

In [9]:
print("Feature Importances:")
feature_names = X.columns
# Importances are listed in the same order as the feature columns, so the two
# sequences can be walked in parallel.
for name, importance in zip(feature_names, feature_importances):
    print(f"{name}: {importance:.4f}")


Feature Importances:
MinTxFreq: 0.0714
MaxTxFreq: 0.0098
MinRxFreq: 0.0199
MaxRxFreq: 0.0888
AntennaCode: 0.0086
AntennaGain: 0.0709
FrontBackAntennaRation: 0.0341
AnguloMeiaPotenciaAntena_max: 0.0561
ElevationAngle: 0.0613
AntennaHeight: 0.1909
TransmitterPower: 0.0380
NecessaryBandwidth: 0.0313
LTE: 0.0008
WCDMA: 0.0041
GSM: 0.0081
NR_NSA: 0.0000
NR_SA-NSA: 0.0020
DMR: 0.0000
Digital: 0.0000
DaysSinceLicensing: 0.0838
DaysSinceFirstLicensing: 0.1496
DaysUntilExpiration: 0.0525
Polarization_V: 0.0001
Polarization_X: 0.0107
BasicFeatures_0G7: 0.0000
BasicFeatures_0G9: 0.0001
BasicFeatures_7W: 0.0000
BasicFeatures_D7D: 0.0000
BasicFeatures_D7W: 0.0009
BasicFeatures_D9W: 0.0005
BasicFeatures_F8W: 0.0000
BasicFeatures_G7E: 0.0000
BasicFeatures_G7W: 0.0034
BasicFeatures_G9W: 0.0020
BasicFeatures_M7W: 0.0003
