In [1]:
from decision_tree import *
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# --- Train/test split ---------------------------------------------------------------------------
# Fraction of the dataset held out as the test split.
test_size = 0.2
# Seed for the shuffle applied before splitting (a fixed int gives reproducible output across
# repeated calls). The same value is also handed to the decision tree as its random_state.
random_state = 42

# --- Decision tree hyperparameters --------------------------------------------------------------
# Split-quality measure: "gini" selects the Gini impurity, while "log_loss" and "entropy" both
# select the Shannon information gain.
criterion = "gini"
# Maximum depth of the tree. If None, nodes are expanded until all leaves are pure or until all
# leaves contain fewer than min_samples_split samples.
max_depth = None
# Minimum number of samples an internal node must hold before it may be split.
min_samples_split = 2
# Minimum number of samples required at a leaf node. A split point at any depth is only considered
# if it leaves at least min_samples_leaf training samples in each of the left and right branches.
min_samples_leaf = 1

In [3]:
# Location of the labeled Anatel dataset, relative to this notebook.
anatel_file_path = '../../data/labeled_csv_files/Anatel_labeled.csv'

# Load the CSV and one-hot encode both categorical columns in one pass. With no explicit prefix,
# get_dummies names each indicator column "<source column>_<value>", i.e. "Polarization_*" and
# "BasicFeatures_*", appended after the untouched columns in the order listed here.
# The indicator columns need no further encoding: decision trees and random forests naturally
# make binary decisions based on feature values.
anatel = pd.get_dummies(
    pd.read_csv(anatel_file_path),
    columns=['Polarization', 'BasicFeatures'],
)

# Target is the "SiteType" column; every remaining column is a feature.
y = anatel["SiteType"]
X = anatel.drop(columns="SiteType")

# Hold out `test_size` of the rows for evaluation; `random_state` fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

# Build the classifier from the configuration-cell hyperparameters and fit it on the training split
# (fit returns the estimator itself, so the fitted model is what gets bound to tree_model).
tree_model = DecisionTreeClassifier(
    criterion=criterion,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    random_state=random_state,
).fit(X_train, y_train)

# Predictions for the held-out test rows, consumed by the evaluation cells below.
y_pred = tree_model.predict(X_test)

In [4]:
# Score the test-set predictions against the true labels; both results are printed in the
# "Visualizing Results" cell.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
cr = classification_report(y_true=y_test, y_pred=y_pred)

In [5]:
# Confusion matrix of the test-set predictions (rows: true label, columns: predicted label, as
# labeled by the heatmap in the results cell).
# The row/column order is pinned to the classes the model was trained on, because the heatmap
# labels its axes with tree_model.classes_. Without an explicit `labels`, confusion_matrix orders
# by whichever labels occur in y_test / y_pred, so a trained class missing from the test split
# would change the matrix shape and misalign every tick label.
cm = confusion_matrix(y_test, y_pred, labels=tree_model.classes_)

In [6]:
# Importance scores exposed by the fitted tree. A later cell prints them paired positionally
# with X.columns, so this is presumably one value per feature column, in column order.
feature_importances = tree_model.feature_importances_

In [None]:
# 5-fold cross-validation of the configured tree over the full dataset (X, y), as a check that
# does not depend on the single train/test split used above.
scores = cross_val_score(estimator=tree_model, X=X, y=y, cv=5)
print("Cross-Validation Scores:", scores)


In [None]:
# Candidate values for each tuned hyperparameter; the grid search evaluates every combination.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search with 5-fold cross-validation over the full dataset (X, y).
# fit returns the search object itself, so it can be chained onto the constructor.
grid_search = GridSearchCV(estimator=tree_model, param_grid=param_grid, cv=5).fit(X, y)

# Report the winning combination.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)


### Visualizing Results

In [None]:
# Text summary of the test-set evaluation computed in the earlier cells.
print(f"Accuracy: {accuracy}")

print(f"\nClassification Report: \n {cr}")

# Confusion-matrix heatmap drawn on an explicit Axes; tick labels are the classes known to the
# fitted model.
class_names = tree_model.classes_
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

In [9]:
# List every feature next to its importance score, formatted to four decimals.
print("Feature Importances:")
feature_names = X.columns
# Names and scores are paired positionally: the model was fitted on a row subset of X, so both
# sequences follow X's column order.
for name, importance in zip(feature_names, feature_importances):
    print(f"{name}: {importance:.4f}")


Feature Importances:
MinTxFreq: 0.0714
MaxTxFreq: 0.0098
MinRxFreq: 0.0199
MaxRxFreq: 0.0888
AntennaCode: 0.0086
AntennaGain: 0.0709
FrontBackAntennaRation: 0.0341
AnguloMeiaPotenciaAntena_max: 0.0561
ElevationAngle: 0.0613
AntennaHeight: 0.1909
TransmitterPower: 0.0380
NecessaryBandwidth: 0.0313
LTE: 0.0008
WCDMA: 0.0041
GSM: 0.0081
NR_NSA: 0.0000
NR_SA-NSA: 0.0020
DMR: 0.0000
Digital: 0.0000
DaysSinceLicensing: 0.0838
DaysSinceFirstLicensing: 0.1496
DaysUntilExpiration: 0.0525
Polarization_V: 0.0001
Polarization_X: 0.0107
BasicFeatures_0G7: 0.0000
BasicFeatures_0G9: 0.0001
BasicFeatures_7W: 0.0000
BasicFeatures_D7D: 0.0000
BasicFeatures_D7W: 0.0009
BasicFeatures_D9W: 0.0005
BasicFeatures_F8W: 0.0000
BasicFeatures_G7E: 0.0000
BasicFeatures_G7W: 0.0034
BasicFeatures_G9W: 0.0020
BasicFeatures_M7W: 0.0003
