# Machine Learning Enhanced localization in 5G networks    

# Data preparation

## Data loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import load_dataframe

# Size of the interpolated coverage/CIR dataset to load. The value is part of
# the CSV file name and is reused in the title of the comparison plot.
n_datapoints = 100_000
dataset_path = f'data/5G_cov_cir_C1_interpolated_{n_datapoints}.csv'
df = load_dataframe(dataset_path)

# Bare last expression so the notebook renders the loaded frame.
df

## Data splitting
The dataset is split between a training set and a testing set.
We use 80% of the data for training the models and the remaining 20% for testing.

In [None]:
from sklearn.model_selection import train_test_split
# Targets: the two position columns the models are trained to predict.
target_columns = ['latitude', 'longitude']
Y = df[target_columns]

# Features: every remaining column of the frame.
X = df.drop(columns=target_columns)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=100,
)

# Model building

## Linear regression

I'm using linear regression because it is the simplest ML algorithm and a low-hanging fruit to test.
The potential issue is that it assumes a linear relationship between the inputs and the outputs.
I'm not yet sure whether that holds for the signal data, so this poses a potential challenge.

### Training the model 

In [None]:
from sklearn.linear_model import LinearRegression

# Linear baseline fitted on the training split; both target columns
# (latitude and longitude) are fitted in a single call.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

### Applying model for prediction

In [None]:
# Predict positions for both splits so training and test error can be compared.
y_lr_train_pred, y_lr_test_pred = (lr.predict(split) for split in (X_train, X_test))

### Performance evaluation

In [None]:
from utils import mean_position_error
# Mean position error (MPE) of the linear model on the training and test splits.
lr_train_mpe = mean_position_error(y_train, y_lr_train_pred)
lr_test_mpe = mean_position_error(y_test, y_lr_test_pred)

# One-row summary frame; it is concatenated with the other models' rows in the
# model comparison section.
errors_lr = pd.DataFrame(
    data=[["Linear Regression", lr_train_mpe, lr_test_mpe]],
    columns=['Model', 'Train MPE', 'Test MPE'],
)

print("Performance evaluation: Linear Regression")
for label, value in (("Training Mpe:", lr_train_mpe), ("Test Mpe:", lr_test_mpe)):
    print(label, value)

## K-NN
The KNN algorithm differs from Linear Regression in that it memorizes the training dataset instead of learning a parametric model.
To make a prediction, it finds the k closest neighbours of the input and predicts the target from their values.
KNN has more hyperparameters that we can adjust, so we run a grid search to determine the best combination.
Whether the dataset is well suited to this kind of model is still an open question.

### Training the model

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid explored for the KNN regressor.
param_grid = {
    'n_neighbors': [3, 5, 7, 10],        # number of neighbours used per prediction
    'weights': ['uniform', 'distance'],  # equal weights vs. inverse-distance weights
    'p': [1, 2],                         # Minkowski power: 1 = Manhattan, 2 = Euclidean
}

# Exhaustive 5-fold cross-validated search over the grid. No `scoring` is
# passed, so candidates are ranked by the estimator's default score.
# n_jobs=-1 evaluates the candidates in parallel on all available cores.
grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and model
best_params = grid_search.best_params_
best_knn_model = grid_search.best_estimator_

print("Best parameters:", best_params)
print("Best model:", best_knn_model)

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refitted on the full training split; fitting it a second time is unnecessary.
knn = best_knn_model

### Applying the model for prediction

In [None]:
# Predict positions for both splits so training and test error can be compared.
y_knn_train_pred, y_knn_test_pred = map(knn.predict, (X_train, X_test))

### Performance evaluation

In [None]:
from utils import mean_position_error
# Mean position error (MPE) of the tuned KNN model on the training and test splits.
knn_train_mpe = mean_position_error(y_train, y_knn_train_pred)
knn_test_mpe = mean_position_error(y_test, y_knn_test_pred)

# One-row summary frame; it is concatenated with the other models' rows in the
# model comparison section.
errors_knn = pd.DataFrame(
    data=[["KNN", knn_train_mpe, knn_test_mpe]],
    columns=['Model', 'Train MPE', 'Test MPE'],
)

print("Performance evaluation: KNN")
for label, value in (("Training Mpe:", knn_train_mpe), ("Test Mpe:", knn_test_mpe)):
    print(label, value)

## Random Forest
Uses an ensemble of decision trees for regression. Should be scalable for larger datasets.

### Training the model

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Base estimator. The fixed seed makes the forest's random choices (and
# therefore the outcome of the search) reproducible across runs.
rf = RandomForestRegressor(random_state=42)

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    # Number of features to consider at every split.
    # 1.0 means "all features", which is what 'auto' used to mean for
    # regressors; 'auto' itself was removed in scikit-learn 1.3 and makes
    # every fit that uses it fail on current versions.
    'max_features': [1.0, 'sqrt'],
}

# 3-fold cross-validated search, run in parallel on all cores. Candidates are
# ranked by negative mean squared error (higher is better).
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# With the default refit=True, best_estimator_ is already refitted on the
# whole training split using the best parameters found.
best_rf = grid_search.best_estimator_

# Print the best parameters
print("Best parameters found: ", grid_search.best_params_)
print("Best model:", best_rf)

# Use the best estimator for further predictions
rf = best_rf

### Applying the model for prediction

In [None]:
# Predict positions for both splits so training and test error can be compared.
y_rf_train_pred, y_rf_test_pred = [rf.predict(features) for features in (X_train, X_test)]

### Performance evaluation

In [None]:
# Mean position error (MPE) of the tuned random forest on the training and test splits.
rf_train_mpe = mean_position_error(y_train, y_rf_train_pred)
rf_test_mpe = mean_position_error(y_test, y_rf_test_pred)

# One-row summary frame; it is concatenated with the other models' rows in the
# model comparison section.
errors_rf = pd.DataFrame(
    data=[["Random Forest", rf_train_mpe, rf_test_mpe]],
    columns=['Model', 'Train MPE', 'Test MPE'],
)

print("Performance evaluation: Random Forest")
for label, value in (("Training Mpe:", rf_train_mpe), ("Test Mpe:", rf_test_mpe)):
    print(label, value)

## GradientBoostRegressor
Uses many weak learners to build a strong prediction model.

### Training the model

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor

# The gradient boosting regressor is wrapped in MultiOutputRegressor so that
# the two-column target (latitude, longitude) can be fitted with one call.
gbr = MultiOutputRegressor(
    GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
    )
)

# Train on the training split.
gbr.fit(X_train, y_train)

### Applying the model for prediction

In [None]:
# Predict positions for both splits so training and test error can be compared.
y_gbr_train_pred, y_gbr_test_pred = (gbr.predict(split) for split in (X_train, X_test))

### Performance evaluation

In [None]:
# Mean position error (MPE) of the gradient boosting model on the training and test splits.
gbr_train_mpe = mean_position_error(y_train, y_gbr_train_pred)
gbr_test_mpe = mean_position_error(y_test, y_gbr_test_pred)

# One-row summary frame; it is concatenated with the other models' rows in the
# model comparison section.
errors_gbr = pd.DataFrame(
    data=[["Gradient Boost", gbr_train_mpe, gbr_test_mpe]],
    columns=['Model', 'Train MPE', 'Test MPE'],
)

print("Performance evaluation: Gradient Boost")
for label, value in (("Training Mpe:", gbr_train_mpe), ("Test Mpe:", gbr_test_mpe)):
    print(label, value)

## Model comparison

In [None]:
import seaborn as sns
# Stack the per-model summary rows into one table. ignore_index=True gives the
# rows a clean 0..n-1 index instead of repeating each one-row frame's index 0.
errors_comp = pd.concat(
    [errors_lr, errors_knn, errors_rf, errors_gbr],
    axis=0,
    ignore_index=True,
)

print(errors_comp)

# Long format: one row per (model, data set) pair, so the train/test columns
# can be mapped to the bar colour via `hue`.
errors_comp_melted = errors_comp.melt(id_vars='Model', var_name='DataSet', value_name='MPE')

# Plot on an explicit figure/axes pair rather than the implicit pyplot state.
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(
    data=errors_comp_melted,
    x='Model',
    y='MPE',
    hue='DataSet',
    palette='viridis',
    width=0.7,
    ax=ax,
)

ax.set_title(f'Mean Positional Error (MPE), {n_datapoints} datapoints')
ax.set_ylabel('Mean Positional Error (MPE)')
ax.set_xlabel('ML Model')
ax.set_yscale('log')  # Using a logarithmic scale to better visualize low values

# Annotate every bar with its value. The trailing "m" in the format is a unit
# suffix -- assumes mean_position_error returns metres; TODO confirm.
for container in ax.containers:
    ax.bar_label(container, fmt='%.3fm', label_type='edge', padding=3)

fig.tight_layout()
plt.show()

## Notes

### Model selection
Pros and cons of each model. How can they be applied and trained efficiently?

### Parameter selection
What parameters give the best results and why?

### Dataset
When do we run into problems with overfitting the model?
And for what number of entries do we get the most accurate models?

### Testing
How should the models be tested, and how can we confirm that the data is
'valid' for testing accuracy?

### Visualization
Are there any good alternatives for visualizing the MPE for different models and sizes of
the dataset?