# Wifi usage in Cartagena by:

- Jose Andrés Jaramillo
- David Parra
- Daniela Martinez Quiroga
- María Isabella Rodríguez Arévalo


## Libraries

In [None]:
!pip install -U ydata_profiling
!pip install keras-tuner -q

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import tensorflow as tf
import keras_tuner as kt
import numpy as np

from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import l2
from keras_tuner import HyperParameters

## Load data and report

In [None]:
# Read the Wi-Fi usage CSV directly from GitHub and build an exploratory
# profiling report; the report object is the cell's displayed output.
DATA_URL = 'https://raw.githubusercontent.com/BoredDany/ML-Python/refs/heads/main/datasets/Wifi-Cartagena.csv'

df = pd.read_csv(DATA_URL)
profile = ProfileReport(df, title="Profiling Report")
profile

# **Project 1**

Supervised Learning: Predicting Consumers in a Cartagena Network


### **Analysis**

1. According to the correlation matrix, every variable except District/Neighborhood and Date is correlated with the others, so those two variables are treated as not influential for the model. Furthermore, the correlations shown in the matrix are all positive (above zero), which is why every cell appears in blue tones.
2. Since the target variable, "consumers", is numerical, the models must predict numerical values (i.e., this is a regression problem).

## Cleaning data

The "Wifi Zone" variable was encoded as integer category codes so that the models can handle it.

In [None]:
# Separate X and Y set.
# The text identifiers, the date and the target are dropped from the features;
# 'Zona Wifi' is re-added as integer category codes (pandas encodes a missing
# category as -1, so this column never contains NaN).
x = df.drop(['Zona Wifi', 'Corregimiento/Barrio', 'Fecha', 'Consumidores'], axis=1)
x['Zona Wifi'] = df['Zona Wifi'].astype('category').cat.codes
# .copy() so that filling the target can never write back into `df`.
y = df['Consumidores'].copy()

# Separate train and test data BEFORE imputing, so that the fill values are
# learned from the training rows only and no test information leaks in.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Replace nulls with the training-set means (no inplace mutation).
feature_means = x_train.mean()
target_mean = y_train.mean()
x_train = x_train.fillna(feature_means)
x_test = x_test.fillna(feature_means)
y_train = y_train.fillna(target_mean)
y_test = y_test.fillna(target_mean)

# Keep the full frames null-free too; `x` is reused by the exploratory plots.
x = x.fillna(feature_means)
y = y.fillna(target_mean)

x

### Boxplot

A boxplot will be created to determine if there are any outliers. If there are any, the models to be used will be defined so as not to eliminate the data.

In [None]:
# Numeric usage features to inspect for outliers, paired by position with the
# fill colour used for each box.
columns = ['Visitas', 'Logins', 'Dispositivos Nuevos', 'Sesiones']
colors = ['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3']  # You can change these hex codes

for idx, col in enumerate(columns):
    color = colors[idx]
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=x, y=col, color=color, ax=ax)
    ax.set_title(f'Diagrama de {col}')
    plt.show()

Looking at the boxplots, we can conclude that since there are quite a few outliers in all variables, it is recommended not to eliminate them and continue with models that handle them well. Therefore, the following will be performed: decision trees, ensembles, and neural networks.

## Models

### Decision tree

In [None]:
# Search space sampled by RandomizedSearchCV for the decision-tree regressor.
param = {
    'criterion': ['squared_error', 'absolute_error', 'friedman_mse'],
    'max_depth': [None, 5, 10, 15, 20],
    'max_features': [0.3, 0.5, 0.7],
    'max_leaf_nodes': [None, 5, 10, 15, 20],
    'min_impurity_decrease': [0.0, 0.0001, 0.0002],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [10, 20, 30],
}

# The estimator itself is seeded: with max_features < 1.0 the tree considers a
# random subset of features at each split, so an unseeded tree yields different
# scores (and a different "best" configuration) on every run.
random_search_tree = RandomizedSearchCV(
    DecisionTreeRegressor(random_state=42),
    param,
    cv=5,
    scoring='neg_mean_squared_error',
    n_iter=10,
    random_state=42,
)
random_search_tree.fit(x_train, y_train)
print(random_search_tree.best_params_)

In [None]:
# Evaluate the tuned tree on both splits; RMSE is reported alongside MSE so the
# error can be read in the target's own units.
y_pred_tree_tr = random_search_tree.predict(x_train)
y_pred_tree_t = random_search_tree.predict(x_test)

MSE_train = mean_squared_error(y_train, y_pred_tree_tr)
MSE_test = mean_squared_error(y_test, y_pred_tree_t)
RMSE_train, RMSE_test = MSE_train**0.5, MSE_test**0.5

tree_report = (
    ("MSE test", MSE_test),
    ("MSE train", MSE_train),
    ("RMSE test", RMSE_test),
    ("RMSE train", RMSE_train),
)
for metric_name, metric_value in tree_report:
    print(f"Tree's {metric_name}: {metric_value}")

#### Decision Tree Analysis

The model generated by the decision tree algorithm yields an RMSE of 6.50 consumers on the test data and 6.08 consumers on the training data. Because the two values are close, the model does not appear to be overfitting, and its predictions are off by roughly 6 consumers.

### Neural network

In [None]:
def build_model(hp):
    """Build and compile the regression network for one Keras Tuner trial.

    The network has five ReLU hidden layers whose widths are the tunable
    hyperparameters ``units_1`` .. ``units_5`` (32 to 512 in steps of 32),
    followed by a single linear output unit. Every layer uses an L2 kernel
    regulariser of 0.001.

    Args:
        hp: hyperparameter container supplied by the tuner; only its
            ``Int(name, min_value=, max_value=, step=)`` method is used.

    Returns:
        The compiled ``Sequential`` model (Adam with learning rate 0.001,
        MSE loss, MAE metric).
    """
    def regularised_dense(width, activation):
        # Every layer shares the same L2 penalty; only width/activation vary.
        return Dense(units=width, activation=activation, kernel_regularizer=l2(0.001))

    hidden_layers = [
        regularised_dense(
            hp.Int(f'units_{position}', min_value=32, max_value=512, step=32),
            'relu',
        )
        for position in range(1, 6)
    ]
    output_layer = regularised_dense(1, 'linear')

    model = Sequential(hidden_layers + [output_layer])
    model.compile(optimizer=tf.keras.optimizers.Adam(0.001), loss='mse', metrics=['mae'])
    return model

# Seed Python, NumPy and TensorFlow so weight initialisation (and therefore the
# tuned and final models) is repeatable across runs.
tf.keras.utils.set_random_seed(42)

model_begin = kt.Hyperband(
    build_model,
    objective='val_loss',
    max_epochs=25,
    factor=3,
    seed=42,
    max_consecutive_failed_trials=5,
    # Start a fresh search instead of silently resuming trials that a previous
    # run cached on disk (possibly computed on differently prepared data).
    overwrite=True,
)

# Standardise the features for the network. The scaler is fitted on the
# training split only and then applied to the test split.
# NOTE: x_train / x_test are replaced by the standardised arrays, and the
# evaluation cells below rely on that. Re-running this cell on its own would
# refit `scaler` on already-standardised data, so re-run from the split cell.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# NOTE(review): Hyperband schedules the epochs of each trial itself, so the
# epochs=10 passed here is presumably overridden - confirm against KerasTuner.
model_begin.search(x_train, y_train, epochs=10, validation_split=0.2, verbose=0)
best_hps_model = model_begin.get_best_hyperparameters(num_trials=1)[0]
print("Mejores hiperparámetros encontrados:")
# Dedicated loop names: the original reused `param`, which overwrote the
# decision-tree search-space dict defined earlier in the notebook.
for hp_name, hp_value in best_hps_model.values.items():
    print(f"{hp_name}: {hp_value}")

# Rebuild the best architecture from scratch and train it for the full budget.
model_final = model_begin.hypermodel.build(best_hps_model)
model_final.fit(x_train, y_train, epochs=25, validation_split=0.2, shuffle=False)

In [None]:
# The network ends in a single linear unit, so predict() returns an
# (n_samples, 1) array of regression values. Flatten it to 1-D.
# (The previous argmax(axis=1) is a classification idiom: on a single-column
# output it always returns 0, so the metrics were computed against all-zero
# predictions.)
y_pred_mod_t = model_final.predict(x_test).ravel()
y_pred_mod_tr = model_final.predict(x_train).ravel()

MSE_train = mean_squared_error(y_train, y_pred_mod_tr)
MSE_test = mean_squared_error(y_test, y_pred_mod_t)
RMSE_train = MSE_train**0.5
RMSE_test = MSE_test**0.5

# Labels corrected: these are the neural network's metrics, not the tree's.
print(f"Neural network's MSE test: {MSE_test}")
print(f"Neural network's MSE train: {MSE_train}")
print(f"Neural network's RMSE test: {RMSE_test}")
print(f"Neural network's RMSE train: {RMSE_train}")

#### Neural Network Analysis

The model generated by the neural network yields an error of 33.30 consumers on the test data and 33.65 consumers on the training data. This indicates that the model is not overfitting, but the error is high, with errors of around 33 consumers, indicating that it is not the most suitable.

### Random forest regressor

In [None]:
# Define the parameter grid for Random Forest
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt']
}

# Model
rf = RandomForestRegressor(random_state=42)

# RandomizedSearchCV: 20 sampled configurations, 5-fold CV, scored by
# negative MSE, using all available cores.
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                                   scoring='neg_mean_squared_error', n_iter=20, cv=5, verbose=2,
                                   random_state=42, n_jobs=-1)

# Search hyperparameters
random_search.fit(x_train, y_train)

# Best model. The predictions get their own names so the decision-tree
# predictions computed earlier are not overwritten.
best_rf = random_search.best_estimator_
y_pred_rf_tr = best_rf.predict(x_train)
y_pred_rf_t = best_rf.predict(x_test)

# Evaluation in training and testing
MSE_train = mean_squared_error(y_train, y_pred_rf_tr)
MSE_test = mean_squared_error(y_test, y_pred_rf_t)

RMSE_train = MSE_train**0.5
RMSE_test = MSE_test**0.5
# Banner corrected: the estimator is a RandomForestRegressor, not a classifier.
print("RANDOM FOREST REGRESSOR")
print(f"Mejores hiperparámetros: {random_search.best_params_}")
print(f"MSE train: {MSE_train}")
print(f"MSE test: {MSE_test}")
print(f"RMSE train: {RMSE_train}")
print(f"RMSE test: {RMSE_test}")

In [None]:
import joblib

# The forest was fitted on features transformed by `scaler` (the
# StandardScaler fitted in the neural-network section), so the scaler must be
# stored with it: without it the saved model cannot be fed raw feature values.
joblib.dump(scaler, 'standard_scaler.pkl')

# Save best model trained
joblib.dump(best_rf, 'random_forest_model.pkl')

## Conclusions

1. The random forest model was selected because its RMSE is lower than that of the other two models on both the training and the test data.

2. None of the three models exhibit overfitting, as their training RMSE is not close to zero and the test RMSE is higher.

3. In the neural network, the ideal number of layers was 6, as this is an important factor when evaluating the MSE and RMSE in large numbers.

# **Project 2**

*Objective:* Predict the variable "Wifi Zone" through clustering

## Imports

In [None]:
# PCA
from sklearn.decomposition import PCA

# DBSCAN
from sklearn.cluster import DBSCAN

# Metrics
from sklearn.metrics import silhouette_score

# Scale data
from sklearn.preprocessing import MinMaxScaler

# Gaussian Mixture
from sklearn.mixture import GaussianMixture

# K means
from sklearn.cluster import KMeans

# Hyperparameter search
from sklearn.model_selection import ParameterGrid

# 3D plots
from mpl_toolkits.mplot3d import Axes3D

## Data

In [None]:
# Clustering input: the numeric columns (identifiers and date removed) with
# nulls replaced by each column's mean, plus 'Zona Wifi' as integer category
# codes, all rescaled with a MinMaxScaler that is kept in `scale` for reuse.
clustering_frame = df.drop(columns=['Zona Wifi', 'Corregimiento/Barrio', 'Fecha'])
clustering_frame = clustering_frame.fillna(clustering_frame.mean())
clustering_frame['Zona Wifi'] = df['Zona Wifi'].astype('category').cat.codes

scale = MinMaxScaler()
x_2 = scale.fit_transform(clustering_frame)

## PCA

In this step, we will transform the data with PCA, and therefore identify the ideal number of components using the elbow rule.

In [None]:
# Fit a full PCA (all components kept) to see how much variance each
# additional component contributes.
pca = PCA().fit(x_2)

# Cumulative share of variance explained by the first k components.
varianza = pca.explained_variance_ratio_.cumsum()

# Elbow diagram of the cumulative explained variance.
fig, ax = plt.subplots(figsize=(8, 5))
component_counts = range(1, len(varianza) + 1)
ax.plot(component_counts, varianza, marker='o', linestyle='--')
ax.set_title('Regla del codo para PCA')
ax.set_xlabel('Número de componentes principales')
ax.set_ylabel('Varianza explicada acumulada')
ax.grid(True)
plt.show()


After observing the diagram, 3 principal components will be used for PCA, following the elbow rule.

In [None]:
# Project the scaled data onto its first 3 principal components; the fitted
# transformer is kept in `pca_final` and the projected data in `x_2_pca`.
pca_final = PCA(n_components=3)
x_2_pca = pca_final.fit_transform(x_2)

## Models

The models will be trained using DBSCAN, Gaussian mixture, and K Means, keeping in mind that unsupervised models are the preferred approach for this deliverable. For each model, models will be created with the data transformed by PCA and without PCA.

### *DBSCAN*

Below are some hyperparameters used to search for the best models.

In [None]:
# Hyperparameter grid for DBSCAN; every eps / min_samples combination is
# tried by the ParameterGrid loops in the search cells below.
param_dbscan = {
    'eps': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9],
    'min_samples': [4, 5, 6, 7]
}

**Data without PCA**

In [None]:
# The silhouette coefficient lies in [-1, 1]. Starting below that range means
# the best configuration is still recorded when every score is <= 0 (starting
# at 0 left `best_params` empty in that case and the prints raised KeyError).
score = float('-inf')
best_params = {}

# Search best hyperparameters
for params in ParameterGrid(param_dbscan):
    print(params)
    dbscan = DBSCAN(**params, metric='manhattan')
    labels = dbscan.fit_predict(x_2)

    # Silhouette needs at least 2 distinct labels; this single check also
    # covers the case where every point is noise (labels == {-1}).
    if len(set(labels)) <= 1:
        continue

    score_f = silhouette_score(x_2, labels, metric='manhattan')
    print(f"score: {score_f}")
    if score_f > score:
        score = score_f
        best_params = params

# Fail with an explicit message instead of a KeyError on the prints below.
if not best_params:
    raise ValueError("Ninguna combinación de eps/min_samples produjo al menos 2 grupos distintos")

print(f"Mejor valor de eps: {best_params['eps']}")
print(f"Mejor valor de min_samples: {best_params['min_samples']}")

In [None]:
# Refit DBSCAN with the best hyperparameters found on the scaled data.
dbscan_no_pca = DBSCAN(**best_params, metric='manhattan')
labels = dbscan_no_pca.fit_predict(x_2)
sil_dbscan_no_pca = silhouette_score(x_2, labels, metric='manhattan')

# DBSCAN marks noise points with the label -1. Noise is not a cluster, so it
# is excluded from the cluster count (the previous len(set(labels)) counted it).
num_ruido_dbscan = list(labels).count(-1)
num_clusters_dbscan = len(set(labels)) - (1 if num_ruido_dbscan else 0)

print(f"Número de clusters: {num_clusters_dbscan}")
print(f"Número de elementos en ruido: {num_ruido_dbscan}")
print(f"Silhouette score: {sil_dbscan_no_pca}")

Now with the model we are going to visualize in a diagram how the clusters are displayed.

In [None]:
# Colour the points by the labels stored on the fitted model rather than by
# the global `labels`, which the hyperparameter-search loops in other cells
# overwrite. The axes show the first two scaled features (x_2 is not
# PCA-transformed), so they are not labelled as components.
plt.figure(figsize=(10, 8))
plt.scatter(x_2[:,0], x_2[:,1], c=dbscan_no_pca.labels_, cmap='rainbow')
plt.title('Clusters DBSCAN')
plt.xlabel('Variable 1 (escalada)')
plt.ylabel('Variable 2 (escalada)')
plt.show()

**Data with PCA**

In [None]:
# The silhouette coefficient lies in [-1, 1]. Starting below that range means
# the best configuration is still recorded when every score is <= 0 (starting
# at 0 left `best_params_pca` empty in that case and the prints raised KeyError).
score_pca = float('-inf')
best_params_pca = {}

# Search best hyperparameters
for params in ParameterGrid(param_dbscan):
    print(params)
    dbscan = DBSCAN(**params, metric='manhattan')
    labels = dbscan.fit_predict(x_2_pca)

    # Silhouette needs at least 2 distinct labels; this single check also
    # covers the case where every point is noise (labels == {-1}).
    if len(set(labels)) <= 1:
        continue

    score_f = silhouette_score(x_2_pca, labels, metric='manhattan')
    print(f"score: {score_f}")
    if score_f > score_pca:
        score_pca = score_f
        best_params_pca = params

# Fail with an explicit message instead of a KeyError on the prints below.
if not best_params_pca:
    raise ValueError("Ninguna combinación de eps/min_samples produjo al menos 2 grupos distintos")

print(f"Mejor valor de eps: {best_params_pca['eps']}")
print(f"Mejor valor de min_samples: {best_params_pca['min_samples']}")

In [None]:
# Refit DBSCAN with the best hyperparameters found on the PCA-projected data.
dbscan_pca = DBSCAN(**best_params_pca, metric='manhattan')
labels_pca = dbscan_pca.fit_predict(x_2_pca)
sil_dbscan_pca = silhouette_score(x_2_pca, labels_pca, metric='manhattan')

# DBSCAN marks noise points with the label -1. Noise is not a cluster, so it
# is excluded from the cluster count (the previous len(set(...)) counted it).
num_ruido_dbscan_pca = list(labels_pca).count(-1)
num_clusters_dbscan_pca = len(set(labels_pca)) - (1 if num_ruido_dbscan_pca else 0)

print(f"Número de clusters: {num_clusters_dbscan_pca}")
print(f"Número de elementos en ruido: {num_ruido_dbscan_pca}")
print(f"Silhouette score: {sil_dbscan_pca}")

Having a model, we now proceed to graph

In [None]:
# Scatter of the first two principal components, coloured by DBSCAN label.
fig, ax = plt.subplots(figsize=(10, 8))
first_component, second_component = x_2_pca[:, 0], x_2_pca[:, 1]
ax.scatter(first_component, second_component, c=labels_pca, cmap='rainbow')
ax.set_title('Clusters DBSCAN con PCA')
ax.set_xlabel('Componente 1')
ax.set_ylabel('Componente 2')
plt.show()


Hypothesis for analyzing the effect of PCA on DBSCAN

Null hypothesis (H₀):
The application of dimensionality reduction using PCA does not have a significant impact on the quality of the clustering generated by the DBSCAN algorithm.

Alternative hypothesis (H₁):
The application of PCA significantly improves the quality of the clustering generated by DBSCAN.

| Configuration | Number of clusters | Points classified as noise | Silhouette Score |
| ------------- | ------------------ | ------------------------------ | ---------------- |
| No PCA       | 2                  | 2                              | 0.4827           |
|    PCA       | 2                  | 2                              | 0.4494           |

Conclusion

The results show that both with and without PCA, the DBSCAN algorithm identifies exactly two clusters and classifies two points as noise. However, the Silhouette Score is slightly lower when applying PCA (0.4494 vs. 0.4827), indicating that the clusters formed are slightly less cohesive and less separated after dimensionality reduction.

Therefore, no improvement in clustering quality is evident when applying PCA before using DBSCAN. From these results, we conclude that the null hypothesis (H₀) is not rejected. That is, in this particular case, the use of PCA does not provide a significant performance benefit to the DBSCAN algorithm for Wi-Fi hotspot segmentation.

### *Gaussian Mixture*

In [None]:
# Hyperparameter grid for GaussianMixture; every combination is tried by the
# ParameterGrid loops below. random_state is a single-value entry so that it
# is passed along with each combination and stored in the selected best set.
param_gaussian = {
    'n_components': [2, 3, 4, 5, 6],
    'covariance_type': ['full', 'tied', 'diag', 'spherical'],
    'random_state': [42]
}

**Data without PCA**

In [None]:
# The silhouette coefficient lies in [-1, 1]. Starting below that range means
# the best configuration is still recorded when every score is <= 0 (starting
# at 0 left `best_params_g` empty in that case and the prints raised KeyError).
score_g = float('-inf')
best_params_g = {}

# Search best hyperparameters
for params in ParameterGrid(param_gaussian):
    print(params)
    gaussian = GaussianMixture(**params)
    labels = gaussian.fit_predict(x_2)

    # Silhouette needs at least 2 populated components. (The previous -1 test
    # was a DBSCAN leftover; no noise label is expected from GaussianMixture.)
    if len(set(labels)) <= 1:
        continue

    score_f = silhouette_score(x_2, labels, metric='manhattan')
    print(f"score: {score_f}")
    if score_f > score_g:
        score_g = score_f
        best_params_g = params

# Fail with an explicit message instead of a KeyError on the prints below.
if not best_params_g:
    raise ValueError("Ninguna configuración de Gaussian Mixture produjo al menos 2 grupos distintos")

# Messages corrected: they previously announced DBSCAN's eps / min_samples.
print(f"Mejor número de componentes: {best_params_g['n_components']}")
print(f"Mejor tipo de covarianza: {best_params_g['covariance_type']}")

In [None]:
# Refit the mixture with the selected hyperparameters on the scaled data and
# summarise the assignment. The -1 count is kept only so this row lines up
# with the DBSCAN rows of the final comparison table.
gaussian_n = GaussianMixture(**best_params_g)
labels_g = gaussian_n.fit_predict(x_2)

assigned_labels = list(labels_g)
num_clusters_gaussian = len(set(assigned_labels))
num_ruido_gaussian = assigned_labels.count(-1)
sil_gaussian_no_pca = silhouette_score(x_2, labels_g, metric='manhattan')

print(f"Número de clusters: {num_clusters_gaussian}")
print(f"Número de elementos en ruido: {num_ruido_gaussian}")
print(f"Silhouette score: {sil_gaussian_no_pca}")

With the model, we proceed to visualize the clusters

In [None]:
# Scatter of the first two scaled features coloured by mixture component.
# x_2 is not PCA-transformed here, so the axes are not labelled as components.
plt.figure(figsize=(10, 8))
plt.scatter(x_2[:,0], x_2[:,1], c=labels_g, cmap='rainbow')
plt.title('Clusters Gaussian Mixture')
plt.xlabel('Variable 1 (escalada)')
plt.ylabel('Variable 2 (escalada)')
plt.show()

**Data with PCA**

In [None]:
# The silhouette coefficient lies in [-1, 1]. Starting below that range means
# the best configuration is still recorded when every score is <= 0 (starting
# at 0 left `best_params_g_pca` empty in that case and the prints raised KeyError).
score_g_pca = float('-inf')
best_params_g_pca = {}

# Search best hyperparameters
for params in ParameterGrid(param_gaussian):
    print(params)
    gaussian = GaussianMixture(**params)
    gaussian.fit(x_2_pca)
    labels = gaussian.predict(x_2_pca)

    # Silhouette needs at least 2 populated components. (The previous -1 test
    # was a DBSCAN leftover; no noise label is expected from GaussianMixture.)
    if len(set(labels)) <= 1:
        continue

    score_f = silhouette_score(x_2_pca, labels, metric='manhattan')
    print(f"score: {score_f}")
    if score_f > score_g_pca:
        score_g_pca = score_f
        best_params_g_pca = params

# Fail with an explicit message instead of a KeyError on the prints below.
if not best_params_g_pca:
    raise ValueError("Ninguna configuración de Gaussian Mixture produjo al menos 2 grupos distintos")

# Messages corrected: they previously announced DBSCAN's eps / min_samples.
print(f"Mejor número de componentes: {best_params_g_pca['n_components']}")
print(f"Mejor tipo de covarianza: {best_params_g_pca['covariance_type']}")

In [None]:
# Refit the mixture with the selected hyperparameters on the PCA-projected
# data and summarise the assignment. The -1 count is kept only so this row
# lines up with the DBSCAN rows of the final comparison table.
gaussian_pca = GaussianMixture(**best_params_g_pca)
labels_g_pca = gaussian_pca.fit_predict(x_2_pca)

assigned_labels_pca = list(labels_g_pca)
num_clusters_gaussian_pca = len(set(assigned_labels_pca))
num_ruido_gaussian_pca = assigned_labels_pca.count(-1)
sil_gaussian_pca = silhouette_score(x_2_pca, labels_g_pca, metric='manhattan')

print(f"Número de clusters: {num_clusters_gaussian_pca}")
print(f"Número de elementos en ruido: {num_ruido_gaussian_pca}")
print(f"Silhouette score: {sil_gaussian_pca}")

With the model, we proceed to make the scatter diagram

In [None]:
# 3D scatter of the three principal components, coloured by mixture component.
fig, ax = plt.subplots(figsize=(10, 8), subplot_kw={'projection': '3d'})
pc1, pc2, pc3 = x_2_pca[:, 0], x_2_pca[:, 1], x_2_pca[:, 2]
ax.scatter(pc1, pc2, pc3, c=labels_g_pca, cmap='rainbow', s=5)
ax.set_title('Clusters Gaussian Mixture')
ax.set_xlabel('Componente 1')
ax.set_ylabel('Componente 2')
ax.set_zlabel('Componente 3')
plt.show()

Hypotheses for Gaussian Mixture and PCA

Null hypothesis (H₀):
The application of dimensionality reduction using PCA does not have a significant impact on the quality of the clustering generated by the Gaussian Mixture algorithm.

Alternative hypothesis (H₁):
The application of PCA significantly improves the quality of the clustering generated by Gaussian Mixture.

Observed results

| Configuration | Number of clusters | Noise points | Silhouette score |
| ------------- | ------------------ | ------------ | ---------------- |
| **NO PCA**   | 2                  | 0            | **0.3986**       |
| **WITH PCA** | 3                  | 0            | **0.3862**       |


Analysis and Interpretation

Applying PCA, the model went from 2 to 3 clusters, suggesting a greater ability to identify subgroups.

However, the Silhouette Score decreased slightly, from 0.3986 to 0.3862, indicating that the new clusters are neither more compact nor better separated.

Visually, PCA shows clearer 3D segmentation, although quantitatively it does not represent an improvement in cohesion/separation.

Conclusion

Although the Gaussian Mixture model with PCA was able to detect more clusters, the slight decrease in the Silhouette Score indicates that the clustering quality did not improve significantly.

Therefore, the null hypothesis (H₀) is not rejected. That is, in this case, applying PCA does not contribute to a substantial improvement in the performance of the Gaussian Mixture algorithm in terms of clustering quality.

### *K-Means*

For the next part, since we don't know the ideal number of clusters for the model, we will perform a search using the elbow rule and the silhouette method.

**Data without PCA**

In [None]:
# Sweep k = 2..8 on the scaled data, recording inertia and centroids for the
# elbow plot and printing the silhouette score of each fit.
inertias, centroids_list = [], []

for k in range(2, 9):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(x_2)
    label = kmeans.labels_
    inertias.append(kmeans.inertia_)
    centroids_list.append(kmeans.cluster_centers_)
    silhouette_k = silhouette_score(x_2, label, metric='manhattan')

    print(f'K = {k}')
    print('Centroides:\n', kmeans.cluster_centers_)
    print('Inercia:', kmeans.inertia_)
    print(f"Silhouette score: {silhouette_k}")
    print('-' * 50)

In [None]:
# Elbow diagram: inertia versus number of clusters for the scaled data.
fig, ax = plt.subplots(figsize=(8, 5))
cluster_counts = range(2, 9)
ax.plot(cluster_counts, inertias, marker='o')
ax.set_title('Método del Codo')
ax.set_xlabel('Número de Clústeres (k)')
ax.set_ylabel('Inercia')
ax.grid(True)
plt.show()

Based on the elbow diagram, we will use 3 clusters for K-Means.

In [None]:
# Final K-Means (k = 3) on the scaled data. The -1 count is kept only so this
# row lines up with the DBSCAN rows of the final comparison table.
kmeans_normal = KMeans(n_clusters=3, random_state=42, n_init=10).fit(x_2)
labels_kmeans_normal = kmeans_normal.labels_

assigned_kmeans = list(labels_kmeans_normal)
num_clusters_kmeans = len(set(assigned_kmeans))
num_ruido_kmeans = assigned_kmeans.count(-1)
sil_kmeans_no_pca = silhouette_score(x_2, labels_kmeans_normal, metric='manhattan')

print(f"Número de clusters: {num_clusters_kmeans}")
print(f"Número de elementos en ruido: {num_ruido_kmeans}")
print(f"Inercia: {kmeans_normal.inertia_}")
print(f"Silhouette score: {sil_kmeans_no_pca}")

With the model defined, the following scatter diagram is made

In [None]:
# 3D scatter of the first three scaled features coloured by K-Means cluster.
# Title corrected (it was copied from the Gaussian Mixture plot), and since
# x_2 is not PCA-transformed the axes are not labelled as components.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x_2[:,0], x_2[:,1], x_2[:,2], c=labels_kmeans_normal, cmap='rainbow', s=5)
ax.set_title('Clusters K-Means')
ax.set_xlabel('Variable 1 (escalada)')
ax.set_ylabel('Variable 2 (escalada)')
ax.set_zlabel('Variable 3 (escalada)')
plt.show()

**Data with PCA**

In [None]:
# Sweep k = 2..8 on the PCA-projected data, recording inertia and centroids
# for the elbow plot and printing the silhouette score of each fit.
inertias_pca, centroids_list_pca = [], []

for k in range(2, 9):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(x_2_pca)
    label = kmeans.labels_
    inertias_pca.append(kmeans.inertia_)
    centroids_list_pca.append(kmeans.cluster_centers_)
    silhouette_k = silhouette_score(x_2_pca, label, metric='manhattan')

    print(f'K = {k}')
    print('Centroides:\n', kmeans.cluster_centers_)
    print('Inercia:', kmeans.inertia_)
    print(f"Silhouette score: {silhouette_k}")
    print('-' * 50)

In [None]:
# Elbow diagram: inertia versus number of clusters for the PCA-projected data.
fig, ax = plt.subplots(figsize=(8, 5))
cluster_counts = range(2, 9)
ax.plot(cluster_counts, inertias_pca, marker='o')
ax.set_title('Método del Codo')
ax.set_xlabel('Número de Clústeres (k)')
ax.set_ylabel('Inercia')
ax.grid(True)
plt.show()

Based on the elbow diagram, we will use 3 clusters for K-Means on the PCA-transformed data.

In [None]:
# Final K-Means (k = 3) on the PCA-projected data. The -1 count is kept only
# so this row lines up with the DBSCAN rows of the final comparison table.
kmeans_pca = KMeans(n_clusters=3, random_state=42, n_init=10).fit(x_2_pca)
labels_kmeans_pca = kmeans_pca.labels_

assigned_kmeans_pca = list(labels_kmeans_pca)
num_clusters_kmeans_pca = len(set(assigned_kmeans_pca))
num_ruido_kmeans_pca = assigned_kmeans_pca.count(-1)
sil_kmeans_pca = silhouette_score(x_2_pca, labels_kmeans_pca, metric='manhattan')

print(f"Número de clusters: {num_clusters_kmeans_pca}")
print(f"Número de elementos en ruido: {num_ruido_kmeans_pca}")
print(f"Inercia: {kmeans_pca.inertia_}")
print(f"Silhouette score: {sil_kmeans_pca}")

With the model, the scatter plot is now made

In [None]:
# 3D scatter of the three principal components coloured by K-Means cluster.
# Title corrected: it was copied from the Gaussian Mixture plot.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x_2_pca[:,0], x_2_pca[:,1], x_2_pca[:,2], c=labels_kmeans_pca, cmap='rainbow', s=5)
ax.set_title('Clusters K-Means con PCA')
ax.set_xlabel('Componente 1')
ax.set_ylabel('Componente 2')
ax.set_zlabel('Componente 3')
plt.show()

Hypotheses for K-Means and PCA

Null hypothesis (H₀):
The application of dimensionality reduction using PCA does not have a significant impact on the quality of the clustering generated by the K-Means algorithm.

Alternative hypothesis (H₁):
The application of PCA significantly improves the quality of the clustering generated by K-Means.

Observed results

| Configuration | Number of clusters | Noise points | Inertia | Silhouette score |
| ------------- | ------------------ | ------------ | ------- | ---------------- |
| **No PCA**   | 3                  | 0            | 358.17  | 0.3378           |
| **With PCA** | 3                  | 0            | 316.54  | 0.3921           |


Analysis and Interpretation

In both cases, K-Means finds three clusters, with no points labeled as noise.

The use of PCA reduces inertia (from 358.17 to 316.54), implying that the points are closer to their respective centroids after the transformation.

The Silhouette Score increases when applying PCA (from 0.3378 to 0.3921), suggesting that the clusters are more compact and better separated.

Visually, the 3D representation after PCA shows clearer segmentation.

Conclusion

In the case of the K-Means algorithm, the application of PCA improves both the cohesion and separation of the clusters, as evidenced by the reduction in inertia and the increase in the Silhouette Score.

Therefore, the null hypothesis (H₀) is rejected and the alternative hypothesis (H₁) is accepted. That is, in this case, the use of PCA does significantly improve the quality of the clustering generated by the K-Means algorithm.

## **Final Analysis**

In [None]:
import joblib

# Persist the Gaussian Mixture model that was fitted on the PCA-projected data.
joblib.dump(gaussian_pca, 'gaussian_pca_model.pkl')

In [None]:
# Persist the 3-component PCA transformer fitted on the scaled clustering data.
joblib.dump(pca_final, 'pca_model.pkl')

In [None]:
# Persist the MinMaxScaler fitted on the clustering features.
joblib.dump(scale, 'minmax_scaler.pkl')

In [None]:
# Comparison table: one row per clustering model with its silhouette score,
# number of clusters and number of points labelled as noise.
encabezados = ['Modelo', 'Silhouette Score', 'Número de clusters', 'Número de elementos en ruido']
filas = [
    ('DBSCAN', sil_dbscan_no_pca, num_clusters_dbscan, num_ruido_dbscan),
    ('DBSCAN con PCA', sil_dbscan_pca, num_clusters_dbscan_pca, num_ruido_dbscan_pca),
    ('Gaussian Mixture', sil_gaussian_no_pca, num_clusters_gaussian, num_ruido_gaussian),
    ('Gaussian Mixture con PCA', sil_gaussian_pca, num_clusters_gaussian_pca, num_ruido_gaussian_pca),
    ('K-Means', sil_kmeans_no_pca, num_clusters_kmeans, num_ruido_kmeans),
    ('K-Means con PCA', sil_kmeans_pca, num_clusters_kmeans_pca, num_ruido_kmeans_pca),
]

# Transpose the rows into the column-oriented dict the DataFrame is built from.
tabla = {
    encabezado: [fila[posicion] for fila in filas]
    for posicion, encabezado in enumerate(encabezados)
}

df_tabla = pd.DataFrame(tabla)
print(df_tabla)

According to the silhouette method, which evaluates the quality of the models and how well the clusters have been grouped, the best model is DBSCAN because it is closest to 1, indicating that it is the best clustered model. However, observing the scatter plots, DBSCAN does not demonstrate an ideal classification of points into clusters by assigning only 2 points to a group; therefore, the k-means with PCA and Gaussian Mixture with PCA models are considered better, since the distribution of elements among the clusters is visualized. With this in mind, we will use the Gaussian Mixture with PCA model because of its ideal classification of elements visually and because it has a better score in the silhouette method than K-means.

Model building hypothesis

| Model | Visits | Logins | New Devices | Sessions | Consumers | Different Wi-Fi Zones | Different Neighborhoods |
| ---------------- | ------- | ------ | ------------------- | -------- | ------------ | -------------------- | ----------------- |
| DBSCAN\_PCA\_0   | 28.44   | 25.83  | 2.72                | 12.06    | 26.72        | 3                    | 3                 |
| DBSCAN\_PCA\_1   | 13.13   | 11.89  | 1.40                | 6.58     | 12.83        | 5                    | 5                 |
| GMM\_PCA\_0      | 18.56   | 17.28  | 1.94                | 8.61     | 17.47        | 2                    | 2                 |
| GMM\_PCA\_1      | 11.08   | 9.81   | 1.19                | 5.53     | 10.81        | 2                    | 2                 |
| GMM\_PCA\_2      | 30.11   | 27.55  | 2.86                | 12.87    | 27.93        | 4                    | 4                 |
| KMeans\_PCA\_0   | 11.08   | 9.81   | 1.19                | 5.53     | 10.81        | 2                    | 2                 |
| KMeans\_PCA\_1   | 30.11   | 27.55  | 2.86                | 12.87    | 27.93        | 4                    | 4                 |
| KMeans\_PCA\_2   | 18.56   | 17.28  | 1.94                | 8.61     | 17.47        | 2                    | 2                 |
| DBSCAN\_noPCA\_0 | 28.44   | 25.83  | 2.72                | 12.06    | 26.72        | 3                    | 3                 |
| DBSCAN\_noPCA\_1 | 13.13   | 11.89  | 1.40                | 6.58     | 12.83        | 5                    | 5                 |
| GMM\_noPCA\_0    | 30.11   | 27.55  | 2.86                | 12.87    | 27.93        | 4                    | 4                 |
| GMM\_noPCA\_1    | 18.56   | 17.28  | 1.94                | 8.61     | 17.47        | 2                    | 2                 |
| GMM\_noPCA\_2    | 11.08   | 9.81   | 1.19                | 5.53     | 10.81        | 2                    | 2                 |
| KMeans\_noPCA\_0 | 11.08   | 9.81   | 1.19                | 5.53     | 10.81        | 2                    | 2                 |
| KMeans\_noPCA\_1 | 18.56   | 17.28  | 1.94                | 8.61     | 17.47        | 2                    | 2                 |
| KMeans\_noPCA\_2 | 30.11   | 27.55  | 2.86                | 12.87    | 27.93        | 4                    | 4                 |
