In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from pycaret.classification import * # Machine learning tools
import imblearn  # Resampling
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import CondensedNearestNeighbour 
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.metrics import ConfusionMatrixDisplay # Model evaluation
import matplotlib.pyplot as plt # Visualization
import seaborn as sns
from sklearn.metrics import classification_report # Model evaluation report
from ipywidgets import widgets # Interactive functions
from sklearn.metrics import classification_report

In [None]:
# Load the training dataset from the DATA folder under the working directory.
# Earlier CSV-based loader, kept disabled for reference:
# df = pd.read_csv("DATA/20240812_vortex_trainning_float_formmated.csv", encoding='latin-1')
df = pd.read_excel(
    './DATA/20240812_vortex_trainning_float_formmated.xlsx',
    engine='openpyxl',
)
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

In [None]:
# Basic feature selection: discard the first 22 columns of the raw frame,
# which are not needed for this experiment, and keep everything else.
data = df.drop(columns=df.columns[:22])
# Missing-value check: the cell displays True if any remaining cell is null.
data.isna().any().any()

In [None]:
# Disabled step kept for reference: data = data.drop(['suma'], axis=1)
# Bring the site label and the sample identifier over from the raw frame;
# the raw 'ID' column is stored under the lowercase name 'id'.
data = data.assign(Site=df['Site'], id=df['ID'])
data.info()

In [None]:
# Step 1: count the rows whose 'id' repeats an earlier row.
duplicados = data['id'].duplicated().sum()
print(f"Número de ids duplicados: {duplicados}")

# Step 2: when repeats exist, list every occurrence of the repeated ids, sorted.
if duplicados > 0:
    es_repetido = data['id'].duplicated(keep=False)
    print("IDs duplicados:")
    print(data.loc[es_repetido, 'id'].sort_values())

# Step 3: keep only the first row for each id. ignore_index=True renumbers the
# result 0..n-1, which stands in for a separate index reset at the end.
data_sin_duplicados = data.drop_duplicates(subset='id', keep='first', ignore_index=True)

# Step 4: compare the sizes before and after deduplication.
print(f"Tamaño original de data: {data.shape}")
print(f"Tamaño de data sin duplicados: {data_sin_duplicados.shape}")

# Step 5: confirm that no repeated ids are left.
duplicados_restantes = data_sin_duplicados['id'].duplicated().sum()
print(f"Número de ids duplicados restantes: {duplicados_restantes}")

# Step 6: continue the notebook with the deduplicated frame under the name `data`.
data = data_sin_duplicados

In [None]:
# Number of distinct ids in the frame.
data['id'].nunique()

We will create a random feature to be used as a threshold for choosing important features later on.

In [None]:
# Print a summary of `data`, then display its column labels as the cell output.
data.info()
data.columns

In [None]:
# Distribution of the target: number of rows for each 'Site' value.
target = data['Site'].value_counts()
target

In [None]:
# Remove under-represented classes: every row whose 'Site' has FEWER than
# MIN_CASES_PER_SITE rows (according to the counts stored in `target`) is dropped.
MIN_CASES_PER_SITE = 10

casos = target[target < MIN_CASES_PER_SITE]      # rare sites with their row counts
values = casos.index                             # labels of the rare sites
deleted_cases = data[data['Site'].isin(values)]  # rows being removed, kept for inspection
case_del = deleted_cases.index

# Rebind `data` to a new frame instead of dropping in place, so the object that
# `data` was bound to before this cell (also reachable as `data_sin_duplicados`)
# is not mutated. The index is not renumbered here.
data = data.drop(case_del)

In [None]:
# Rows per 'Site' value in the current `data` frame.
data['Site'].value_counts()

In [None]:
# Persist the current `data` frame to an Excel workbook in the DATA folder.
# NOTE(review): no `index=` argument is passed, so the DataFrame index is presumably
# written as an extra first column — confirm that is wanted.
data.to_excel('./DATA/FinalTrainingData.xlsx')

In [None]:
# Final Validation Set (FVS): sample 90% of the rows (fixed seed) for further
# work and set the rows that were not sampled aside as the final validation set.
data_trainning = data.sample(frac=0.90, random_state=786)
fvs = data.drop(data_trainning.index).reset_index(drop=True)
data_trainning = data_trainning.reset_index(drop=True)

print(f'Data for further procedures: {data_trainning.shape}')
print(f'Data for final validation: {fvs.shape}')


In [None]:
# Save the final validation set to CSV in the DATA folder.
# NOTE(review): no `index=` argument is passed, so the index is presumably written as
# an unnamed first column — confirm that is wanted.
fvs.to_csv('./DATA/FVS.csv')

In [None]:
# Class distribution in the final validation set.
# The frame holding that set is `fvs`; the name `vs1` used here before is not
# defined anywhere above this cell and raises NameError on a fresh kernel.
fvs['Site'].value_counts()

In [None]:
# Class distribution in the training portion of the split.
data_trainning['Site'].value_counts()

#### Creating our Classification function:

In [None]:
# Target vector: the 'Site' label.
y = data_trainning['Site']
# Feature matrix: every column except the label and the identifier.
X = data_trainning.drop(columns=['Site', 'id'])

#### Data Augmentation of minority classes (SMOTE)
We carry out the Synthetic Minority Oversampling Technique.

In [None]:
# Oversampling strategy: SMOTE, fitted on the training features and labels.
# A fixed random_state makes the resampled data reproducible on Restart & Run All;
# the seed matches the one used for the train/validation split.
smote = SMOTE(random_state=786)
X_smote, y_smote = smote.fit_resample(X, y)

# Rows per class after resampling: bar chart first, then the counts as cell output.
class_counts = y_smote.value_counts()
class_counts.plot(kind='bar')
#plt.savefig('SMOTE_distribution.png',dpi=200, bbox_inches='tight')
class_counts

#### We have created a balanced synthetic data set with 752 cases per class.

In [None]:
# Build the final resampled dataset: the resampled features plus a 'Site' label column.
# `assign` returns a new frame, so `X_smote` stays a pure feature matrix. Binding
# `smote_data = X_smote` and then adding the column would also add 'Site' to `X_smote`,
# because both names would refer to the same object.
smote_data = X_smote.assign(Site=y_smote)

In [None]:
# Final dataset shape after resampling. It is printed explicitly because a bare
# expression that is not the last line of a cell is evaluated but never displayed.
print(smote_data.shape)
# Persist the resampled training dataset to Excel, then summarize its columns.
smote_data.to_excel('./DATA/20241011FinalResampledTrainingDataset.xlsx')
smote_data.info()