## Validation Dataframe (100 FILES)

In [1]:
import dask
import dask.dataframe as dd
import joblib
import pandas as pd
import xgboost as xgb
from imblearn.under_sampling import RandomUnderSampler
from keras.models import load_model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


def perform_tasks_with_dask(file_pattern, num_workers=16, threads_per_worker=12, use_gpu=True):
    """
    Configure Dask and build a lazy DataFrame with the modelling columns of a set of CSV files.

    The function does not print anything and does not load the full data:
    the returned Dask DataFrame is only evaluated when the caller computes it.

    :param file_pattern: A file pattern to match CSV files.
    :param num_workers: Value stored under the ``num_workers`` key of the Dask config.
    :param threads_per_worker: Value stored under the ``threads_per_worker`` key of the Dask config.
    :param use_gpu: Value stored under the ``use_gpu`` key of the Dask config.
    :return: A Dask DataFrame restricted to the six feature columns and ``stableCruise_boolean``.
    """
    # NOTE(review): this only writes the four values into Dask's global config.
    # `threads_per_worker` and `use_gpu` do not look like settings the threaded
    # scheduler consults, so they probably have no effect here -- confirm.
    dask.config.set(scheduler='threads', num_workers=num_workers, threads_per_worker=threads_per_worker, use_gpu=use_gpu)

    # assume_missing=True makes Dask treat integer columns as possibly containing
    # missing values, so they are read as floats.
    df = dd.read_csv(file_pattern, assume_missing=True, header=0)

    # No reset_index() here: the extra "index" column it creates is not among the
    # columns kept below, so it was built only to be thrown away.
    columns_to_keep = [
        'col5_float',
        'col7_float',
        'col13_float',
        'col15_float',
        'col23_float',
        'col27_float',
        'stableCruise_boolean',
    ]
    return df[columns_to_keep]

if __name__ == "__main__":
    file_pattern = "../../Data/V_Data/*.csv"
    df = perform_tasks_with_dask(file_pattern)

    # Preview the first rows with Dask's own head(): by default it evaluates only
    # the first partition, whereas df.compute().head(10) would load every CSV
    # file into memory just to show ten rows.
    print(df.head(10))

   col5_float  col7_float  col13_float  col15_float  col23_float  col27_float  \
0    0.000000    0.000000     0.590000     8.500000    15.000000        101.0   
1   59.596924    0.000000     0.000000    22.092905    14.770059        100.0   
2   59.596924    0.000000     0.000000    22.092905    14.770059        100.0   
3   83.054077  389.560852     0.019531    22.092905    14.859506        100.0   
4   83.054077  389.560852     0.019531    22.092905    14.859506        100.0   
5   83.054077   12.898486     0.019531    22.092905    14.859506        100.0   
6   83.054077   12.898486     0.019531    22.092905    14.859506        100.0   
7   83.054077   12.898486     0.019531    22.092905    14.859506        100.0   
8   83.054077   12.898486     0.019531    22.092905    14.859506        100.0   
9   83.054077   12.898486     0.019531    22.092905    14.859506        100.0   

   stableCruise_boolean  
0                   0.0  
1                   0.0  
2                   0.0  
3   

In [2]:
# Materialise the lazy Dask frame once. Each .compute() call re-reads and
# re-parses all of the CSV files, so computing separately for X and y doubled the work.
validation_frame = df.compute()

feature_columns = ['col5_float', 'col7_float', 'col13_float', 'col15_float', 'col23_float', 'col27_float']

# .copy() gives X_val its own data, so adding columns to it later is not
# reported by pandas as a write to a slice of validation_frame.
X_val = validation_frame[feature_columns].copy()
y_val = validation_frame['stableCruise_boolean']

## Winsorization for outliers

In [3]:
def winsorize_variable(data, variable, new_column_name=None, lower_percentile=0.01, upper_percentile=0.99):
    """
    Winsorize one column of a DataFrame and store the result in a new column.

    Values below the lower quantile are raised to it and values above the upper
    quantile are lowered to it. ``data`` is modified in place (the new column is
    added to it) and nothing is returned. The source column is left unchanged.

    Args:
        data (pd.DataFrame): The DataFrame that holds the data.
        variable (str): Name of the column to winsorize.
        new_column_name (str | None): Name of the column that receives the winsorized
            values. When omitted or None, ``variable + "_winsorized"`` is used.
        lower_percentile (float): Lower quantile used as the floor (default 0.01).
        upper_percentile (float): Upper quantile used as the ceiling (default 0.99).
    """
    if new_column_name is None:
        new_column_name = variable + "_winsorized"

    # The limits are taken from the column passed in, i.e. from `data` itself.
    lower_limit = data[variable].quantile(lower_percentile)
    upper_limit = data[variable].quantile(upper_percentile)

    data[new_column_name] = data[variable].clip(lower_limit, upper_limit)

In [4]:
# Clip col7_float to the function's default 1%-99% quantile range and store the
# result in a new column, wzcol7_float.
# NOTE(review): the limits are computed from X_val itself -- confirm this matches
# how the feature was prepared when the models were trained.
winsorize_variable(X_val, variable="col7_float", new_column_name="wzcol7_float")

In [5]:
# Keep only the winsorized version of col7. wzcol7_float was appended last, so
# it ends up as the final column of X_val.
# NOTE(review): confirm this column order is the one the saved models expect.
X_val = X_val.drop(columns=['col7_float'])

## Model 1 (Logistic Regression)

In [6]:
# joblib is imported once in the notebook's import cell.
# NOTE: joblib.load unpickles the file, and unpickling can run arbitrary code --
# only load model files that come from a trusted source.
model_filename = 'L2_Undersamplin_logistic_7var_model2.pkl'
loaded_model1 = joblib.load(model_filename)

In [7]:
# Predict validation labels with the first loaded model.
y_pred_val = loaded_model1.predict(X_val)

# Compute every metric up front so that the rest of the cell is pure reporting.
accuracy_val = accuracy_score(y_val, y_pred_val)
conf_matrix = confusion_matrix(y_val, y_pred_val)
class_report = classification_report(y_val, y_pred_val)

# Report overall accuracy first, then the per-class breakdown.
print("\nModel accuracy on the validation set:", accuracy_val)
print('Confusion Matrix for the validation set:', conf_matrix, sep='\n')
print('\nClassification Report for the validation set:', class_report, sep='\n')


Model accuracy on the validation set: 0.8081465231092604
Confusion Matrix for the validation set:
[[10673753  2530542]
 [    6542    13234]]

Classification Report for the validation set:
              precision    recall  f1-score   support

         0.0       1.00      0.81      0.89  13204295
         1.0       0.01      0.67      0.01     19776

    accuracy                           0.81  13224071
   macro avg       0.50      0.74      0.45  13224071
weighted avg       1.00      0.81      0.89  13224071



## Model 2 (XGBoost)

In [8]:
# joblib is already imported earlier in the notebook; the repeated import was redundant.
# NOTE: joblib.load unpickles the file, and unpickling can run arbitrary code --
# only load model files that come from a trusted source.
model_filename = 'L2_Undersamplin_xgboost_7var_model2.pkl'
loaded_model2 = joblib.load(model_filename)

In [9]:
# Predict validation labels with the second loaded model.
y_pred_val = loaded_model2.predict(X_val)

# Compute every metric up front so that the rest of the cell is pure reporting.
accuracy_val = accuracy_score(y_val, y_pred_val)
conf_matrix = confusion_matrix(y_val, y_pred_val)
class_report = classification_report(y_val, y_pred_val)

# Report overall accuracy first, then the per-class breakdown.
print("\nValidation set accuracy:", accuracy_val)
print('Confusion Matrix for the validation set:', conf_matrix, sep='\n')
print('\nClassification Report for the validation set:', class_report, sep='\n')


Validation set accuracy: 0.9621413103423295
Confusion Matrix for the validation set:
[[12706882   497413]
 [    3233    16543]]

Classification Report for the validation set:
              precision    recall  f1-score   support

         0.0       1.00      0.96      0.98  13204295
         1.0       0.03      0.84      0.06     19776

    accuracy                           0.96  13224071
   macro avg       0.52      0.90      0.52  13224071
weighted avg       1.00      0.96      0.98  13224071



## Model 3 (Deep Learning)

In [10]:
# load_model is imported in the notebook's import cell together with the other dependencies.
loaded_model3 = load_model('deepLearning_undersampling_model2.keras')

In [11]:
# Turn the network's outputs into 0/1 labels by thresholding at 0.5.
y_pred_val = (loaded_model3.predict(X_val) > 0.5).astype(int)

# Compute every metric up front so that the rest of the cell is pure reporting.
accuracy_val = accuracy_score(y_val, y_pred_val)
conf_matrix = confusion_matrix(y_val, y_pred_val)
class_report = classification_report(y_val, y_pred_val)

# Per-class breakdown first, overall accuracy last.
print('Confusion Matrix for the validation set:', conf_matrix, sep='\n')
print('\nClassification Report for the validation set:', class_report, sep='\n')

print('Accuracy for the validation set:', accuracy_val)

Confusion Matrix for the validation set:
[[10359157  2845138]
 [    1600    18176]]

Classification Report for the validation set:
              precision    recall  f1-score   support

         0.0       1.00      0.78      0.88  13204295
         1.0       0.01      0.92      0.01     19776

    accuracy                           0.78  13224071
   macro avg       0.50      0.85      0.45  13224071
weighted avg       1.00      0.78      0.88  13224071

Accuracy for the validation set: 0.7847305871240406
