**BIBLIOTECAS NECESARIAS**

In [1]:
# Standard library: environment lookup and filesystem paths
import os
from pathlib import Path

# Libraries for numerical operations and data handling
import numpy as np  # Numerical operations
import pandas as pd  # Tabular data handling

# Data preprocessing tools
from sklearn.preprocessing import StandardScaler  # Feature scaling to normalize data
from imblearn.under_sampling import RandomUnderSampler  # Random undersampling for class balancing

**BASE DE DATOS 1**

In [2]:
# LOAD THE DATASET FROM A CSV FILE
# The data directory can be overridden with the PROJECT_DATA_DIR environment
# variable so the notebook is not tied to a single machine. When the variable is
# unset (or empty), the original absolute location is used unchanged.
DATA_DIR = Path(
    os.environ.get('PROJECT_DATA_DIR')
    or 'C:\\Users\\mmene\\OneDrive\\Escritorio\\RESULTADOS PROYECTO\\DATA'
)
trans_fraud = pd.read_csv(DATA_DIR / 'credit_card_fraud_detection.csv')
trans_fraud.head(10)  # Display the first 10 rows as the cell output

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
5,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
6,4.0,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
7,7.0,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
8,7.0,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,93.2,0
9,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


In [None]:
# FEATURE PREPROCESSING AND SCALING FOR THE DATASET

# Step 1: split the table into the target and the features.
# 'y' is the 'Class' column (the target labels); 'X' is every other column.
y = trans_fraud['Class']
X = trans_fraud.drop(columns=['Class'])

# Step 2: create a StandardScaler and fit it on the features.
# NOTE(review): the scaler is fitted on every row of the dataset; if a train/test
# split is made later, consider fitting on the training rows only — confirm.
scaler = StandardScaler().fit(X)

# Step 3: apply the fitted scaler so that each feature ends up with
# mean 0 and standard deviation 1.
scaled = scaler.transform(X)

# Step 4: wrap the scaled array in a DataFrame that carries the original column names,
# so the scaled features stay easy to identify.
X_scaled = pd.DataFrame(data=scaled, columns=X.columns)

# Step 5: rebuild the full dataset by placing the target next to the scaled features.
# The target's index is reset so that it lines up with the fresh index of X_scaled.
trans_fraud_scaled = pd.concat(
    [X_scaled, y.reset_index(drop=True)],
    axis='columns',
)

# Step 6: display the first rows to check that the scaled dataset is complete.
trans_fraud_scaled.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.996583,-0.694242,-0.044075,1.672773,0.973366,-0.245117,0.347068,0.193679,0.082637,0.331128,...,-0.024923,0.382854,-0.176911,0.110507,0.246585,-0.39217,0.330892,-0.063781,0.244964,0
1,-1.996583,0.608496,0.161176,0.109797,0.316523,0.043483,-0.06182,-0.0637,0.071253,-0.232494,...,-0.307377,-0.880077,0.162201,-0.561131,0.320694,0.261069,-0.022256,0.044608,-0.342475,0
2,-1.996562,-0.6935,-0.811578,1.169468,0.268231,-0.364572,1.351454,0.639776,0.207373,-1.378675,...,0.337632,1.063358,1.45632,-1.138092,-0.628537,-0.288447,-0.137137,-0.181021,1.160686,0
3,-1.996562,-0.493325,-0.112169,1.182516,-0.609727,-0.007469,0.93615,0.192071,0.316018,-1.262503,...,-0.147443,0.007267,-0.304777,-1.941027,1.241904,-0.460217,0.155396,0.186189,0.140534,0
4,-1.996541,-0.59133,0.531541,1.021412,0.284655,-0.295015,0.071999,0.479302,-0.22651,0.744326,...,-0.012839,1.100011,-0.220123,0.23325,-0.395202,1.041611,0.54362,0.651816,-0.073403,0


In [4]:
# INITIAL CLASS DISTRIBUTION
# Count how many rows carry each 'Class' label before any resampling.
fraud_class_counts = trans_fraud_scaled['Class'].value_counts()
print(fraud_class_counts)

Class
0    284315
1       492
Name: count, dtype: int64


In [5]:
# RESAMPLING (UNDERSAMPLING) TO BALANCE THE CLASSES IN THE DATASET

# Random undersampler with a fixed seed so the selection is reproducible.
rus = RandomUnderSampler(random_state=42)

# Resample the scaled features together with their class labels.
X_resampled, y_resampled = rus.fit_resample(X_scaled, y)

# Build the resampled table: the feature columns first, then the resampled
# labels appended as the 'Class' column.
trans_fraud_resampled = pd.DataFrame(X_resampled, columns=X.columns).assign(
    Class=y_resampled
)

# Display the first rows of the resampled table.
trans_fraud_resampled.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
138028,-0.260357,0.671131,0.357682,-0.439632,0.506096,0.218786,-0.844775,0.314351,-0.241461,-0.120275,...,-0.231861,-0.592056,-0.226341,-0.330547,1.226778,0.828399,-0.08503,0.096014,-0.350191,0
63099,-0.932021,-0.407758,0.71767,0.596568,0.490572,0.158697,-0.239663,0.400323,0.116606,-0.691965,...,0.2754,0.797436,-0.14772,0.022658,-0.472812,-0.788131,-0.981167,-0.342038,-0.336517,0
73411,-0.835765,-0.199689,-0.148694,0.740031,-0.924328,-0.463607,0.006514,-0.566897,-0.02287,-2.392847,...,-0.181731,0.161779,-0.307062,-0.806811,-0.594259,0.016798,0.405608,0.725824,-0.293258,0
164247,0.458181,-0.030787,0.645001,-0.651224,-0.020883,0.127786,-1.012212,0.62699,0.112901,-0.136292,...,0.484091,1.250612,-0.029553,-0.208487,-0.652096,-0.311649,-0.058553,0.12824,-0.125338,0
148999,-0.092231,0.943707,0.226102,0.17759,2.730794,0.063802,0.728417,-0.583583,0.197583,0.62213,...,0.140993,0.855661,0.315596,1.143229,-0.3962,-0.044228,-0.049111,-0.129306,-0.353229,0


In [6]:
# FINAL CLASS DISTRIBUTION AFTER RANDOM UNDERSAMPLING (RUS)
# Count how many rows carry each 'Class' label in the resampled table.
fraud_class_counts_rus = trans_fraud_resampled['Class'].value_counts()
print(fraud_class_counts_rus)

Class
0    492
1    492
Name: count, dtype: int64


In [9]:
# Save the resampled DataFrame as an Excel file.
# index=False leaves the row index out of the sheet; the file name is relative,
# so it is written to the current working directory.
trans_fraud_resampled.to_excel(
    excel_writer="credit_card_fraud_detection_RUS.xlsx",
    index=False,
    engine='openpyxl',
)

**BASE DE DATOS 2**

In [12]:
# LOAD THE DATASET FROM A CSV FILE
# The data directory can be overridden with the PROJECT_DATA_DIR environment
# variable so the notebook is not tied to a single machine. When the variable is
# unset (or empty), the original absolute location is used unchanged.
DATA_DIR = Path(
    os.environ.get('PROJECT_DATA_DIR')
    or 'C:\\Users\\mmene\\OneDrive\\Escritorio\\RESULTADOS PROYECTO\\DATA'
)
# This file uses ';' as the field separator.
default_credit = pd.read_csv(DATA_DIR / 'default_of_credit_card_clients.csv', sep=";")
default_credit.head(10)  # Display the first 10 rows as the cell output

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
5,50000,1,1,2,37,0,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,0
6,500000,1,1,2,29,0,0,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,0
7,100000,2,2,2,23,0,-1,-1,0,0,...,221,-159,567,380,601,0,581,1687,1542,0
8,140000,2,3,1,28,0,0,2,0,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,0
9,20000,1,3,2,35,-2,-2,-2,-2,-1,...,0,13007,13912,0,0,0,13007,1122,0,0


In [13]:
# FEATURE PREPROCESSING AND SCALING FOR THE DATASET

# Step 1: split the table into the target and the features.
# 'y_2' is the 'Y' column (the target labels); 'X_2' is every other column.
y_2 = default_credit['Y']
X_2 = default_credit.drop(columns=['Y'])

# Step 2: create a StandardScaler and fit it on the features.
# NOTE(review): the scaler is fitted on every row of the dataset; if a train/test
# split is made later, consider fitting on the training rows only — confirm.
scaler = StandardScaler().fit(X_2)

# Step 3: apply the fitted scaler so that each feature ends up with
# mean 0 and standard deviation 1.
scaled_2 = scaler.transform(X_2)

# Step 4: wrap the scaled array in a DataFrame that carries the original column names,
# so the scaled features stay easy to identify.
X_scaled_2 = pd.DataFrame(data=scaled_2, columns=X_2.columns)

# Step 5: rebuild the full dataset by placing the target next to the scaled features.
# The target's index is reset so that it lines up with the fresh index of X_scaled_2.
default_credit_scaled = pd.concat(
    [X_scaled_2, y_2.reset_index(drop=True)],
    axis='columns',
)

# Step 6: display the first rows to check that the scaled dataset is complete.
default_credit_scaled.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,-1.13672,0.810161,0.185828,-1.057295,-1.24602,1.794564,1.782348,-0.696663,-0.666599,-1.530046,...,-0.672497,-0.663059,-0.652724,-0.341942,-0.227086,-0.296801,-0.308063,-0.314136,-0.293382,1
1,-0.365981,0.810161,0.185828,0.858557,-1.029047,-0.874991,1.782348,0.138865,0.188746,0.234917,...,-0.621636,-0.606229,-0.597966,-0.341942,-0.213588,-0.240005,-0.24423,-0.314136,-0.180878,1
2,-0.597202,0.810161,0.185828,0.858557,-0.161156,0.014861,0.111736,0.138865,0.188746,0.234917,...,-0.44973,-0.417188,-0.39163,-0.250292,-0.191887,-0.240005,-0.24423,-0.248683,-0.012122,0
3,-0.905498,0.810161,0.185828,-1.057295,0.164303,0.014861,0.111736,0.138865,0.188746,0.234917,...,-0.232373,-0.186729,-0.156579,-0.221191,-0.169361,-0.228645,-0.237846,-0.244166,-0.23713,0
4,-0.905498,-1.234323,0.185828,-1.057295,2.334029,-0.874991,0.111736,-0.696663,0.188746,0.234917,...,-0.346997,-0.348137,-0.331482,-0.221191,1.335034,0.271165,0.266434,-0.269039,-0.255187,0


In [15]:
# INITIAL CLASS DISTRIBUTION
# Count how many rows carry each 'Y' label before any resampling.
default_class_counts = default_credit_scaled['Y'].value_counts()
print(default_class_counts)

Y
0    23364
1     6636
Name: count, dtype: int64


In [17]:
# RESAMPLING (UNDERSAMPLING) TO BALANCE THE CLASSES IN THE DATASET

# Random undersampler with a fixed seed so the selection is reproducible.
rus = RandomUnderSampler(random_state=42)

# Resample the scaled features together with their class labels.
X_resampled_2, y_resampled_2 = rus.fit_resample(X_scaled_2, y_2)

# Build the resampled table: the feature columns first, then the resampled
# labels appended as the 'y_2' column.
# NOTE(review): the target column is called 'Y' in the loaded dataset but is stored
# here as 'y_2'; confirm that code reading this table expects 'y_2'.
default_credit_resampled = pd.DataFrame(X_resampled_2, columns=X_2.columns).assign(
    y_2=y_resampled_2
)

# Display the first rows of the resampled table.
default_credit_resampled.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,y_2
7510,1.637943,0.810161,1.451114,0.858557,-0.486615,-0.874991,-0.72357,-0.696663,-0.666599,-0.647565,...,-0.499224,-0.457733,-0.423013,0.215928,0.404151,0.339435,0.489402,0.585717,0.722584,0
15325,0.558907,0.810161,0.185828,-1.057295,-0.05267,-1.764843,-1.558876,-1.532192,-1.521944,-1.530046,...,-0.584858,-0.620589,-0.583425,0.103328,-0.043973,0.023418,-0.142927,-0.04401,-0.015385,0
18666,-0.905498,0.810161,-1.079457,0.858557,-1.354506,-0.874991,-0.72357,-1.532192,-1.521944,-1.530046,...,-0.672497,-0.663059,-0.652724,-0.341942,-0.25699,-0.296801,-0.308063,-0.314136,-0.293382,0
7494,1.252573,-1.234323,-1.079457,0.858557,-0.378129,0.014861,0.111736,0.138865,0.188746,0.234917,...,1.526307,1.189575,1.383139,-0.009876,-0.052003,0.01558,-0.052731,0.386215,-0.040248,0
1239,-0.674276,0.810161,0.185828,-1.057295,-0.05267,-0.874991,-0.72357,-0.696663,-0.666599,-0.647565,...,-0.666342,-0.656545,-0.646075,0.249497,0.326462,-0.27431,-0.282785,-0.314136,-0.271106,0


In [23]:
# FINAL CLASS DISTRIBUTION AFTER RANDOM UNDERSAMPLING (RUS)
# Count how many rows carry each 'y_2' label in the resampled table.
default_class_counts_rus = default_credit_resampled['y_2'].value_counts()
print(default_class_counts_rus)

y_2
0    6636
1    6636
Name: count, dtype: int64


In [24]:
# Save the resampled DataFrame as an Excel file.
# index=False leaves the row index out of the sheet; the file name is relative,
# so it is written to the current working directory.
default_credit_resampled.to_excel(
    excel_writer="default_of_credit_card_clients_RUS.xlsx",
    index=False,
    engine='openpyxl',
)