In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


url = "https://raw.githubusercontent.com/4GeeksAcademy/logistic-regression-project-tutorial/main/bank-marketing-campaign-data.csv"
total_data = pd.read_csv(url, sep=";")  
total_data.shape

(41188, 21)

In [11]:
# Variables numéricas
numericas = total_data.select_dtypes(include=['number']).columns.tolist()
print("Variables numéricas:", numericas)

# Variables no numéricas (categóricas, texto, etc.)
no_numericas = total_data.select_dtypes(exclude=['number']).columns.tolist()
print("Variables no numéricas:", no_numericas)

Variables numéricas: ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
Variables no numéricas: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'y']


In [12]:
# DATASET CON OUTLAIERS
total_data.drop(['contact','day_of_week','month','duration','nr.employed','previous','pdays'], axis = 1, inplace = True)
total_data.shape


(41188, 14)

In [13]:
# DATASET SIN OUTLAIERS 

# Crea una copia del dataset original
total_data_no_outliers = total_data.copy()

# Reemplaza outliers con los valores limites para cada variable numerica
for var in numericas:
    Q1 = total_data_no_outliers[var].quantile(0.25)
    Q3 = total_data_no_outliers[var].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Reemplaza valores fuera de los limites
    total_data_no_outliers[var] = total_data_no_outliers[var].clip(lower=lower_bound, upper=upper_bound)

print("Shape original:", total_data.shape)
print("Shape sin outliers:", total_data_no_outliers.shape)



KeyError: 'duration'

In [None]:
# DATASET CON OUTLAIERS FACTORIZADO
total_data_factorized = total_data.copy()

for col in no_numericas:
    total_data_factorized[col], _ = pd.factorize(total_data_factorized[col])

total_data_factorized.shape


KeyError: 'contact'

In [None]:
# DATASET SIN OUTLAIERS FACTORIZADO
total_data_no_outliers_factorized = total_data_no_outliers.copy()

# Factorizo solo las columnas categoricas indicadas en no_numericas
for col in no_numericas:
    total_data_no_outliers_factorized[col], _ = pd.factorize(total_data_no_outliers_factorized[col])

In [None]:
# DATASET TRAIN/TEST CON OUTLAIERS
X = total_data.drop(columns=['y'])
y = total_data['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# DATASET TRAIN/TEST SIN OUTLAIERS
X = total_data_no_outliers.drop(columns=['y'])
y = total_data_no_outliers['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# DATASET TRAIN/TEST FACTORIZADO CON OUTLAIERS
X = total_data_factorized.drop(columns=['y'])
y = total_data_factorized['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# DATASET TRAIN/TEST FACTORIZADO SIN OUTLAIERS
X = total_data_no_outliers_factorized.drop(columns=['y'])
y = total_data_no_outliers_factorized['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# DATASET CON OUTLAIERS ESTANDARIZADO 

total_data_standard = total_data.copy()

# columnas numericas excepto la target 'y'
cols_to_scale = total_data_standard.drop(columns=['y']).select_dtypes(include=['number']).columns

# Inicializar scaler
scaler = StandardScaler()

# Aplicar scaler solo a las columnas numéricas
total_data_standard[cols_to_scale] = scaler.fit_transform(total_data_standard[cols_to_scale])

In [None]:
# DATASET SIN OUTLAIERS ESTANDARIZADO 

# Crear copia
total_data_no_outliers_standard = total_data_no_outliers.copy()

# Seleccionar columnas numéricas excepto la variable objetivo 'y'
cols_to_scale = total_data_no_outliers_standard.drop(columns=['y']).select_dtypes(include=['number']).columns


scaler = StandardScaler()

# Aplicar scaler solo a las columnas numéricas
total_data_no_outliers_standard[cols_to_scale] = scaler.fit_transform(total_data_no_outliers_standard[cols_to_scale])

In [None]:
# DATASET CON OUTLAIERS FACTORIZADO ESTANDARIZADO

total_data_factorized_standard = total_data_factorized.copy()

# Selecciono columnas numericas excepto la variable objetivo 'y'
cols_to_scale = total_data_factorized_standard.drop(columns=['y']).select_dtypes(include=['number']).columns

scaler = StandardScaler()

# Aplicar scaler solo a las columnas numericas
total_data_factorized_standard[cols_to_scale] = scaler.fit_transform(total_data_factorized_standard[cols_to_scale])

NameError: name 'total_data_factorized' is not defined

In [None]:
# DATASET SIN OUTLAIERS FACTORIZADO ESTANDARIZADO
total_data_no_outliers_factorized_standard = total_data_no_outliers_factorized.copy()

# Selecciono columnas numericas excepto la variable objetivo 'y'
cols_to_scale = total_data_no_outliers_factorized_standard.drop(columns=['y']).select_dtypes(include=['number']).columns
scaler = StandardScaler()
# Aplicar scaler solo a las columnas numéricas
total_data_no_outliers_factorized_standard[cols_to_scale] = scaler.fit_transform(total_data_no_outliers_factorized_standard[cols_to_scale])

In [None]:
# DATASET CON OUTLAIERS MIN/MAX 
total_data_minmax = total_data.copy()
cols_to_scale = total_data_minmax.drop(columns=['y']).select_dtypes(include=['number']).columns
scaler = MinMaxScaler()
total_data_minmax[cols_to_scale] = scaler.fit_transform(total_data_minmax[cols_to_scale])

In [None]:
# DATASET SIN OUTLAIERS MIN/MAX 
total_data_no_outliers_minmax = total_data_no_outliers.copy()
cols_to_scale = total_data_no_outliers_minmax.drop(columns=['y']).select_dtypes(include=['number']).columns
scaler = MinMaxScaler()
total_data_no_outliers_minmax[cols_to_scale] = scaler.fit_transform(total_data_no_outliers_minmax[cols_to_scale])

In [None]:
# DATASET CON OUTLAIERS FACTORIZADO MIN/MAX
total_data_factorized_minmax = total_data_factorized.copy()
cols_to_scale = total_data_factorized_minmax.drop(columns=['y']).select_dtypes(include=['number']).columns
scaler = MinMaxScaler()
total_data_factorized_minmax[cols_to_scale] = scaler.fit_transform(total_data_factorized_minmax[cols_to_scale])

In [None]:
# DATASET SIN OUTLAIERS FACTORIZADO MIN/MAX
total_data_no_outliers_factorized_minmax = total_data_no_outliers_factorized.copy()
cols_to_scale = total_data_no_outliers_factorized_minmax.drop(columns=['y']).select_dtypes(include=['number']).columns
scaler = MinMaxScaler()
total_data_no_outliers_factorized_minmax[cols_to_scale] = scaler.fit_transform(total_data_no_outliers_factorized_minmax[cols_to_scale])

In [None]:
#Verfico el tamaño
print(total_data.shape) # dataset con outlaiers
print(total_data_no_outliers.shape) # dataset sin outlaiers
print(total_data_factorized.shape) # dataset con outlaiers factorizado 
print(total_data_no_outliers_factorized.shape) # dataset sin outlaiers factorizado 
#train / test total_data # dataset con outlaiers
#train / test total_data_no_outliers # dataset sin outlaiers
#train / test total_data_factorized # dataset con outlaiers factorizado 
#train / test total_data_no_outliers_factorized # dataset sin outlaiers factorizado 
print(total_data_standard.shape)
print(total_data_no_outliers_standard.shape)
print(total_data_factorized_standard.shape)
print(total_data_no_outliers_factorized_standard.shape)
print(total_data_minmax.shape)
print(total_data_no_outliers_minmax.shape)
print(total_data_factorized_minmax.shape)
print(total_data_no_outliers_factorized_minmax.shape)



(41188, 14)
(41188, 14)
(41188, 14)
(41188, 14)
(41188, 14)
(41188, 14)


NameError: name 'total_data_factorized_standard' is not defined

In [None]:
# Train test total_data_factorized y total_data_no_outliers_factorized
# Para total_data_factorized
X_factorized = total_data_factorized.drop(columns=['y'])
y_factorized = total_data_factorized['y']

X_train_factorized, X_test_factorized, y_train_factorized, y_test_factorized = train_test_split(
    X_factorized, y_factorized, test_size=0.2, random_state=42
)

# Para total_data_no_outliers_factorized
X_no_outliers = total_data_no_outliers_factorized.drop(columns=['y'])
y_no_outliers = total_data_no_outliers_factorized['y']

X_train_no_outliers, X_test_no_outliers, y_train_no_outliers, y_test_no_outliers = train_test_split(
    X_no_outliers, y_no_outliers, test_size=0.2, random_state=42
)