In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

### Cargar datos

In [4]:
# Read the raw campaign CSV; the file is semicolon-delimited, hence sep=';'.
# The path is relative, so it resolves against the notebook's working directory.
# NOTE(review): the rendered preview labels its first column "Unnamed: 0". If that
# is a real column in the CSV (rather than just the displayed index), it will be
# picked up as a numeric feature by the plots and the model below — confirm.
df = pd.read_csv("../data/raw/bank-marketing-campaign-data.csv", sep=';')
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


### Tamaño del Dataset

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

### Identificación del tipo de datos

In [None]:
# dtype of each column; used to tell numeric columns apart from categorical (object) ones.
df.dtypes

### Identificación y conteo de valores nulos

In [None]:
# Per-column missing-value summary, in two forms:
#   "nulos"   -> real NaN/None entries, as reported by isnull()
#   "unknown" -> cells holding the literal string 'unknown'. The data preview shows
#                this placeholder in categorical columns (e.g. 'default'); isnull()
#                does not see it, so counting NaN alone under-reports missing data.
pd.DataFrame({
    "nulos": df.isnull().sum(),
    "unknown": df.eq("unknown").sum(),
})

### Estadística descriptiva

In [None]:
# Summary statistics for every column: include='all' covers non-numeric columns
# too, so statistics that do not apply to a column's dtype are shown as NaN.
df.describe(include='all')

### Visualización

In [None]:

# 1) Class balance of the target column.
ax_target = sns.countplot(x='y', data=df)
ax_target.set_title("Distribución de la variable objetivo 'y'")
plt.show()

# 2) One histogram per numeric column. The numeric sub-frame is computed once
#    and reused for the correlation matrix below.
df_numeric = df.select_dtypes(include='number')
df_numeric.hist(figsize=(15, 10), bins=20)
plt.suptitle("Histogramas de variables numéricas", fontsize=16)
plt.tight_layout()
plt.show()

# 3) Pairwise correlation between numeric columns, annotated with the coefficients.
fig_corr, ax_corr = plt.subplots(figsize=(12, 8))
sns.heatmap(df_numeric.corr(), annot=True, cmap='coolwarm', ax=ax_corr)
ax_corr.set_title("Matriz de correlación entre variables numéricas")
plt.show()


### Regresión logística

In [None]:
# Work on a copy so the modelling transformations below leave the loaded `df` untouched.
df_model = df.copy()

### Convertir variable objetivo a binaria

In [None]:
# Encode the target as 1 ('yes') / 0 ('no').
# The labels are read from the untouched `df` rather than from `df_model` itself:
# mapping df_model['y'] onto itself is not idempotent, because a second run would
# look up 0/1 in the dict, find no match, and turn the whole column into NaN.
# `df_model` shares its index with `df`, so the assignment aligns row by row.
df_model['y'] = df['y'].map({'yes': 1, 'no': 0})

##### Verificamos si la conversión fue exitosa

In [None]:
# Sanity check: after the mapping only 0 and 1 should appear; a NaN here would
# mean some label was neither 'yes' nor 'no'.
print(df_model['y'].unique())

In [None]:

# One-hot encode the categorical columns. 'y' was already mapped to numeric
# 0/1 above, so get_dummies leaves it as is. drop_first=True drops the first level
# of each categorical column to avoid a redundant (perfectly collinear) dummy.
df_model = pd.get_dummies(df_model, drop_first=True)

In [None]:
# Confirm that 'y' is still a column after the encoding step.
'y' in df_model.columns

In [None]:
# Separate the encoded frame into the target vector and the feature matrix
# (every column except 'y').
y = df_model['y']
X = df_model.drop(columns=['y'])

In [None]:

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
# stratify=y keeps the proportion of 0/1 labels the same in both partitions, so
# the evaluation is not distorted by a chance class imbalance between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:

# Standardize the features. The scaler learns its mean/variance from the training
# partition only and is then applied unchanged to both partitions, so no
# statistics from the test set leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:

# Fit a logistic regression on the standardized training data. max_iter is raised
# to 1000 to give the solver more iterations to converge.
model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
# Show the fitted estimator as the cell output.
model


In [None]:

# Predict on the held-out partition and report the confusion matrix together with
# per-class precision / recall / F1.
y_pred = model.predict(X_test_scaled)

conf_mat = confusion_matrix(y_test, y_pred)
clf_report = classification_report(y_test, y_pred)

print("Matriz de Confusión:\n", conf_mat)
print("\nReporte de Clasificación:\n", clf_report)