In [13]:
!pip install ucimlrepo




In [14]:
# Import necessary libraries
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo  # To fetch datasets from UCI ML Repository
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler



In [15]:
# Fetch dataset (ID 222 corresponds to Bank Marketing)
bank_marketing = fetch_ucirepo(id=222)

# Extract features (X) and target (y)
X = bank_marketing.data.features
y = bank_marketing.data.targets

print("✅ Dataset Loaded Successfully!")
print("Features shape:", X.shape)
print("Target shape:", y.shape)



✅ Dataset Loaded Successfully!
Features shape: (45211, 16)
Target shape: (45211, 1)


In [16]:
# Select any 4 features (2 numeric + 2 categorical for variety)
selected_features = ['age', 'balance', 'job', 'marital']

# Combine selected features with target
df = pd.concat([X[selected_features], y], axis=1)

print("\n--- Sample Data ---")
print(df.head())




--- Sample Data ---
   age  balance           job  marital   y
0   58     2143    management  married  no
1   44       29    technician   single  no
2   33        2  entrepreneur  married  no
3   47     1506   blue-collar  married  no
4   33        1           NaN   single  no


In [17]:
print("\n--- Missing Values ---")
print(df.isnull().sum())



--- Missing Values ---
age          0
balance      0
job        288
marital      0
y            0
dtype: int64


In [18]:
# Initialize label encoders
label_encoders = {}

# Encode 'job', 'marital', and target 'y'
for col in ['job', 'marital', 'y']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # save encoder for possible inverse transform

print("\n--- Encoded Sample ---")
print(df.head())



--- Encoded Sample ---
   age  balance  job  marital  y
0   58     2143    4        1  0
1   44       29    9        2  0
2   33        2    2        1  0
3   47     1506    1        1  0
4   33        1   11        2  0


In [19]:
# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

print("\nTraining set size:", X_train.shape)
print("Test set size:", X_test.shape)



Training set size: (36168, 16)
Test set size: (9043, 16)
