<h2>1. Data Loading and Preprocessing</h2>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from ucimlrepo import fetch_ucirepo

In [24]:
# Fetch the UCI "Bank Marketing" dataset (repository id 222).
dataset = fetch_ucirepo(id=222)

# Load features and targets into pandas objects. The feature frame is copied
# because a later preprocessing cell fills missing values in X; working on a
# copy keeps that step from writing into the frame held by `dataset`.
X = dataset.data.features.copy()
y = dataset.data.targets

# metadata
print("Metadata:", dataset.metadata)

# variable information, grouped by role
vars_df = dataset.variables.sort_values(by="role")
print("Variables:\n", vars_df.to_string(index=False))


Metadata: {'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to pre

In [25]:
# Preview the first ten feature rows.
X.head(n=10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,
5,35,management,married,tertiary,no,231,yes,no,,5,may,139,1,-1,0,
6,28,management,single,tertiary,no,447,yes,yes,,5,may,217,1,-1,0,
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,,5,may,380,1,-1,0,
8,58,retired,married,primary,no,121,yes,no,,5,may,50,1,-1,0,
9,43,technician,single,secondary,no,593,yes,no,,5,may,55,1,-1,0,


In [26]:
# Build the encoded target straight from the fetched data instead of from `y`
# itself, so re-running this cell always starts from the raw labels
# (calling .iloc[:, 0] on an already-converted Series would raise).
target_raw = dataset.data.targets.iloc[:, 0]

# Encode target: 'no' -> 0, 'yes' -> 1
y = target_raw.map({'no': 0, 'yes': 1})

# map() turns any label missing from the dictionary into NaN; fail loudly
# rather than carrying NaN targets into the split and the models.
assert y.notna().all(), "target contains labels other than 'no'/'yes'"


In [27]:
# Report the dataset dimensions and the class balance of the encoded target.
print(f"Total samples: {len(X)}")
print(f"Total features: {len(X.columns)}")
print(f"Target classes:\n{y.value_counts()}")

Total samples: 45211
Total features: 16
Target classes:
y
0    39922
1     5289
Name: count, dtype: int64


Identifying Categorical and Numerical Columns

In [28]:
# Split the feature names by dtype. Numeric columns are selected explicitly and
# every remaining column is treated as categorical. This avoids
# select_dtypes(include=['object']), which emits a deprecation warning on
# pandas 3 when it is used to pick up string columns.
# NOTE: a non-numeric, non-string column (e.g. bool or datetime) would land in
# categorical_cols here; this dataset has none.
numerical_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = [col for col in X.columns if col not in numerical_cols]

print("Categorical:", categorical_cols)
print("Numerical:", numerical_cols)


Categorical: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
Numerical: ['age', 'balance', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous']


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_cols = X.select_dtypes(include=['object']).columns.tolist()


Handling Null Data

In [29]:
# Count the missing entries in each feature column.
null_counts = X.isna().sum()
print(null_counts)

age                0
job              288
marital            0
education       1857
default            0
balance            0
housing            0
loan               0
contact        13020
day_of_week        0
month              0
duration           0
campaign           0
pdays              0
previous           0
poutcome       36959
dtype: int64


In [30]:
# Names of the columns that contain at least one missing value.
missing_cols = [col for col in X.columns if X[col].isna().any()]
print(missing_cols)

['job', 'education', 'contact', 'poutcome']


In [31]:
# The columns with missing values are expected to be categorical, so the gaps
# are replaced with an explicit "unknown" category. Verify that expectation
# first: filling a numeric column with a string would corrupt its dtype.
assert set(missing_cols) <= set(categorical_cols), (
    "missing values found in non-categorical columns"
)

# Rebind X to the filled frame instead of assigning into it, so the
# previously loaded frame object is not modified in place.
X = X.fillna({col: "unknown" for col in missing_cols})

Train-Test Split

In [33]:
# Hold out 20% of the rows for evaluation. Stratifying on `y` keeps the ratio
# of the imbalanced target classes the same in both partitions, and the fixed
# random_state makes the split reproducible.
# (train_test_split is imported in the imports cell at the top of the notebook.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Sanity checks: every row lands in exactly one partition, and features and
# targets stay aligned in length.
assert len(X_train) + len(X_test) == len(X)
assert len(y_train) == len(X_train) and len(y_test) == len(X_test)