In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

In [5]:
from ucimlrepo import fetch_ucirepo

# Fetch the UCI "Bank Marketing" dataset (repository id 222).
bank_marketing = fetch_ucirepo(id=222)

# Features and target as pandas DataFrames.
# Copies are taken because later cells add and overwrite columns on X; without
# a copy those writes would land on the frame stored inside `bank_marketing`
# (and pandas may emit SettingWithCopyWarning if that frame is a slice).
X = bank_marketing.data.features.copy()
y = bank_marketing.data.targets.copy()

# Dataset-level metadata (name, task, missing-value information, ...).
print(bank_marketing.metadata)

# Per-variable information.
print(bank_marketing.variables)


{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to predict the s

# Data Cleaning


# Handle Missing Values

## Education


In [6]:
# Label-encode the non-numeric features used later for KNN imputation, keeping
# the raw values of each touched column in a `<name>_original` backup column.

# Work on an independent copy so the column assignments below never write into
# a frame shared with another object.
X = X.copy()

# Columns that get a backup, in the order the backup columns are appended.
columns_with_backup = ['education', 'marital', 'day_of_week', 'month', 'default', 'housing', 'loan']

for column in columns_with_backup:
    backup = f'{column}_original'
    # Take the backup only once: when this cell is re-run, X[column] already
    # holds integer codes and must not replace the raw values.
    if backup not in X.columns:
        X[backup] = X[column]

# One fitted encoder per column, so every mapping can be inverted later
# (a single shared encoder only remembers the last column it was fitted on).
encoders = {}

# Converting non-numeric features to numeric
for column in ['marital', 'day_of_week', 'month', 'default', 'housing', 'loan']:
    raw_values = X[f'{column}_original']
    if pd.api.types.is_numeric_dtype(raw_values):
        # Already numeric: keep the real values. Casting numbers to str and
        # label-encoding them would order the codes lexicographically
        # ('10' before '2').
        # NOTE(review): day_of_week looks like the only such column - confirm.
        X[column] = raw_values
        continue
    encoders[column] = LabelEncoder()
    X[column] = encoders[column].fit_transform(raw_values.astype(str))

# Label-encode 'education'. astype(str) turns missing values into the string
# 'nan', which becomes a class of its own; in the printed head below it is
# code 0. That code is the placeholder for "missing" in the imputation cell.
le = LabelEncoder()  # kept under this name: the imputation cell decodes with it
encoders['education'] = le
X['education'] = le.fit_transform(X['education_original'].astype(str))

# Display the encoded values next to the raw ones
print(X[['education', 'education_original']].head())

   education education_original
0          3           tertiary
1          2          secondary
2          2          secondary
3          0                NaN
4          0                NaN


In [7]:
# Impute missing 'education' values with k-nearest-neighbours on the
# label-encoded features.

# Features used to find neighbours (plus 'education' itself, the imputed column)
features_for_imputation = ['age', 'default', 'balance', 'housing', 'loan', 'day_of_week', 'month', 'duration', 'campaign', 'pdays', 'previous', 'marital', 'education']

# Float copy of the selected features, so NaN can mark the missing entries
data_subset = X[features_for_imputation].astype(float)

# The encoder fitted on 'education' stores missing values as the class 'nan'.
# Turn exactly that code back into NaN, and only in the 'education' column.
# Using missing_values=0 on the imputer instead would flag every 0 in every
# feature (zero balance, previous == 0, the first label code of each encoded
# column, ...) as missing and overwrite valid data.
missing_codes = [code for code, label in enumerate(le.classes_) if label == 'nan']
education_codes = data_subset['education']
data_subset['education'] = education_codes.mask(education_codes.isin(missing_codes))

# KNNImputer with its default missing-value marker (NaN)
# NOTE(review): the features are not scaled, so wide-range columns such as
# balance and duration presumably dominate the neighbour distances - confirm
# whether scaling is wanted.
imputer = KNNImputer(n_neighbors=5)

# Perform KNN imputation (fit and transform the data)
imputed_data = imputer.fit_transform(data_subset)

# Convert the imputed array back into a DataFrame aligned with X's rows
data_imputed = pd.DataFrame(imputed_data, columns=features_for_imputation, index=data_subset.index)

# Imputed values are averages of neighbouring codes; round them to the nearest
# integer code because education is categorical.
data_imputed['education'] = np.round(data_imputed['education']).astype(int)

# Convert the integer codes back to the original education categories
data_imputed['education'] = le.inverse_transform(data_imputed['education'])

# Check the imputed education column
print(data_imputed[['education']].head())

   education
0   tertiary
1  secondary
2  secondary
3  secondary
4  secondary


## Job

## Contact

## Poutcome

## One-hot encoding for categorical variables

Categorical: job, education, marital, contact, poutcome


In [9]:
# Expand the listed categorical columns into indicator columns. The first
# level of each column is dropped (drop_first=True); every other column of X
# is carried over unchanged.
one_hot_data = pd.get_dummies(
    data=X,
    columns=['job', 'education', 'marital', 'contact', 'poutcome'],
    drop_first=True,
)
one_hot_data.columns

Index(['age', 'default', 'balance', 'housing', 'loan', 'day_of_week', 'month',
       'duration', 'campaign', 'pdays', 'previous', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'education_secondary', 'education_tertiary',
       'marital_married', 'marital_single', 'contact_telephone',
       'poutcome_other', 'poutcome_success'],
      dtype='object')