In [7]:
import pandas as pd
import numpy as np
import warnings

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder

warnings.filterwarnings("ignore")

def preprocess_data(
		PATHS = ['data/bank-full.csv', "data/Churn_Modelling.csv"],
		num_pipeline = Pipeline(steps=[("imputer", IterativeImputer(random_state=0)),]),
		cat_pipeline = Pipeline(steps=[("imputer", SimpleImputer(strategy="most_frequent")),]),
		label = 'subscribed'
	):

	df1 = pd.read_csv(PATHS[0], sep = ";", header = 0) #from UCI Bank Marketing
	df2 = pd.read_csv(PATHS[1]).iloc[:, 3:] 
	# print(df2.head())

	df1.rename({i:i.lower() for i in df2.columns.values}, axis=1, inplace=True)
	df2.rename({i:i.lower() for i in df2.columns.values}, axis=1, inplace=True)

	#
	dtype_dict = pd.DataFrame(pd.concat([df1.dtypes, (df2.dtypes)], axis=0))
	dtype_dict = dtype_dict.T.loc[:, ~dtype_dict.T.columns.duplicated()].T.copy().iloc[:, 0]

	# Merge Dataframes
	merged_df = pd.concat([df1, df2], axis=0, ignore_index=True)

	# Find numerical & categorical columns
	which_object = [i == np.dtype('O') for i in merged_df.dtypes]
	categorical_columns = merged_df.columns[which_object].values
	numerical_columns = merged_df.columns[np.invert(which_object)].values
	all_columns = np.concatenate([numerical_columns, categorical_columns])

	# Rearrange column sequence
	merged_df = merged_df.loc[:, all_columns]
	merged_df.reset_index(drop=True)
	# merged_df[categorical_columns] = merged_df.loc[:, categorical_columns].astype('category')
	
	# 
	cat_dtypes = merged_df.dtypes[categorical_columns]
	num_dtypes = dtype_dict[numerical_columns]
	dtype_dict = dict(num_dtypes) |  dict(cat_dtypes)

	preprocessor = ColumnTransformer(transformers=[
			("num_pipeline", num_pipeline, numerical_columns),
			("cat_pipeline", cat_pipeline, categorical_columns),
	])
	# Apply transformation on dataset
	processed_data = preprocessor.fit_transform(merged_df)

	# Convert processed_data back to a DataFrame
	processed_df = pd.DataFrame(processed_data, columns=all_columns)

	# Convert numerical columns back to float
	processed_df.loc[:, numerical_columns] = processed_df[numerical_columns].apply(pd.to_numeric)
	processed_df.loc[:, categorical_columns] = processed_df[categorical_columns].astype('category')
	# print(processed_df.loc[:, categorical_columns[0]])

	feat_cols = [i for i in processed_df.columns if (i != 'subscribed' and i != 'exited')]

	X = processed_df[feat_cols]
	y = processed_df[[label]]

	return X, y

X, y = preprocess_data()
X.info(), y.info()
X.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55211 entries, 0 to 55210
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              55211 non-null  object
 1   balance          55211 non-null  object
 2   day              55211 non-null  object
 3   duration         55211 non-null  object
 4   campaign         55211 non-null  object
 5   pdays            55211 non-null  object
 6   previous         55211 non-null  object
 7   creditscore      55211 non-null  object
 8   tenure           55211 non-null  object
 9   numofproducts    55211 non-null  object
 10  estimatedsalary  55211 non-null  object
 11  job              55211 non-null  object
 12  marital          55211 non-null  object
 13  education        55211 non-null  object
 14  default          55211 non-null  object
 15  housing          55211 non-null  object
 16  loan             55211 non-null  object
 17  contact          55211 non-null

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,creditscore,tenure,numofproducts,...,default,housing,loan,contact,month,poutcome,geography,gender,hascrcard,isactivemember
0,58.0,2143.0,5.0,261.0,1.0,-1.0,0.0,629.623328,2.51044,-2.143939,...,no,yes,no,unknown,may,unknown,France,Male,yes,yes
1,44.0,29.0,5.0,151.0,1.0,-1.0,0.0,629.623328,2.51044,-2.143938,...,no,yes,no,unknown,may,unknown,France,Male,yes,yes
2,33.0,2.0,5.0,76.0,1.0,-1.0,0.0,629.623328,2.51044,-2.143938,...,no,yes,yes,unknown,may,unknown,France,Male,yes,yes
3,47.0,1506.0,5.0,92.0,1.0,-1.0,0.0,629.623328,2.51044,-2.143938,...,no,yes,no,unknown,may,unknown,France,Male,yes,yes
4,33.0,1.0,5.0,198.0,1.0,-1.0,0.0,629.623328,2.51044,-2.143938,...,no,no,no,unknown,may,unknown,France,Male,yes,yes
