In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder

from sklearn.pipeline import Pipeline

from sdv.datasets.demo import download_demo

# Suppress every warning for the rest of the session: no category, module or
# message filter is given, so this applies to all libraries imported above.
warnings.filterwarnings("ignore")

def preprocess_data(
		PATHS = None,
		SYNTHETIC = None,
		num_pipeline = None,
		cat_pipeline = None,
		label = 'subscribed'
	):
	"""Merge the source CSV files, impute missing values and split features from the label.

	The CSV files are stacked row-wise, columns are reordered so that all
	numerical columns come first and all object (categorical) columns second,
	each group is run through its own pipeline, and the result is cast back
	to the dtypes of the source files. The processed table is also written
	to "../data/merged.csv".

	Args:
		PATHS: Sequence of CSV paths to read and stack. Defaults to
			['../data/bank-full.csv', "../data/churn-modelling.csv"].
		SYNTHETIC: Currently unused; kept so existing callers keep working.
		num_pipeline: Transformer applied to the numerical columns. Defaults
			to a pipeline holding IterativeImputer(random_state=0).
		cat_pipeline: Transformer applied to the categorical columns. Defaults
			to a pipeline holding SimpleImputer(strategy="most_frequent").
		label: Name of the target column returned as ``y``.

	Returns:
		A tuple ``(X, y)``: ``X`` is the processed DataFrame without the
		target/identifier columns, ``y`` is a one-column DataFrame holding
		``label``.

	Raises:
		ValueError: If ``PATHS`` is empty.
		KeyError: If ``label`` is not a column of the merged data.
	"""
	# Defaults are created per call instead of in the signature, so no list or
	# estimator instance is shared between calls.
	if PATHS is None:
		PATHS = ['../data/bank-full.csv', "../data/churn-modelling.csv"]
	if num_pipeline is None:
		num_pipeline = Pipeline(steps=[("imputer", IterativeImputer(random_state=0)),])
	if cat_pipeline is None:
		cat_pipeline = Pipeline(steps=[("imputer", SimpleImputer(strategy="most_frequent")),])

	frames = [pd.read_csv(path) for path in PATHS]
	if not frames:
		raise ValueError("PATHS must contain at least one CSV file path")

	# Remember the dtype each column had in its source file. When a column
	# appears in several files, the first file that defines it wins.
	source_dtypes = pd.concat([frame.dtypes for frame in frames], axis=0)
	source_dtypes = source_dtypes[~source_dtypes.index.duplicated()]

	# Merge Dataframes (ignore_index already yields a fresh 0..n-1 index)
	merged_df = pd.concat(frames, axis=0, ignore_index=True)

	# Find numerical & categorical columns: object dtype means categorical.
	# The explicit bool dtype keeps the mask usable even when there are no columns.
	is_object = np.array(
		[dtype == np.dtype('O') for dtype in merged_df.dtypes], dtype=bool
	)
	categorical_columns = merged_df.columns[is_object].values
	numerical_columns = merged_df.columns[~is_object].values
	all_columns = np.concatenate([numerical_columns, categorical_columns])

	# Rearrange column sequence: numerical first, categorical second, matching
	# the order in which the ColumnTransformer below emits its output blocks.
	merged_df = merged_df.loc[:, all_columns]
	merged_df[categorical_columns] = merged_df.loc[:, categorical_columns].astype('category')

	# Target dtypes for the final cast: source dtypes for numerical columns,
	# the freshly built category dtypes for categorical columns.
	cat_dtypes = dict(merged_df.dtypes[categorical_columns])
	num_dtypes = dict(source_dtypes[numerical_columns])
	dtype_dict = num_dtypes | cat_dtypes

	preprocessor = ColumnTransformer(transformers=[
			("num_pipeline", num_pipeline, numerical_columns),
			("cat_pipeline", cat_pipeline, categorical_columns),
	])
	# Apply transformation on dataset
	processed_data = preprocessor.fit_transform(merged_df)

	# Convert processed_data back to a DataFrame
	processed_df = pd.DataFrame(processed_data, columns=all_columns)

	# Convert numerical columns back to numbers before restoring exact dtypes.
	processed_df.loc[:, numerical_columns] = processed_df[numerical_columns].apply(pd.to_numeric)

	# NOTE: casting back to an integer source dtype truncates any fractional
	# values the numerical pipeline may have produced.
	processed_df = processed_df.astype(dtype_dict)
	processed_df.to_csv("../data/merged.csv")

	# Targets and identifier-like columns never belong in the feature matrix.
	# The requested label is always excluded as well, so a custom label cannot
	# leak into X; columns missing from the data are simply skipped.
	excluded = ['subscribed', 'exited', 'customerid', 'surname', 'isactivemember']
	if label not in excluded:
		excluded.append(label)
	feat_cols = processed_df.columns.drop(excluded, errors='ignore')

	X = processed_df[feat_cols]
	y = processed_df[[label]]

	return X, y

# Run the preprocessing with its default arguments: X receives the feature
# frame and y the one-column label frame returned by preprocess_data.
X, y = preprocess_data()
# Bare expression as the last statement so the notebook cell displays X.
X

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,creditscore,tenure,numofproducts,...,education,default,housing,loan,contact,month,poutcome,geography,gender,hascrcard
0,58,2143.00,5,261,1,-1,0,649,5,1,...,tertiary,no,yes,no,unknown,may,unknown,France,Male,yes
1,44,29.00,5,151,1,-1,0,649,5,1,...,secondary,no,yes,no,unknown,may,unknown,France,Male,yes
2,33,2.00,5,76,1,-1,0,649,5,1,...,secondary,no,yes,yes,unknown,may,unknown,France,Male,yes
3,47,1506.00,5,92,1,-1,0,649,5,1,...,unknown,no,yes,no,unknown,may,unknown,France,Male,yes
4,33,1.00,5,198,1,-1,0,649,5,1,...,unknown,no,no,no,unknown,may,unknown,France,Male,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55206,39,0.00,15,256,2,40,0,771,5,2,...,secondary,no,yes,no,cellular,may,unknown,France,Male,yes
55207,35,57369.61,17,358,1,37,1,516,10,1,...,secondary,no,yes,no,cellular,may,unknown,France,Male,yes
55208,36,0.00,15,256,2,41,0,709,7,1,...,secondary,no,yes,no,cellular,may,unknown,France,Female,no
55209,42,75075.31,17,388,1,35,1,772,3,2,...,secondary,no,yes,no,cellular,may,unknown,Germany,Male,yes
