# Data merging

### Import required packages

In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder

warnings.filterwarnings("ignore")

## Merging 1

In [2]:
# Bank marketing dataset (UCI); the file is semicolon-delimited with a header row.
df1 = pd.read_csv(
    "data/bank-full.csv",
    sep=";",
    header=0,
)

# Total loans to non-bank customers, broken down by type (data.gov).
df2 = pd.read_csv("data/TotalLoanstoNonBankCustomersbyType.csv")

In [9]:
# Give every customer row an explicit ID taken from its row label, and give
# the target column a self-explanatory name.
df1 = (
    df1
    .assign(CustomerID=df1.index)
    .rename(columns={'y': 'subscribed_to_term_deposit'})
)

# Keep only the rows of the loans table whose level_1 is 'Consumer', then
# narrow to the two columns that are merged in.
df2_filtered = df2.loc[df2['level_1'].eq('Consumer')]
df2_subset = (
    df2_filtered
    .loc[:, ['level_2', 'total_loans']]
    .rename(columns={'level_2': 'loan_category'})
)

# Column-wise concatenation aligns the two frames on their row labels.
# NOTE(review): df2_subset keeps the row labels it had in df2, so each loan row
# is attached to whichever customer shares that label and every other customer
# gets NaN in the two loan columns -- confirm this is the intended "merge".
merged_df = pd.concat([df1, df2_subset], axis=1)
merged_df.shape

(45211, 20)

Separate the columns into `numerical_columns` and `categorical_columns`, because missing data is handled differently in each group.

In [4]:
# Columns sent through the numerical pipeline.
numerical_columns = [
    'CustomerID',
    'age',
    'balance',
    'duration',
    'campaign',
    'pdays',
    'previous',
]

# Columns sent through the categorical pipeline.
# NOTE(review): 'day' and 'total_loans' look numeric by name but are grouped
# with the categorical columns here -- confirm this is intended.
categorical_columns = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'day',
    'month',
    'poutcome',
    'total_loans',
    'loan_category',
    'subscribed_to_term_deposit',
]

# Numerical columns first, then categorical ones.
all_columns = [*numerical_columns, *categorical_columns]

### Filling in missing data
Different techniques are used to fill in missing data.  
Imputers generate synthetic values from the existing features and use them to fill the empty cells.  
Below, I use `IterativeImputer` for numerical data and `SimpleImputer(strategy="most_frequent")` for categorical data.

In [5]:
# Numerical columns: iterative imputation (seeded for reproducibility),
# followed by min-max scaling.
num_pipeline = Pipeline([
    ("imputer", IterativeImputer(random_state=0)),
    ("scaler", MinMaxScaler()),
])

# Categorical columns: fill missing entries with the most frequent value.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
])

In [6]:
# Route each column group through its own pipeline: numerical columns are
# imputed and scaled, categorical columns are imputed only.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, numerical_columns),
        ("cat_pipeline", cat_pipeline, categorical_columns),
    ]
)

# Fit on, and transform, the merged dataset. The output columns follow the
# transformer order (numerical first, then categorical), i.e. all_columns.
processed_data = preprocessor.fit_transform(merged_df)

# Convert processed_data back to a DataFrame, keeping merged_df's row labels
# so both frames stay aligned row for row.
processed_df = pd.DataFrame(
    processed_data,
    columns=all_columns,
    index=merged_df.index,
)

# The transformed array mixes floats and strings, so every column comes back
# as dtype `object`; restore a numeric dtype for the numerical columns.
processed_df[numerical_columns] = processed_df[numerical_columns].astype(float)

# Imputation only fills cells: it must not add or drop rows or columns.
assert processed_df.shape == merged_df.shape, (
    f"shape changed during preprocessing: {merged_df.shape} -> {processed_df.shape}"
)

processed_df.shape

(45211, 20)

`merged_df.shape` should be equal to `processed_df.shape`: imputation only fills in missing cells and must not add or drop rows or columns.