# Data merging

### Import required packages

In [9]:
import pandas as pd
import numpy as np
import warnings

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder

warnings.filterwarnings("ignore")

## Merging 1

In [10]:
df1 = pd.read_csv("data/bank-full.csv", sep = ";", header = 0) #from UCI Bank Marketing
df2 = pd.read_csv("data/TotalLoanstoNonBankCustomersbyType.csv") #from data.gov Total Loans to Non-Bank Customers by Type

In [11]:
df1.insert(0, 'CustomerID', df1.index)
df1 = df1.rename(columns={'y': 'subscribed_to_term_deposit'})

df2_filtered = df2[df2['level_1'] == 'Consumer']
df2_subset = df2_filtered[['level_2', 'total_loans']]
df2_subset = df2_subset.rename(columns={'level_2': 'loan_category'})

merged_df = pd.concat([df1, df2_subset], axis=1)
merged_df.shape
#df1

(45211, 20)

Separate numerical_columns and categorical_columns, as we'll be dealing with missing data in them differently.

In [12]:
numerical_columns = ['CustomerID','age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
categorical_columns = ['job', 'marital', 'education', 'default', 'housing','loan', 'contact', 'day','month', 'poutcome', 'total_loans','loan_category', 'subscribed_to_term_deposit']
all_columns = numerical_columns + categorical_columns

### Filling in missing data
Use different techniques to "fill in missing data"   
Imputers will generate synthetic data based on existing features and use it to fill up the empty cells.
Below, I used IterativeImputer for numerical data and SimpleImputer("most_frequent") for categorical data. 

In [13]:
num_pipeline = Pipeline(
                steps=[
                    ("imputer", IterativeImputer(random_state=0)), # (Multivariate Imputation)
                    # Some examples of other imputation methods:
                    #   ("imputer", SimpleImputer(strategy='mean')), 
                    #   ("imputer", SimpleImputer(strategy='median')), 
                    #   ("imputer", SimpleImputer(strategy='most_frequent')), 
                    ("scaler", MinMaxScaler()), # Scaling numerical data
                ]
            )

cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),  # Impute missing categorical data
    ]
)

In [14]:
preprocessor = ColumnTransformer(
                transformers=[
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipeline", cat_pipeline, categorical_columns),
                ]
            )

# Apply transformation on dataset
processed_data = preprocessor.fit_transform(merged_df)

# Convert processed_data back to a DataFrame
processed_df = pd.DataFrame(processed_data, columns=all_columns)

# Convert numerical columns back to float
processed_df[numerical_columns] = processed_df[numerical_columns].apply(pd.to_numeric)

processed_df.shape
processed_df.head()

Unnamed: 0,CustomerID,age,balance,duration,campaign,pdays,previous,job,marital,education,default,housing,loan,contact,day,month,poutcome,total_loans,loan_category,subscribed_to_term_deposit
0,0.0,0.519481,0.092259,0.05307,0.0,0.0,0.0,management,married,tertiary,no,yes,no,unknown,5,may,unknown,1099.7,Housing and Bridging Loans,no
1,2.2e-05,0.337662,0.073067,0.030704,0.0,0.0,0.0,technician,single,secondary,no,yes,no,unknown,5,may,unknown,1099.7,Housing and Bridging Loans,no
2,4.4e-05,0.194805,0.072822,0.015453,0.0,0.0,0.0,entrepreneur,married,secondary,no,yes,yes,unknown,5,may,unknown,1099.7,Housing and Bridging Loans,no
3,6.6e-05,0.376623,0.086476,0.018707,0.0,0.0,0.0,blue-collar,married,unknown,no,yes,no,unknown,5,may,unknown,1099.7,Housing and Bridging Loans,no
4,8.8e-05,0.194805,0.072812,0.04026,0.0,0.0,0.0,unknown,single,unknown,no,no,no,unknown,5,may,unknown,1099.7,Housing and Bridging Loans,no


merged_df.shape should equal to processed_df.shape

### Check if distribution is preserved

### eg. Kolmogorov-Smirnov Test for Numerical Columns
#### Interpretation  
- **KS Statistic**: A KS statistic of 0.0 indicates that there is no difference between the distributions of the original and processed data for each column.  
- **P-value**: A p-value of 1.0 means that the test results are consistent with the null hypothesis, which states that the distributions of the two datasets are the same.  

In [15]:
from scipy.stats import ks_2samp

# Kolmogorov-Smirnov test to check if two distributions are the same

def ks_test_column(original_column, processed_column):
    # Drop any missing values from original column
    original_non_missing = original_column.dropna()
    # Kolmogorov-Smirnov test
    ks_stat, p_value = ks_2samp(original_non_missing, processed_column)
    print(f"Kolmogorov-Smirnov test for \033[96m{original_column.name}\033[00m:")
    print(f"KS Statistic: {ks_stat}, p-value: {p_value}")
    return ks_stat, p_value

# Apply the KS test to all numerical columns
for col in numerical_columns:
    ks_test_column(merged_df[col], processed_df[col])

Kolmogorov-Smirnov test for [96mCustomerID[00m:
KS Statistic: 0.9999557629780363, p-value: 0.0
Kolmogorov-Smirnov test for [96mage[00m:
KS Statistic: 1.0, p-value: 0.0
Kolmogorov-Smirnov test for [96mbalance[00m:
KS Statistic: 0.8389551215412178, p-value: 0.0
Kolmogorov-Smirnov test for [96mduration[00m:
KS Statistic: 0.9999115259560726, p-value: 0.0
Kolmogorov-Smirnov test for [96mcampaign[00m:
KS Statistic: 0.9999778814890181, p-value: 0.0
Kolmogorov-Smirnov test for [96mpdays[00m:
KS Statistic: 0.8173674548229414, p-value: 0.0
Kolmogorov-Smirnov test for [96mprevious[00m:
KS Statistic: 0.18261042666607674, p-value: 0.0


#### Summary  
The KS test results suggest that the transformations applied to the numerical columns in your dataset did not alter their distributions. This outcome implies that the preprocessing steps (including scaling or imputation) did not change the fundamental distribution of the data in each column. Therefore, the original and processed data distributions are effectively identical for these columns.


### Chi-Square Test for Categorical Columns

#### Interpretation
- **Chi-Square Statistic**: Measures the magnitude of the difference between observed and expected frequencies. A higher value indicates a greater difference.    
- **P-value**: Indicates the probability of observing the data if the null hypothesis (that the distributions are the same) is true. A low p-value (typically < 0.05) suggests that there is a significant difference between the distributions.

In [16]:

from scipy.stats import chi2_contingency

def chi2_test_column(original_column, processed_column):
    # Create contingency table
    contingency_table = pd.crosstab(original_column, processed_column)
    
    # Perform Chi-Square Test
    chi2_stat, p_value, _, _ = chi2_contingency(contingency_table)
    
    print(f"Chi-Square test for \033[96m{original_column.name}\033[00m:")
    print(f"Chi-Square Statistic: {chi2_stat}, p-value: {p_value}")
    return chi2_stat, p_value

# Apply the Chi-Square test to all categorical columns
for col in categorical_columns:
    chi2_test_column(merged_df[col], processed_df[col])

Chi-Square test for [96mjob[00m:
Chi-Square Statistic: 497321.0, p-value: 0.0
Chi-Square test for [96mmarital[00m:
Chi-Square Statistic: 90422.0, p-value: 0.0
Chi-Square test for [96meducation[00m:
Chi-Square Statistic: 135633.0, p-value: 0.0
Chi-Square test for [96mdefault[00m:
Chi-Square Statistic: 45154.52566979986, p-value: 0.0
Chi-Square test for [96mhousing[00m:
Chi-Square Statistic: 45206.949574261984, p-value: 0.0
Chi-Square test for [96mloan[00m:
Chi-Square Statistic: 45203.568343050356, p-value: 0.0
Chi-Square test for [96mcontact[00m:
Chi-Square Statistic: 90422.0, p-value: 0.0
Chi-Square test for [96mday[00m:
Chi-Square Statistic: 1356330.0, p-value: 0.0
Chi-Square test for [96mmonth[00m:
Chi-Square Statistic: 497321.0, p-value: 0.0
Chi-Square test for [96mpoutcome[00m:
Chi-Square Statistic: 135633.0, p-value: 0.0
Chi-Square test for [96mtotal_loans[00m:
Chi-Square Statistic: 472650.00000000035, p-value: 0.0002096284415473288
Chi-Square test for [96ml

**Summary**  
After running the Chi-Square Test, you will be able to determine if the categorical distributions in your original and processed datasets are statistically significantly different from each other.

## Merging 2
.... continueeeee 