In [40]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [41]:
# Load the raw bank-marketing export and preview its first five rows.
# NOTE(review): the path is absolute and machine-specific; the notebook will not
# run elsewhere without editing it.
raw_csv_path = '/Users/an-chilu/Desktop/Client_Asset_Allocation_Model/data/Bank_Marketing_Dataset.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,ClientID,Age,Gender,MaritalStatus,EducationLevel,EmploymentStatus,JobTitle,Region,SalaryCategory,CustomerSegment,...,LastContactDuration,NumContactsInCampaign,NumPrevCampaignContacts,PrevCampaignOutcome,CallResponseScore,DaysSinceLastContact,PreviousYearDeposit,MarketingScore,ResponsePropensity,TermDepositSubscribed
0,1,46,Male,Divorced,Secondary,Employed,Services,East,Mid,Mass,...,343,4,1,Failure,25.07,475,0,1.7179,0.21,0
1,2,39,Female,Married,Secondary,Self-employed,Blue-collar,Central,Low,Premium,...,198,2,0,Nonexistent,29.11,-1,0,1.6803,0.46,1
2,3,48,Male,Married,Secondary,Self-employed,Services,South,Mid,Mass,...,177,4,4,Failure,15.76,318,0,1.051,0.22,1
3,4,59,Female,Married,Secondary,Employed,Student,South,Low,Mass,...,132,1,1,Success,40.06,819,0,1.5676,0.3,0
4,5,38,Male,Married,Tertiary,Employed,Retired,North,Low,Mass,...,376,4,1,Failure,26.51,362,0,1.2918,0.33,0


#### Check if there's missing data

In [42]:
# Total number of missing cells across the whole frame (0 means no gaps).
df.isna().sum().sum()

np.int64(0)

# I. Data Preprocessing 
Since the dataset is based on the Bank Marketing Dataset for Term Deposit Prediction, it contains a wide range of customer demographic and financial information that is also relevant for client portfolio allocation. Therefore, we first filter out non-relevant variables and retain only those features that are meaningful for modeling investment preferences, tailoring the dataset to the specific objectives of our robo-advisor framework.

In [43]:
# Columns removed before modeling: the client identifier, the marketing-campaign
# fields, the job title, and the transaction / product-holding flags.
columns_to_drop = [
    'ClientID',
    'LastContactChannel',
    'LastContactMonth',
    'LastContactDay',
    'LastContactDuration',
    'NumContactsInCampaign',
    'NumPrevCampaignContacts',
    'PrevCampaignOutcome',
    'CallResponseScore',
    'DaysSinceLastContact',
    'PreviousYearDeposit',
    'MarketingScore',
    'ResponsePropensity',
    'TermDepositSubscribed',
    'JobTitle',
    'BranchVisitFrequency',
    'TotalTransactions',
    'AvgTransactionValue',
    'HasLifeInsurance',
    'HasPersonalLoan',
    'HasMortgage',
    'HasCreditCard',
]
df = df.drop(columns=columns_to_drop)

# Show the dtype of every remaining column.
df.dtypes

Age                           int64
Gender                       object
MaritalStatus                object
EducationLevel               object
EmploymentStatus             object
Region                       object
SalaryCategory               object
CustomerSegment              object
AnnualIncome                float64
NetWorth                    float64
CreditScore                   int64
CreditLimit                 float64
RiskRating                   object
AccountLengthYears            int64
TenureWithBank                int64
AccountBalance              float64
NumBankProducts               int64
HasMutualFunds               object
InvestmentPortfolioValue    float64
NumOnlineTransactions         int64
NumMobileAppLogins            int64
ChannelPreference            object
WebsiteActivityScore        float64
dtype: object

In [44]:
# Disabled one-off export of a 50% sample of the cleaned frame.
# NOTE(review): if re-enabled as written, `df` would be rebound to the return value
# of `.to_csv(...)`, which is None when a path is given; write the file without
# assigning the result back to `df`.
# df = df.sample(frac=0.5, random_state=42).reset_index(drop=True).to_csv('preprocessed_clients.csv', index=False)

In [45]:
# Load the preprocessed clients together with their target asset weights.
# NOTE(review): the path is absolute and machine-specific; the notebook will not
# run elsewhere without editing it.
df = pd.read_csv('/Users/an-chilu/Desktop/Client_Asset_Allocation_Model/preprocessed_clients_with_asset_weights.csv')

# Return a renamed copy instead of mutating in place.
df = df.rename(columns={"Alternatives": "REIT"})

# The fourth-from-last column is relabelled purely by position.
df_col = df.columns.tolist()
df_col[-4] = "International_Equity"
df.columns = df_col

# Guard the positional rename: if the CSV layout ever changes, fail loudly rather
# than silently mislabelling a different column.
expected_target_columns = ["US_Equity", "International_Equity", "Bonds", "Cash", "REIT"]
actual_target_columns = df.columns[-5:].tolist()
assert actual_target_columns == expected_target_columns, (
    f"Unexpected trailing columns {actual_target_columns}; "
    f"expected {expected_target_columns}"
)

df.head(5)

Unnamed: 0,Age,Gender,MaritalStatus,EducationLevel,EmploymentStatus,Region,SalaryCategory,CustomerSegment,AnnualIncome,NetWorth,...,InvestmentPortfolioValue,NumOnlineTransactions,NumMobileAppLogins,ChannelPreference,WebsiteActivityScore,US_Equity,International_Equity,Bonds,Cash,REIT
0,44,Female,Single,Secondary,Employed,Metro,UpperMid,Standard,107451.06,160417.89,...,52978.4,156,136,Digital,20.76,0.288414,0.285054,0.174688,0.079298,0.172546
1,64,Female,Single,Secondary,Employed,South,Mid,Mass,39397.86,58719.17,...,23031.86,110,184,Branch,19.5,0.213492,0.139645,0.35042,0.157198,0.139245
2,56,Male,Married,Tertiary,Employed,South,Low,Mass,18737.27,25896.31,...,11961.9,8,72,Digital,17.5,0.150559,0.194532,0.31455,0.21789,0.122469
3,18,Male,Single,Tertiary,Retired,Metro,High,Mass,136901.46,21793.65,...,8726.06,50,74,Digital,8.32,0.416417,0.129622,0.312603,0.098984,0.042374
4,44,Male,Married,Secondary,Employed,Metro,Low,Mass,24106.28,20364.32,...,17584.55,61,70,Digital,8.46,0.312527,0.060208,0.300265,0.203247,0.123752


#### Examine the unique levels of the columns with `object` dtype.

In [46]:
# List the distinct levels of every string-typed (object) column.
object_columns = df.select_dtypes(include="object").columns
for column_name in object_columns:
    levels = df[column_name].unique()
    print(f'{column_name}: {levels}')

Gender: ['Female' 'Male']
MaritalStatus: ['Single' 'Married' 'Divorced' 'Widowed']
EducationLevel: ['Secondary' 'Tertiary' 'Unknown' 'Primary']
EmploymentStatus: ['Employed' 'Retired' 'Student' 'Self-employed' 'Unemployed']
Region: ['Metro' 'South' 'East' 'Central' 'North' 'West']
SalaryCategory: ['UpperMid' 'Mid' 'Low' 'High']
CustomerSegment: ['Standard' 'Mass' 'Premium' 'Private']
RiskRating: ['High' 'Medium' 'Low']
HasMutualFunds: ['No' 'Yes']
ChannelPreference: ['Digital' 'Branch' 'Hybrid']


#### Some columns are ordinal rather than nominal, so we map their levels to integer ranks and then one-hot encode the remaining categorical columns.

In [47]:
# Integer rank assigned to each level of the ordinal columns.
ordinal_maps = {
    'EducationLevel':{'Unknown': 0, 'Primary': 1, 'Secondary': 2, 'Tertiary': 3},
    'SalaryCategory': {'Low': 1, 'Mid': 2, 'UpperMid': 3, 'High': 4},
    'CustomerSegment': {'Mass': 1, 'Standard': 2, 'Premium': 3, 'Private': 4},
    'RiskRating': {'Low': 1, 'Medium': 2, 'High': 3}
}
for col, mapping in ordinal_maps.items():
    # Skip columns that are absent or already numeric: `df` is updated in place, so
    # re-running this cell would otherwise map the integer codes to NaN.
    if col not in df.columns or pd.api.types.is_numeric_dtype(df[col]):
        continue
    # `Series.map` turns any level missing from the mapping into NaN; fail loudly instead.
    unmapped_levels = set(df[col].dropna().unique()) - set(mapping)
    if unmapped_levels:
        raise ValueError(f"Column {col!r} has levels with no ordinal rank: {sorted(unmapped_levels)}")
    df[col] = df[col].map(mapping)

# Nominal columns are one-hot encoded; drop_first=True drops the first level of each
# column, leaving it as the implicit reference category.
categorical_list = ['Gender', 'MaritalStatus', 'EmploymentStatus', 'Region', 'HasMutualFunds', 'ChannelPreference']

df_encoded = pd.get_dummies(
    df,
    columns=categorical_list,
    drop_first=True,
    dtype=int
)

In [48]:
# Preview the model-ready frame: ordinal columns as integers, nominal columns as 0/1 dummies.
df_encoded.head()

Unnamed: 0,Age,EducationLevel,SalaryCategory,CustomerSegment,AnnualIncome,NetWorth,CreditScore,CreditLimit,RiskRating,AccountLengthYears,...,EmploymentStatus_Student,EmploymentStatus_Unemployed,Region_East,Region_Metro,Region_North,Region_South,Region_West,HasMutualFunds_Yes,ChannelPreference_Digital,ChannelPreference_Hybrid
0,44,2,3,2,107451.06,160417.89,564,55530.01,3,0,...,0,0,0,1,0,0,0,0,1,0
1,64,2,2,1,39397.86,58719.17,557,28159.73,3,5,...,0,0,0,0,0,1,0,0,0,0
2,56,3,1,1,18737.27,25896.31,631,18934.17,2,0,...,0,0,0,0,0,1,0,1,1,0
3,18,3,4,1,136901.46,21793.65,570,55729.7,3,2,...,0,0,0,1,0,0,0,0,1,0
4,44,2,1,1,24106.28,20364.32,602,17062.65,3,3,...,0,0,0,1,0,0,0,0,1,0


# Train-Test Split
#### Training Set: 80% of data

#### Testing Set: 20% of data

In [49]:
# The five asset-weight columns are the regression targets; everything else is a feature.
target_columns = ['US_Equity', 'International_Equity', 'Bonds', 'REIT', 'Cash']
Y = df_encoded[target_columns]
X = df_encoded.drop(columns=target_columns)

# 80/20 split with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# II. Model Prediction

## A. Linear Regression

### 1. Baseline Linear Regression

In [50]:
# Learn the scaling statistics from the training features only, then standardize them.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)

# Ordinary least squares on the standardized features (all five targets at once).
base_linear_model = LinearRegression()
base_linear_model.fit(x_train_scaled, y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


### 2. Regularized Linear Regression - Lasso, Ridge, ElasticNet

In [51]:
# L1-penalized regression, one independent Lasso per target column.
lasso_linear_model = MultiOutputRegressor(Lasso(alpha=0.01)).fit(x_train_scaled, y_train)

# L2-penalized regression, fitted on all targets directly.
ridge_linear_model = Ridge(alpha=1.0).fit(x_train_scaled, y_train)

# Mixed L1/L2 penalty (l1_ratio=0.3), one independent ElasticNet per target column.
elasticnet_estimator = ElasticNet(alpha=0.01, l1_ratio=0.3)
elasticnet_linear_model = MultiOutputRegressor(elasticnet_estimator)
elasticnet_linear_model.fit(x_train_scaled, y_train)

0,1,2
,estimator  estimator: estimator object An estimator object implementing :term:`fit` and :term:`predict`.,ElasticNet(al... l1_ratio=0.3)
,"n_jobs  n_jobs: int or None, optional (default=None) The number of jobs to run in parallel. :meth:`fit`, :meth:`predict` and :meth:`partial_fit` (if supported by the passed estimator) will be parallelized for each target. When individual estimators are fast to train or predict, using ``n_jobs > 1`` can result in slower performance due to the parallelism overhead. ``None`` means `1` unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all available processes / threads. See :term:`Glossary ` for more details. .. versionchanged:: 0.20  `n_jobs` default changed from `1` to `None`.",

0,1,2
,"alpha  alpha: float, default=1.0 Constant that multiplies the penalty terms. Defaults to 1.0. See the notes for the exact mathematical meaning of this parameter. ``alpha = 0`` is equivalent to an ordinary least square, solved by the :class:`LinearRegression` object. For numerical reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised. Given this, you should use the :class:`LinearRegression` object.",0.01
,"l1_ratio  l1_ratio: float, default=0.5 The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. For ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2.",0.3
,"fit_intercept  fit_intercept: bool, default=True Whether the intercept should be estimated or not. If ``False``, the data is assumed to be already centered.",True
,"precompute  precompute: bool or array-like of shape (n_features, n_features), default=False Whether to use a precomputed Gram matrix to speed up calculations. The Gram matrix can also be passed as argument. For sparse input this option is always ``False`` to preserve sparsity. Check :ref:`an example on how to use a precomputed Gram Matrix in ElasticNet ` for details.",False
,"max_iter  max_iter: int, default=1000 The maximum number of iterations.",1000
,"copy_X  copy_X: bool, default=True If ``True``, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-4 The tolerance for the optimization: if the updates are smaller or equal to ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller or equal to ``tol``, see Notes below.",0.0001
,"warm_start  warm_start: bool, default=False When set to ``True``, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `.",False
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive.",False
,"random_state  random_state: int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random feature to update. Used when ``selection`` == 'random'. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.",


### Performance Evaluation of Linear Models

In [52]:
# Shared scoring helper used for every model family in this notebook.
def evaluate_models_overall(models, X, Y, prefix=""):
    """Score each fitted model on ``(X, Y)`` and return the results as a table.

    Parameters
    ----------
    models : dict
        Maps a display name to a fitted estimator exposing ``predict``.
    X : array-like
        Features passed to each estimator's ``predict``.
    Y : array-like
        True targets the predictions are compared against.
    prefix : str, default ""
        Text prepended to the metric column names (e.g. ``"Train "``).

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``Model``, ``{prefix}R²`` and
        ``{prefix}RMSE``; both metrics use ``multioutput="uniform_average"``.
    """
    r2_column = f"{prefix}R²"
    rmse_column = f"{prefix}RMSE"

    def _score(label, estimator):
        predicted = estimator.predict(X)
        return {
            "Model": label,
            r2_column: r2_score(Y, predicted, multioutput="uniform_average"),
            rmse_column: root_mean_squared_error(Y, predicted, multioutput="uniform_average"),
        }

    return pd.DataFrame([_score(label, estimator) for label, estimator in models.items()])

In [53]:
# Register the four fitted linear models under their display names.
linear_model_names = ["Linear", "Lasso", "Ridge", "ElasticNet"]
linear_model_objects = [
    base_linear_model,
    lasso_linear_model,
    ridge_linear_model,
    elasticnet_linear_model,
]
models_linear = dict(zip(linear_model_names, linear_model_objects))

# In-sample fit quality on the standardized training features.
linear_model_train_performance_all = evaluate_models_overall(
    models=models_linear,
    X=x_train_scaled,
    Y=y_train,
    prefix="Train ",
)
linear_model_train_performance_all

Unnamed: 0,Model,Train R²,Train RMSE
0,Linear,0.325172,0.053122
1,Lasso,0.222244,0.057012
2,Ridge,0.325172,0.053122
3,ElasticNet,0.304452,0.05392


In [54]:
# Apply the scaler fitted on the training data to the held-out features
# (transform only, so no test statistics are learned).
x_test_scaled = scaler.transform(x_test)

# Out-of-sample fit quality for the same four linear models.
linear_model_test_performance_all = evaluate_models_overall(
    models=models_linear,
    X=x_test_scaled,
    Y=y_test,
    prefix="Test ",
)
linear_model_test_performance_all

Unnamed: 0,Model,Test R²,Test RMSE
0,Linear,0.328007,0.053034
1,Lasso,0.222414,0.057025
2,Ridge,0.328007,0.053034
3,ElasticNet,0.306429,0.053861


### Feature Importance

In [55]:
# Baseline linear model: importance = mean absolute coefficient across the targets.
linear_coef = base_linear_model.coef_
linear_abs_coef = pd.DataFrame(linear_coef, columns=x_train.columns).abs()
linear_importance = linear_abs_coef.mean(axis=0).sort_values(ascending=False)

# Ridge model: same measure, with the coefficient rows labelled by asset.
asset_names = ["US_Equity", "International_Equity", "Bonds", "REIT", "Cash"]
ridge_coef = ridge_linear_model.coef_          # shape: (5, n_features)
ridge_coef_df = pd.DataFrame(ridge_coef, index=asset_names, columns=x_train.columns)
ridge_abs_coef = ridge_coef_df.abs()
ridge_feature_overall_importance = ridge_abs_coef.mean(axis=0).sort_values(ascending=False)

# Side-by-side table of both importance measures.
linear_feature_importance_df = pd.concat(
    [
        linear_importance.rename("Base Linear"),
        ridge_feature_overall_importance.rename("Ridge"),
    ],
    axis=1,
)

# Rank the features by their Ridge importance.
linear_feature_importance_df = linear_feature_importance_df.sort_values(
    by="Ridge", ascending=False
)

linear_feature_importance_df.head(15)

Unnamed: 0,Base Linear,Ridge
CustomerSegment,0.015306,0.015304
Age,0.014455,0.014454
NumMobileAppLogins,0.006564,0.006564
EducationLevel,0.006387,0.006386
AnnualIncome,0.006036,0.006034
SalaryCategory,0.005945,0.005945
WebsiteActivityScore,0.005416,0.005416
ChannelPreference_Digital,0.004429,0.004428
NumOnlineTransactions,0.004422,0.004421
ChannelPreference_Hybrid,0.004231,0.00423


## B. Gradient Boosting

In [56]:
# Gradient Boosting Model: one histogram-based booster per target column.
RANDOM_STATE = 42
gb = HistGradientBoostingRegressor(max_iter=200, random_state=RANDOM_STATE)
multi_gb_model = MultiOutputRegressor(gb)

# Fit GB on the unscaled features (unlike the linear models, which use x_train_scaled).
multi_gb_model.fit(x_train, y_train)

# Display label corrected from the misspelled "Gradient_Boosing".
model_gb = {"Gradient_Boosting": multi_gb_model}

# In-sample performance.
evaluate_models_overall(model_gb, x_train, y_train, prefix="Train ")

Unnamed: 0,Model,Train R²,Train RMSE
0,Gradient_Boosing,0.420486,0.049266


In [57]:
# Held-out performance of the gradient-boosting model on the unscaled test features.
evaluate_models_overall(model_gb, x_test, y_test, prefix="Test ")

Unnamed: 0,Model,Test R²,Test RMSE
0,Gradient_Boosing,0.376301,0.051116


### Feature Importance

In [58]:
# feature_names = x_train.columns
# models = multi_gb_model.estimators_

# asset_names = ["US_Equity", "Intl_Equity", "Bonds", "REITs", "Cash"]



# y_test_temp = y_test.to_numpy()
# for i, asset in enumerate(asset_names):
#     r = permutation_importance(
#         models[i],
#         x_test,
#         y_test_temp[:, i],
#         scoring=mae_scorer,
#         n_repeats=30,
#         random_state=42
#     )

#     fi = pd.Series(
#         r.importances_mean,
#         index=feature_names
#     ).sort_values(ascending=False)

#     print(f"\nTop features for {asset}:")
#     print(fi.head(10))

## C. Random Forest

### Feature Importance

## D. XGBoost

In [59]:
# Hyperparameters shared by every per-target XGBoost regressor.
xgb_params = {
    "objective": 'reg:squarederror',
    "n_estimators": 800,
    "learning_rate": 0.05,
    "max_depth": 5,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_alpha": 1,
    "reg_lambda": 1,
    "random_state": 42,
}
xgb_base = XGBRegressor(**xgb_params)

# One independent copy of the base regressor is fitted per target column.
multi_xgb = MultiOutputRegressor(xgb_base)
multi_xgb.fit(x_train, y_train)

model_xgboost = {"XGBoost": multi_xgb}

# In-sample performance on the unscaled training features.
evaluate_models_overall(model_xgboost, x_train, y_train, prefix="Train ")

Unnamed: 0,Model,Train R²,Train RMSE
0,XGBoost,0.481846,0.04657


In [60]:
# Held-out performance of the XGBoost model on the unscaled test features.
evaluate_models_overall(model_xgboost, x_test, y_test, prefix="Test ")

Unnamed: 0,Model,Test R²,Test RMSE
0,XGBoost,0.371933,0.051298


### Feature Importance

In [61]:
# Gain-based feature importance for each per-target XGBoost booster.
# (The unused est0 / booster0 / gain0 scratch variables were removed.)
feature_names = x_train.columns

# get_score() may not return an entry for every feature, so reindex against the
# full feature list and fill the gaps with 0.
all_gain = {
    f"target_{j}": (
        pd.Series(est.get_booster().get_score(importance_type="gain"))
        .reindex(feature_names)
        .fillna(0.0)
    )
    for j, est in enumerate(multi_xgb.estimators_)
}

# Sort by target_0 first, breaking ties with the following targets in order.
gain_df = pd.DataFrame(all_gain).sort_values(by=list(all_gain.keys()), ascending=False)
gain_df.head(15)


Unnamed: 0,target_0,target_1,target_2,target_3,target_4
EducationLevel,0.296162,0.499893,0.008839,0.003237,0.006095
CustomerSegment,0.19017,0.508157,0.672836,0.348567,0.048148
Age,0.118908,0.046069,0.194938,0.00455,0.013681
SalaryCategory,0.03514,0.005814,0.008395,0.004586,1.70221
ChannelPreference_Hybrid,0.033187,0.011746,0.00866,0.003458,0.062429
EmploymentStatus_Retired,0.020158,0.012315,0.04211,0.00279,0.004359
AnnualIncome,0.018767,0.010826,0.010689,0.01663,0.169256
HasMutualFunds_Yes,0.018607,0.007141,0.030039,0.072176,0.010466
ChannelPreference_Digital,0.017917,0.009615,0.00969,0.003196,0.043771
EmploymentStatus_Unemployed,0.014901,0.006156,0.027609,0.00267,0.003786


In [62]:
# Same gain table as above, but with the columns labelled by asset name and the
# rows left in the original feature order.
asset_names = list(y_train.columns)
feature_names = x_train.columns

gain_by_asset = {}
for asset_label, fitted_estimator in zip(asset_names, multi_xgb.estimators_):
    raw_gain = fitted_estimator.get_booster().get_score(importance_type="gain")
    gain_series = pd.Series(raw_gain).reindex(feature_names)
    # Features absent from the booster's score dict get an importance of 0.
    gain_by_asset[asset_label] = gain_series.fillna(0.0)

gain_df = pd.DataFrame(gain_by_asset)

gain_df.head(15)


Unnamed: 0,US_Equity,International_Equity,Bonds,REIT,Cash
Age,0.118908,0.046069,0.194938,0.00455,0.013681
EducationLevel,0.296162,0.499893,0.008839,0.003237,0.006095
SalaryCategory,0.03514,0.005814,0.008395,0.004586,1.70221
CustomerSegment,0.19017,0.508157,0.672836,0.348567,0.048148
AnnualIncome,0.018767,0.010826,0.010689,0.01663,0.169256
NetWorth,0.01446,0.011739,0.024396,0.205359,0.037189
CreditScore,0.00772,0.005561,0.010924,0.003266,0.004945
CreditLimit,0.010202,0.00653,0.009164,0.004186,0.005379
RiskRating,0.00755,0.005474,0.011954,0.003421,0.004053
AccountLengthYears,0.007702,0.005143,0.011285,0.003346,0.004708


# Visualization

In [63]:
# Inspect the working frame: the ordinal columns now hold integer codes (they were
# mapped in place earlier), while the nominal columns are still strings.
df

Unnamed: 0,Age,Gender,MaritalStatus,EducationLevel,EmploymentStatus,Region,SalaryCategory,CustomerSegment,AnnualIncome,NetWorth,...,InvestmentPortfolioValue,NumOnlineTransactions,NumMobileAppLogins,ChannelPreference,WebsiteActivityScore,US_Equity,International_Equity,Bonds,Cash,REIT
0,44,Female,Single,2,Employed,Metro,3,2,107451.06,160417.89,...,52978.40,156,136,Digital,20.76,0.288414,0.285054,0.174688,0.079298,0.172546
1,64,Female,Single,2,Employed,South,2,1,39397.86,58719.17,...,23031.86,110,184,Branch,19.50,0.213492,0.139645,0.350420,0.157198,0.139245
2,56,Male,Married,3,Employed,South,1,1,18737.27,25896.31,...,11961.90,8,72,Digital,17.50,0.150559,0.194532,0.314550,0.217890,0.122469
3,18,Male,Single,3,Retired,Metro,4,1,136901.46,21793.65,...,8726.06,50,74,Digital,8.32,0.416417,0.129622,0.312603,0.098984,0.042374
4,44,Male,Married,2,Employed,Metro,1,1,24106.28,20364.32,...,17584.55,61,70,Digital,8.46,0.312527,0.060208,0.300265,0.203247,0.123752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,26,Male,Married,3,Self-employed,North,2,1,42759.72,13503.85,...,20442.07,63,105,Branch,10.58,0.137396,0.175984,0.348343,0.231607,0.106670
49996,57,Male,Widowed,1,Employed,Central,2,2,36176.75,61144.25,...,28139.62,63,121,Digital,12.75,0.372176,0.140169,0.299491,0.112861,0.075302
49997,30,Male,Single,3,Student,West,1,1,28116.90,10221.73,...,17932.36,75,110,Hybrid,10.10,0.260902,0.153280,0.309357,0.195068,0.081393
49998,48,Female,Widowed,2,Retired,Metro,2,2,47062.69,51568.35,...,33061.71,56,193,Digital,18.31,0.213133,0.119425,0.400314,0.137519,0.129609


In [65]:
# pandas, seaborn and matplotlib are imported once in the notebook's top import
# cell rather than re-imported here.

# Feature-only frame: remove the five asset-weight targets.
corr_df = df.drop(columns = ['US_Equity', 'International_Equity', 'Bonds', 'REIT', 'Cash'])

# One-hot encode the nominal columns, keeping every level (drop_first=False) so each
# category appears in the correlation matrix.
corr_df_encoded = pd.get_dummies(
    corr_df,
    columns=categorical_list,
    drop_first=False,
    dtype=int
)

# Compute the correlation matrix from the ENCODED frame. Computing it from corr_df
# (before encoding) would leave out every one-hot feature.
corr = corr_df_encoded.corr(numeric_only=True)

In [66]:
# Inspect the fully one-hot-encoded feature frame (row-level data, not correlations).
corr_df_encoded

Unnamed: 0,Age,EducationLevel,SalaryCategory,CustomerSegment,AnnualIncome,NetWorth,CreditScore,CreditLimit,RiskRating,AccountLengthYears,...,Region_East,Region_Metro,Region_North,Region_South,Region_West,HasMutualFunds_No,HasMutualFunds_Yes,ChannelPreference_Branch,ChannelPreference_Digital,ChannelPreference_Hybrid
0,44,2,3,2,107451.06,160417.89,564,55530.01,3,0,...,0,1,0,0,0,1,0,0,1,0
1,64,2,2,1,39397.86,58719.17,557,28159.73,3,5,...,0,0,0,1,0,1,0,1,0,0
2,56,3,1,1,18737.27,25896.31,631,18934.17,2,0,...,0,0,0,1,0,0,1,0,1,0
3,18,3,4,1,136901.46,21793.65,570,55729.70,3,2,...,0,1,0,0,0,1,0,0,1,0
4,44,2,1,1,24106.28,20364.32,602,17062.65,3,3,...,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,26,3,2,1,42759.72,13503.85,656,27512.37,2,8,...,0,0,1,0,0,1,0,1,0,0
49996,57,1,2,2,36176.75,61144.25,550,18821.86,3,6,...,0,0,0,0,0,1,0,0,1,0
49997,30,3,1,1,28116.90,10221.73,640,17783.85,2,10,...,0,0,0,0,1,1,0,0,0,1
49998,48,2,2,2,47062.69,51568.35,521,27919.50,3,14,...,0,1,0,0,0,1,0,0,1,0


In [None]:
# Disabled correlation heatmap.
# NOTE(review): as written, sns.heatmap is handed `corr_df_encoded`, the row-level
# encoded feature table, not a correlation matrix. Before re-enabling, pass a
# correlation matrix such as `corr_df_encoded.corr(numeric_only=True)`.
# plt.figure(figsize=(12, 8))
# sns.heatmap(
#     corr_df_encoded,
#     annot=True,           # show correlation values
#     cmap='coolwarm',      # color scheme
#     fmt='.2f',            # format values
#     linewidths=0.5,
#     square=True
# )
# plt.title("Correlation Heatmap of Features")
# plt.show()