# Initialization

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('bank_transactions.csv')
df = df.sample(n=50000, random_state=42)
df.head(10)

Unnamed: 0,TransactionID,CustomerID,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
892845,T892846,C5725279,8/9/84,M,JASPUR,84447.82,7/9/16,183210,11858.0
444994,T444995,C4588538,2/1/94,M,GURGAON,12549.49,16/8/16,161300,250.0
614896,T614897,C2416476,14/11/90,M,NEW DELHI,33607.65,26/8/16,102007,3360.0
457036,T457037,C5940151,15/9/90,M,HYDERABAD,38238.86,21/8/16,110438,250.0
997441,T997442,C5922989,27/11/88,M,PURBO MEDINIPUR,9506.85,14/9/16,90810,33.0
518219,T518220,C8296884,31/10/89,M,BANGALORE,635863.63,19/8/16,215646,2688.0
778232,T778233,C6671851,1/1/1800,M,SHOPPING COMPLEX THIRUVANATHAPURAM,5867.38,6/9/16,193036,870.0
151835,T151836,C8927812,3/3/87,M,NEW DELHI,63911.56,4/8/16,30857,499.0
446575,T446576,C7511745,23/3/92,F,GURGAON,19853.15,16/8/16,183906,280.0
887527,T887528,C6126877,1/10/93,M,GURGAON,3848.86,7/9/16,201500,203.45


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 892845 to 314301
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   TransactionID            50000 non-null  object 
 1   CustomerID               50000 non-null  object 
 2   CustomerDOB              49836 non-null  object 
 3   CustGender               49945 non-null  object 
 4   CustLocation             49996 non-null  object 
 5   CustAccountBalance       49890 non-null  float64
 6   TransactionDate          50000 non-null  object 
 7   TransactionTime          50000 non-null  int64  
 8   TransactionAmount (INR)  50000 non-null  float64
dtypes: float64(2), int64(1), object(6)
memory usage: 3.8+ MB


In [4]:
df.isnull().sum()

TransactionID                0
CustomerID                   0
CustomerDOB                164
CustGender                  55
CustLocation                 4
CustAccountBalance         110
TransactionDate              0
TransactionTime              0
TransactionAmount (INR)      0
dtype: int64

In [5]:
df.dropna(inplace=True)
df.isnull().sum()

TransactionID              0
CustomerID                 0
CustomerDOB                0
CustGender                 0
CustLocation               0
CustAccountBalance         0
TransactionDate            0
TransactionTime            0
TransactionAmount (INR)    0
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49669 entries, 892845 to 314301
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   TransactionID            49669 non-null  object 
 1   CustomerID               49669 non-null  object 
 2   CustomerDOB              49669 non-null  object 
 3   CustGender               49669 non-null  object 
 4   CustLocation             49669 non-null  object 
 5   CustAccountBalance       49669 non-null  float64
 6   TransactionDate          49669 non-null  object 
 7   TransactionTime          49669 non-null  int64  
 8   TransactionAmount (INR)  49669 non-null  float64
dtypes: float64(2), int64(1), object(6)
memory usage: 3.8+ MB


### DOB to Age

In [8]:
df['CustomerDOB'].value_counts()

1/1/1800    2672
1/1/91        36
1/1/89        36
1/7/90        34
6/8/91        32
            ... 
19/4/53        1
18/5/72        1
31/5/76        1
7/10/78        1
10/4/99        1
Name: CustomerDOB, Length: 10521, dtype: int64

In [9]:
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'])
df['CustomerDOB'] = pd.to_datetime(df['CustomerDOB'])

In [10]:
df = df.drop(df[df['CustomerDOB'] == '1/1/1800'].index,axis = 0)
df.loc[df.CustomerDOB.dt.year >= 2023, 'CustomerDOB'] = df.loc[df.CustomerDOB.dt.year >= 2023, 'CustomerDOB'] - pd.DateOffset(years = 100)
df['CustomerAge'] = ((pd.to_datetime('today') - df['CustomerDOB'])/np.timedelta64(1, 'Y')).round(0)
df.head()

Unnamed: 0,TransactionID,CustomerID,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR),CustomerAge
892845,T892846,C5725279,1984-08-09,M,JASPUR,84447.82,2016-07-09,183210,11858.0,39.0
444994,T444995,C4588538,1994-02-01,M,GURGAON,12549.49,2016-08-16,161300,250.0,30.0
614896,T614897,C2416476,1990-11-14,M,NEW DELHI,33607.65,2016-08-26,102007,3360.0,33.0
457036,T457037,C5940151,1990-09-15,M,HYDERABAD,38238.86,2016-08-21,110438,250.0,33.0
997441,T997442,C5922989,1988-11-27,M,PURBO MEDINIPUR,9506.85,2016-09-14,90810,33.0,35.0


In [11]:
df = df.drop(['TransactionID','CustomerID','CustomerDOB','TransactionDate','TransactionTime', 'CustLocation'],axis=1)
df.head(10)

Unnamed: 0,CustGender,CustAccountBalance,TransactionAmount (INR),CustomerAge
892845,M,84447.82,11858.0,39.0
444994,M,12549.49,250.0,30.0
614896,M,33607.65,3360.0,33.0
457036,M,38238.86,250.0,33.0
997441,M,9506.85,33.0,35.0
518219,M,635863.63,2688.0,34.0
151835,M,63911.56,499.0,37.0
446575,F,19853.15,280.0,32.0
887527,M,3848.86,203.45,31.0
672565,M,32578.12,350.0,46.0


In [12]:
df['CustGender'].value_counts()

M    34116
F    12881
Name: CustGender, dtype: int64

# Clustering

In [13]:
from pycaret.clustering import *

In [14]:
setup(df)

Unnamed: 0,Description,Value
0,Session id,2952
1,Original data shape,"(46997, 4)"
2,Transformed data shape,"(46997, 4)"
3,Ordinal features,1
4,Numeric features,3
5,Categorical features,1
6,Preprocess,True
7,Imputation type,simple
8,Numeric imputation,mean
9,Categorical imputation,mode


<pycaret.clustering.oop.ClusteringExperiment at 0x7efc5ec80f50>

In [18]:
bank_model = create_model('kmeans', num_clusters=3)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.9731,145121.5264,0.3381,0,0,0


In [19]:
evaluate_model(bank_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [25]:
save_model(bank_model, 'model_bank')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['CustAccountBalance',
                                              'TransactionAmount (INR)',
                                              'CustomerAge'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['CustGender'],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('ordinal_encoding',
                  TransformerWrapper(include=['CustGender'],
                                     transformer=OrdinalEncoder(cols=['CustGender'],
                                                                handle_missing='return_nan',
                                                                mapping=[{'col': 'CustGender',
                                                                          'data_type

# Predict

In [26]:
predict = predict_model(bank_model, df)
predict

Unnamed: 0,CustGender,CustAccountBalance,TransactionAmount (INR),CustomerAge,Cluster
892845,1.0,84447.82,11858.0,39.0,Cluster 0
444994,1.0,12549.49,250.0,30.0,Cluster 0
614896,1.0,33607.65,3360.0,33.0,Cluster 0
457036,1.0,38238.86,250.0,33.0,Cluster 0
997441,1.0,9506.85,33.0,35.0,Cluster 0
...,...,...,...,...,...
774700,1.0,9727.63,350.0,34.0,Cluster 0
854209,0.0,5973.79,220.0,42.0,Cluster 0
320083,1.0,33178.94,178.0,37.0,Cluster 0
602746,0.0,20892.73,430.0,45.0,Cluster 0


In [31]:
predict.sort_values(by='CustAccountBalance', ascending=False)

Unnamed: 0,CustGender,CustAccountBalance,TransactionAmount (INR),CustomerAge,Cluster
835641,1.0,1.150355e+08,600.00,57.0,Cluster 1
758254,0.0,8.224463e+07,1479.00,93.0,Cluster 1
1005069,0.0,8.224463e+07,3403.00,93.0,Cluster 1
839198,1.0,3.832115e+07,100000.00,74.0,Cluster 2
554797,1.0,1.567426e+07,3984.46,77.0,Cluster 2
...,...,...,...,...,...
1029566,1.0,0.000000e+00,2548.50,89.0,Cluster 0
547210,1.0,0.000000e+00,646.00,61.0,Cluster 0
461513,1.0,0.000000e+00,582.00,33.0,Cluster 0
341696,1.0,0.000000e+00,17665.78,50.0,Cluster 0


In [30]:
predict[predict['Cluster'] == 'Cluster 1']

Unnamed: 0,CustGender,CustAccountBalance,TransactionAmount (INR),CustomerAge,Cluster
758254,0.0,82244629.9,1479.0,93.0,Cluster 1
1005069,0.0,82244629.9,3403.0,93.0,Cluster 1
835641,1.0,115035495.1,600.0,57.0,Cluster 1
