# Clustering for Mobile Money Transaction Analysis
Applying clustering techniques for analysing the mobile money transaction

## Import the required libraries

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.utils import resample
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

## Load the dataset

In [41]:
FILE_PATH = '../data/scaled_data.csv'
df = pd.read_csv(FILE_PATH)
df.head()

Unnamed: 0,amount,oldBalInitiator,newBalInitiator,oldBalRecipient,newBalRecipient,transactionType_DEPOSIT,transactionType_PAYMENT,transactionType_TRANSFER,transactionType_WITHDRAWAL
0,-0.401636,-1.717325,-1.754478,-0.484986,-0.414541,0.0,0.0,1.0,0.0
1,-0.67646,-1.86113,-1.88391,-0.485028,-0.511633,0.0,1.0,0.0,0.0
2,-0.677219,-1.861065,-1.884266,-0.485028,-0.508961,0.0,1.0,0.0,0.0
3,-0.401321,-1.861483,-1.899568,-0.464584,-0.394835,0.0,0.0,1.0,0.0
4,-0.677204,-1.739208,-1.761637,-0.485028,-0.508956,0.0,1.0,0.0,0.0


### Dataset Inspection

In [42]:
df.shape

(1685998, 9)

## Obtaining the Array of Features
The features array is gotten directly from the scaled data. 

In [43]:
X = df.values

### Feature Array

In [44]:
X

array([[-0.40163615, -1.71732478, -1.75447806, ...,  0.        ,
         1.        ,  0.        ],
       [-0.67645964, -1.8611297 , -1.88391049, ...,  1.        ,
         0.        ,  0.        ],
       [-0.67721896, -1.86106482, -1.88426605, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 0.51284357,  1.93021381,  1.8668781 , ...,  0.        ,
         1.        ,  0.        ],
       [-0.67644806,  1.00363535,  0.99856737, ...,  1.        ,
         0.        ,  0.        ],
       [-0.43928967,  1.00317584,  0.98531344, ...,  0.        ,
         1.        ,  0.        ]], shape=(1685998, 9))

## Model Training

### K-Means

In [45]:
kmeans = KMeans(n_clusters=4, random_state=42, n_init='auto')
kmeans.fit(X)

0,1,2
,n_clusters,4
,init,'k-means++'
,n_init,'auto'
,max_iter,300
,tol,0.0001
,verbose,0
,random_state,42
,copy_x,True
,algorithm,'lloyd'


### Sampled Silhouette

In [46]:
sample_X, sample_labels = resample(X, kmeans.labels_, n_samples=32000, random_state=42)
kmeans_silhouette = silhouette_score(sample_X, sample_labels)
print(f"K-Means Silhouette Score: {kmeans_silhouette}")


K-Means Silhouette Score: 0.3540329831773125


### Calinski-Harabasz Score

In [47]:
calinski_kmeans = calinski_harabasz_score(X, kmeans.labels_)
print(f"K-Means Calinski-Harabasz Score: {calinski_kmeans}")

K-Means Calinski-Harabasz Score: 882121.1565302226


### Davies-Bouldin Score

In [48]:
davies_kmeans = davies_bouldin_score(X, kmeans.labels_)
print(f"K-Means Davies-Bouldin Score: {davies_kmeans}")

K-Means Davies-Bouldin Score: 0.9016161328996587
