# Data Analysis with Python - Credit Card Dataset for Clustering
Important links: 
+ GitHub
+ Google Colab

## Evaluation Criteria
Submissions will be evaluated using the following criteria: 
+ Nothing

# This Python3 environment comes with many helpful analytics libraries installed



## Part 1: Data preprocessing
Dataset link: https://www.kaggle.com/datasets/arjunbhasin2013/ccdata

## Importing the libraries and the dataset


In [444]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [445]:
import os

# Print the full path of every file found under /dataset, descending into
# sub-directories.
for folder, _subdirs, files in os.walk('/dataset'):
    for name in files:
        print(os.path.join(folder, name))


In [446]:
# Load the credit-card CSV into a DataFrame; the path is relative to the
# notebook's working directory.
dataset = pd.read_csv('../input/dataset/CC GENERAL.csv')


In [447]:
dataset.head()

In [448]:
dataset.shape


In [449]:
dataset.columns

In [450]:
dataset.info()

In [451]:
# categorical columns (those stored with the object dtype)
dataset.select_dtypes(include='object').columns

In [452]:
len(dataset.select_dtypes(include='object').columns)

In [453]:
# numerical columns 
dataset.select_dtypes(include=['int64','float64']).columns

In [454]:
# statistical summary
dataset.describe()

In [455]:
# each series in the DataFrame
dataset.hist(bins = 50, figsize = (20,15))

### Dealing with missing values

In [456]:
dataset.isnull().values.any()

In [457]:
dataset.isnull().values.sum()

In [458]:
dataset.columns[dataset.isnull().any()]

In [459]:
len(dataset.columns[dataset.isnull().any()])

In [460]:
dataset.isnull().sum().sort_values(ascending=False).head()

In [461]:
# Replace missing entries in each listed column with that column's own mean.
for col in ('CREDIT_LIMIT', 'MINIMUM_PAYMENTS'):
    dataset[col] = dataset[col].fillna(dataset[col].mean())

In [462]:
len(dataset.columns[dataset.isnull().any()])

### Encoding categorical data

In [463]:
# categorical columns
dataset.select_dtypes(include='object').columns

In [464]:
dataset.head()

In [465]:
# Remove the CUST_ID column and keep working with the reduced frame.
dataset = dataset.drop('CUST_ID', axis=1)

In [466]:
dataset.head()

In [467]:
# categorical columns
dataset.select_dtypes(include='object').columns

In [468]:
# number of categorical (object-dtype) columns
categorical_columns = dataset.select_dtypes(include='object').columns
len(categorical_columns)

### Correlation matrix 

In [469]:
corr = dataset.corr()

In [470]:
# heatmap 
plt.figure(figsize=(16, 9))
ax = sns.heatmap(corr, annot=True, cmap='coolwarm')

In [471]:
# Draw one box plot per column of `dataset` on a 6-row x 3-column grid
# (room for up to 18 columns).
fig = plt.figure(figsize=(10, 10))  # whole figure: 10 in wide, 10 in tall
for i, column in enumerate(dataset.columns, 1):
    ax = plt.subplot(6, 3, i)  # i-th cell of the 6 x 3 grid (1-based)
    sns.boxplot(data=dataset, x=dataset[column])
    ax.set_xlabel(None)  # the title below already names the column
    ax.set_title(f'Distribution of {column}')
# Lay the grid out once, after every subplot exists; re-running the layout on
# each iteration only repeated the same work.
plt.tight_layout(w_pad=3)
plt.show()

### Splitting the dataset

In [472]:
 # only independent variables not target variable

### Dealing with Outliers
Dropping the outliers would discard many rows, because the dataset contains a large number of them. Instead, each feature is binned into ranges so that extreme values simply fall into the top bucket. 

In [473]:
# Work on an independent copy: the binning and the in-place column drop in the
# following cells would otherwise also rewrite `dataset`, because plain
# assignment only creates a second name for the same DataFrame.
data = dataset.copy()

# Part 4 joins the cluster numbers onto the un-binned feature values under the
# name `dr`, which was never assigned anywhere in this file; keep that
# untouched snapshot here, before any *_RANGE columns are added.
dr = dataset.copy()

In [474]:
columns = ['BALANCE', 'PURCHASES', 'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES',
           'CASH_ADVANCE', 'CREDIT_LIMIT', 'PAYMENTS', 'MINIMUM_PAYMENTS']

# Bucket boundaries: code k (1-based) covers the half-open interval
# (amount_edges[k-1], amount_edges[k]]. The last edge is infinity, so code 6
# means "above 10000".
amount_edges = [0, 500, 1000, 3000, 5000, 10000, np.inf]

for feature in columns:
    bucket_col = feature + '_RANGE'
    data[bucket_col] = 0  # rows matched by no interval below keep code 0
    for code, (low, high) in enumerate(zip(amount_edges, amount_edges[1:]), start=1):
        in_bucket = (data[feature] > low) & (data[feature] <= high)
        data.loc[in_bucket, bucket_col] = code
 

In [475]:
data.head()

In [476]:
columns = ['BALANCE_FREQUENCY', 'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
           'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
           'PRC_FULL_PAYMENT']

# Bucket boundaries in steps of 0.1: code k (1-based) covers
# (frequency_edges[k-1], frequency_edges[k]].
# The last bucket is open-ended, like the top bucket of the amount and
# transaction-count binning cells. Previously it stopped at 1.0, so any value
# above 1.0 silently kept code 0 and was indistinguishable from "never".
frequency_edges = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, np.inf]

for feature in columns:
    bucket_col = feature + '_RANGE'
    data[bucket_col] = 0  # rows matched by no interval below keep code 0
    for code, (low, high) in enumerate(zip(frequency_edges, frequency_edges[1:]), start=1):
        in_bucket = (data[feature] > low) & (data[feature] <= high)
        data.loc[in_bucket, bucket_col] = code
    

In [477]:
columns = ['PURCHASES_TRX', 'CASH_ADVANCE_TRX']

# Bucket boundaries: code k (1-based) covers (trx_edges[k-1], trx_edges[k]].
# The last edge is infinity, so code 8 means "more than 100".
trx_edges = [0, 5, 10, 15, 20, 30, 50, 100, np.inf]

for feature in columns:
    bucket_col = feature + '_RANGE'
    data[bucket_col] = 0  # rows matched by no interval below keep code 0
    for code, (low, high) in enumerate(zip(trx_edges, trx_edges[1:]), start=1):
        in_bucket = (data[feature] > low) & (data[feature] <= high)
        data.loc[in_bucket, bucket_col] = code

In [478]:
# The raw columns that were binned into *_RANGE columns above; only the binned
# versions (plus whatever else remains in `data`) go into the model.
raw_features = [
    'BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES',
    'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE',
    'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
    'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
    'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS',
    'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT',
]
data.drop(columns=raw_features, inplace=True)

# Plain NumPy feature matrix for the scaler and the clustering steps.
X = np.asarray(data)

In [479]:
data.head()

In [480]:
X

### Feature scaling

In [481]:
# Standardise every feature with StandardScaler: learn the per-column
# statistics from X, then apply them to X itself.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)
X.shape

In [482]:
X

## Part 2: Elbow method (finding the optimal number of clusters)


In [483]:
from sklearn.cluster import KMeans

In [484]:
# Elbow method: fit k-means for k = 1 .. n_clusters - 1 and record the
# within-cluster sum of squares (inertia) of each fit.
n_clusters = 30  # exclusive upper bound on k
cluster_counts = range(1, n_clusters)
wcss = []
for k in cluster_counts:
    # Fixed seed so the curve is the same on every run.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Plot against the actual k values; plotting `wcss` alone would put the points
# at list indices 0..28 and shift the curve one unit left of the axis label.
plt.plot(cluster_counts, wcss, 'bx-')
plt.title('The Elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()


## Part 3: Building the model
Choosing 6 clusters

In [485]:
from sklearn.cluster import KMeans

In [486]:
# Six clusters, with a fixed seed: the visualisation cell attaches a
# hand-written description to each label number, so the numbering has to be
# the same on every run.
# NOTE(review): the descriptions were written for an earlier, unseeded fit -
# re-check them against the clusters this seed produces.
kmeans = KMeans(n_clusters=6, random_state=42)

In [487]:
# dependent variable: the cluster index assigned to each row of X
kmeans.fit(X)
labels = kmeans.labels_

# Later cells read the assignments under the name `y_kmeans`, which was never
# assigned anywhere in this file; bind it here so the notebook runs from top
# to bottom on a fresh kernel.
y_kmeans = labels


In [488]:
labels

In [489]:
# Put the cluster label next to the binned features as a new 'cluster' column.
cluster_column = pd.Series(labels, name='cluster')
clusters = pd.concat([data, cluster_column], axis=1)
clusters.head()

In [490]:
kmeans

In [491]:
y_kmeans

## Interpretation of Clusters 

In [492]:
# For every column, draw one row of histograms with one panel per cluster.
for feature in clusters.columns:
    panels = sns.FacetGrid(clusters, col='cluster')
    panels.map(plt.hist, feature)

### Part 3.5: Visualization of Clusters
Using PCA to transform data to 2 dimensions for visualization

In [493]:
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine distance between the rows of X.
dist = 1 - cosine_similarity(X)

# Reduce the distance matrix to its first two principal components.
pca = PCA(n_components=2)
X_PCA = pca.fit(dist).transform(dist)
X_PCA.shape

In [494]:
X_PCA

In [495]:
# Scatter the 2-D PCA projection with one colour and one legend entry per
# cluster label.
x, y = X_PCA[:, 0], X_PCA[:, 1]

colors = {0: 'red',
          1: 'blue',
          2: 'green',
          3: 'yellow',
          4: 'orange',
          5: 'purple'}

# NOTE(review): these descriptions are keyed by label number, and k-means
# numbers its clusters arbitrarily - re-check the mapping after any re-fit.
names = {0: 'who make all type of purchases',
         1: 'more people with due payments',
         2: 'who purchases mostly in installments',
         3: 'who take more cash in advance',
         4: 'who make expensive purchases',
         5: 'who don\'t spend much money'}

df = pd.DataFrame({'x': x, 'y': y, 'label': labels})
groups = df.groupby('label')

fig, ax = plt.subplots(figsize=(20, 13))

for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=5,
            color=colors[name], label=names[name], mec='none')

# Axis styling does not depend on the group, so it is applied once instead of
# on every loop iteration. The PCA coordinates carry no units, so ticks and
# tick labels are hidden; this needs real booleans (the string 'off' is truthy
# and hid nothing), and the y axis takes left/right rather than top.
ax.set_aspect('auto')
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)

ax.legend()
ax.set_title("Customers Segmentation based on their Credit Card usage behaviour.")
plt.show()

## Part 4: Getting the output

In [496]:
y_kmeans.shape

In [497]:
y_kmeans = y_kmeans.reshape(len(y_kmeans), 1)

In [498]:
y_kmeans.shape

In [499]:
b = np.concatenate((y_kmeans, dr), axis=1)

In [500]:
dr.columns

In [501]:
# Column labels for the combined array `b`: the cluster number first, followed
# by the 17 feature names.
output_columns = [
    'Cluster_Number',
    'BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES', 'ONEOFF_PURCHASES',
    'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE', 'PURCHASES_FREQUENCY',
    'ONEOFF_PURCHASES_FREQUENCY', 'PURCHASES_INSTALLMENTS_FREQUENCY',
    'CASH_ADVANCE_FREQUENCY', 'CASH_ADVANCE_TRX', 'PURCHASES_TRX',
    'CREDIT_LIMIT', 'PAYMENTS', 'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT',
    'TENURE',
]
dr_final = pd.DataFrame(data=b, columns=output_columns)

In [502]:
dr_final.head()

In [503]:
# Write the labelled table as CSV to a file named 'Segmented_customers'
# (no extension) in the current working directory.
dr_final.to_csv('Segmented_customers')