# <div style="font-family: cursive; background-color: #03045eff; color: #FFFFFF; padding: 12px; line-height: 1.5;">2. Importing Libraries 📚</div>
<div style="font-family: cursive; line-height: 2; font-size:18px">
    📌 <b>Importing libraries</b> that will be used in this notebook.
</div>

In [None]:
# base
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

# categorical encoding
from sklearn.preprocessing import OneHotEncoder

# Shared list of colour hex codes used by the plotting cells
color_mix = [
    '#03045e',
    '#023e8a',
    '#0077b6',
    '#0096c7',
    '#00b4d8',
    '#48cae4',
    '#90e0ef',
    '#A5E6F3',
    '#caf0f8',
]

# Suppress every warning so the rendered cell output stays uncluttered
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the credit data CSV into the notebook-wide `data` frame
csv_path = "../input/german-credit/german_credit_data.csv"
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows, shaded with a blue gradient and rendered in a cursive font
preview_style = data.head().style.background_gradient(cmap = 'Blues')
preview_style.set_properties(**{'font-family': 'Cursive'})

In [None]:
# Report the overall size of the data set
n_rows, n_cols = data.shape
banner = '*' * 30

print(banner)
print('** Dataset Info **')
print(banner)
print('Total Rows:', n_rows)
print('Total Columns:', n_cols)

print('\n')

# Report the per-column summary from DataFrame.info (memory usage omitted)
print(banner)
print('** Dataset Details **')
print(banner)
data.info(memory_usage = False)

In [None]:
# Remove the leftover first column from the CSV. The drop is guarded and not
# in-place, so re-running this cell cannot strip a real feature (the original
# positional, in-place drop removed one more column on every execution).
# NOTE(review): assumes the first column is an unnamed row index that pandas
# auto-labelled "Unnamed: ..." -- confirm against the CSV header.
leading_col = data.columns[0]
if str(leading_col).startswith('Unnamed'):
    data = data.drop(columns=[leading_col])

# <div style="font-family: Cursive; background-color: #03045e; color: #FFFFFF; padding: 12px; line-height: 1.5;">4. Data Exploration 🔍</div>
<div style="font-family: Cursive; line-height: 2; font-size:18px">
    📌 This section focuses on <b>initial data exploration</b> before pre-processing the data.
</div>


## <div style="font-family: Cursive; background-color: #023e8a; color: #FFFFFF; padding: 12px; line-height: 1.5;">4.1 Categorical Variable 🔠</div>
<div style="font-family: Cursive; line-height: 2; font-size:18px">
  📌 This section focuses on identifying <b>categorical data</b>.
</div>

In [None]:
# Collect and list every column whose dtype is the generic 'object' type
banner = '*' * 30
print(banner)
print('** Categorical Columns **')
print(banner)

categorical = [col for col in data.columns if data[col].dtype == 'object']

for col_name in categorical:
    print(col_name)



## <div style="font-family: Cursive; background-color: #023e8a; color: #FFFFFF; padding: 12px; line-height: 1.5;">4.2 Numerical Variables 🔠</div>
<div style="font-family: Cursive; line-height: 2; font-size:18px">
    📌 This section focuses on identifying <b>numerical data</b>.
</div>

In [None]:
# Print Numerical Columns
print('*' * 30)
print('** Numerical Columns **')
print('*' * 30)

# Select by numeric kind instead of matching the exact 'int64' dtype, so float
# or narrower integer columns are not silently left out of the analysis.
# Column order follows the order in `data`.
numerical = data.select_dtypes(include='number').columns.tolist()

for col_name in numerical:
    print(col_name)


### <div style="font-family:Cursive; background-color: #0077b6; color: #FFFFFF; padding: 12px; line-height: 1.5;">4.2.1 Descriptive Statistics 📏</div>
<div style="font-family: Cursive; line-height: 2; font-size:18px">
    📌 Let's have a look at the <b>descriptive statistics</b> of numerical variables.
</div>

In [None]:
# Summary statistics with one row per numerical column, shaded with a blue gradient
numeric_summary = data[numerical].describe().transpose()
numeric_summary.style.background_gradient(cmap = 'Blues').set_properties(**{'font-family': 'Cursive'})

### <div style="font-family: Cursive; background-color: #0077b6; color: #FFFFFF; padding: 12px; line-height: 1.5;">4.2.2 Distribution of Numerical Variables 📊</div>
<div style="font-family: Cursive; line-height: 2; font-size:18px">
    📌 The distribution of each numerical column is shown using <b>histograms and box plots</b> in sections 5.1–5.4 below.
</div>

# <div style="font-family: Trebuchet MS; background-color: #03045e; color: #FFFFFF; padding: 12px; line-height: 1.5;">5. EDA 🔍📈👓</div>
<div style="font-family: Cursive; line-height: 2; font-size:18px">
    📌 This section will perform some <b>EDA</b> to get more insights about the dataset.
</div>

## <div style="font-family: Cursive; background-color: #023e8a; color: #FFFFFF; padding: 12px; line-height: 1.5;">5.1 Distribution of Numerical Variables 📊: Age</div>

In [None]:
# Series to plot and its colour from the shared palette
var = data['Age']
color = color_mix[0]

# Skewness & kurtosis of the series (pandas defaults: axis 0, NaNs skipped)
banner = '*' * 40
print(banner)
print('** Age Skewness & Kurtosis**')
print(banner)
print('Skewness: {:.3f}'.format(var.skew()))
print('Kurtosis: {:.3f}'.format(var.kurt()))
print('\n')

# One figure with two panels: box plot on the left, histogram on the right
fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize = (14, 10))
fig.suptitle('Age Distribution', fontweight = 'bold', fontsize = 16)
fig.subplots_adjust(top = 0.9)

# Histogram with a KDE overlay
sns.histplot(data = data, x = var, kde = True, color = color, ax = hist_ax)
hist_ax.set_title('Histogram Plot', fontweight = 'bold', fontsize = 14)
hist_ax.set_xlabel('Age', fontweight = 'regular', fontsize = 11)
hist_ax.set_ylabel('Total', fontweight = 'regular', fontsize = 11)
hist_ax.grid(axis = 'both', alpha = 0.2)

# Box plot
sns.boxplot(y = var, data = data, color = color, linewidth = 1.5, ax = box_ax)
box_ax.set_title('Box Plot', fontweight = 'bold', fontsize = 14)
box_ax.set_ylabel('Age', fontweight = 'regular', fontsize = 11)
box_ax.grid(axis = 'y', alpha = 0.2)
plt.show();

## <div style="font-family: Cursive; background-color: #023e8a; color: #FFFFFF; padding: 12px; line-height: 1.5;">5.2 Distribution of Numerical Variables 📊: Job</div>

In [None]:
# Series to plot and its colour from the shared palette
var = data['Job']
color = color_mix[1]

# Skewness & kurtosis of the series (pandas defaults: axis 0, NaNs skipped)
banner = '*' * 40
print(banner)
print('** Job Skewness & Kurtosis **')
print(banner)
print('Skewness: {:.3f}'.format(var.skew()))
print('Kurtosis: {:.3f}'.format(var.kurt()))
print('\n')

# One figure with two panels: box plot on the left, histogram on the right
fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize = (14, 10))
fig.suptitle('Job Distribution', fontweight = 'bold', fontsize = 16)
fig.subplots_adjust(top = 0.9)

# Histogram with a KDE overlay
sns.histplot(data = data, x = var, kde = True, color = color, ax = hist_ax)
hist_ax.set_title('Histogram Plot', fontweight = 'bold', fontsize = 14)
hist_ax.set_xlabel('Job', fontweight = 'regular', fontsize = 11)
hist_ax.set_ylabel('Total', fontweight = 'regular', fontsize = 11)
hist_ax.grid(axis = 'both', alpha = 0.2)

# Box plot
sns.boxplot(y = var, data = data, color = color, linewidth = 1.5, ax = box_ax)
box_ax.set_title('Box Plot', fontweight = 'bold', fontsize = 14)
box_ax.set_ylabel('Job', fontweight = 'regular', fontsize = 11)
box_ax.grid(axis = 'y', alpha = 0.2)
plt.show();



## <div style="font-family: Cursive; background-color: #023e8a; color: #FFFFFF; padding: 12px; line-height: 1.5;">5.3 Distribution of Numerical Variables 📊: Credit amount</div>

In [None]:
# Series to plot and its colour from the shared palette
var = data['Credit amount']
color = color_mix[2]

# Skewness & kurtosis of the series (pandas defaults: axis 0, NaNs skipped)
banner = '*' * 40
print(banner)
print('** Credit Skewness & Kurtosis**')
print(banner)
print('Skewness: {:.3f}'.format(var.skew()))
print('Kurtosis: {:.3f}'.format(var.kurt()))
print('\n')

# One figure with two panels: box plot on the left, histogram on the right
fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize = (14, 10))
fig.suptitle('Credit Amount Distribution', fontweight = 'bold', fontsize = 16)
fig.subplots_adjust(top = 0.9)

# Histogram with a KDE overlay
sns.histplot(data = data, x = var, kde = True, color = color, ax = hist_ax)
hist_ax.set_title('Histogram Plot', fontweight = 'bold', fontsize = 14)
hist_ax.set_xlabel('Credit Amount', fontweight = 'regular', fontsize = 11)
hist_ax.set_ylabel('Total', fontweight = 'regular', fontsize = 11)
hist_ax.grid(axis = 'both', alpha = 0.2)

# Box plot
sns.boxplot(y = var, data = data, color = color, linewidth = 1.5, ax = box_ax)
box_ax.set_title('Box Plot', fontweight = 'bold', fontsize = 14)
box_ax.set_ylabel('Credit Amount', fontweight = 'regular', fontsize = 11)
box_ax.grid(axis = 'y', alpha = 0.2)
plt.show();

## <div style="font-family: Cursive; background-color: #023e8a; color: #FFFFFF; padding: 12px; line-height: 1.5;">5.4 Distribution of Numerical Variables 📊: Duration</div>

In [None]:
# Series to plot and its colour from the shared palette
var = data['Duration']
color = color_mix[3]

# Skewness & kurtosis of the series (pandas defaults: axis 0, NaNs skipped)
banner = '*' * 40
print(banner)
print('** Duration Skewness & Kurtosis**')
print(banner)
print('Skewness: {:.3f}'.format(var.skew()))
print('Kurtosis: {:.3f}'.format(var.kurt()))
print('\n')

# One figure with two panels: box plot on the left, histogram on the right
fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize = (14, 10))
fig.suptitle('Duration Distribution', fontweight = 'bold', fontsize = 16)
fig.subplots_adjust(top = 0.9)

# Histogram with a KDE overlay
sns.histplot(data = data, x = var, kde = True, color = color, ax = hist_ax)
hist_ax.set_title('Histogram Plot', fontweight = 'bold', fontsize = 14)
hist_ax.set_xlabel('Duration', fontweight = 'regular', fontsize = 11)
hist_ax.set_ylabel('Total', fontweight = 'regular', fontsize = 11)
hist_ax.grid(axis = 'both', alpha = 0.2)

# Box plot
sns.boxplot(y = var, data = data, color = color, linewidth = 1.5, ax = box_ax)
box_ax.set_title('Box Plot', fontweight = 'bold', fontsize = 14)
box_ax.set_ylabel('Duration', fontweight = 'regular', fontsize = 11)
box_ax.grid(axis = 'y', alpha = 0.2)
plt.show();

## <div style="font-family: Cursive; background-color: #023e8a; color: #FFFFFF; padding: 12px; line-height: 1.5;">5.5 Heatmap 🔥</div>

In [None]:
# Correlation Map (Heatmap)
# Correlate the numeric columns only: calling DataFrame.corr() on a frame that
# still holds string-valued columns raises under pandas >= 2.0, where
# non-numeric columns are no longer dropped implicitly.
numeric_corr = data.select_dtypes(include = 'number').corr()

fig, ax = plt.subplots(figsize = (12, 8))
sns.heatmap(numeric_corr, square = True, annot = True, cmap = 'Blues', linewidths = 0.1, ax = ax)
fig.suptitle('Correlation Map', fontweight = 'heavy', fontsize = 14)
fig.tight_layout(rect = [0, 0.04, 1, 1.01]);

## <div style="font-family: Cursive; background-color: #023e8a; color: #FFFFFF; padding: 12px; line-height: 1.5;">5.6 Analysis of Categorical Variables 🆎: Sex</div>

In [None]:
# Palette (also read by the later categorical cells), category counts and bar order
colors = color_mix[0:9]
sex_counts = data['Sex'].value_counts()
order = sex_counts.index

# One figure with two panels: count plot on the left, donut chart on the right
fig, (count_ax, pie_ax) = plt.subplots(1, 2, figsize=(18, 8))
fig.suptitle('Sex', fontweight='heavy', fontsize='16')

# Count plot with the total written above each bar
sns.countplot(x='Sex', data=data, palette=colors, order=order, alpha=0.85, ax=count_ax)
for rect in count_ax.patches:
    # Format as a whole number (same format as the Purpose cell) so a float
    # bar height is not rendered with a trailing ".0"
    count_ax.text(rect.get_x() + rect.get_width() / 2, rect.get_height() + 10,
                  '{:.0f}'.format(rect.get_height()), horizontalalignment='center', fontsize=12)
count_ax.set_title('Histogram', fontweight='bold', fontsize=14)
count_ax.set_xlabel('Sex', fontweight='bold', fontsize=11)
count_ax.set_ylabel('Total', fontweight='bold', fontsize=11)
count_ax.grid(axis='y', alpha=0.4)

# Pie chart with a white disc in the centre (donut look)
pie_ax.set_title('Pie Chart', fontweight='bold', fontsize=14)
pie_ax.pie(sex_counts, colors=colors, labels=order, pctdistance=0.67, autopct='%.2f%%',
           wedgeprops=dict(alpha=0.8, edgecolor='black'), textprops={'fontsize':12})
pie_ax.add_artist(plt.Circle((0, 0), 0.45, fc='white'))

# Lay out once both panels exist, leaving room for the figure title
fig.tight_layout(rect=[0, 0.04, 1, 0.96])

# Print "Sex" Values
print('*' * 30)
print('Sex')
print('*' * 30)
sex_counts

## <div style="font-family: Cursive; background-color: #023e8a; color: #FFFFFF; padding: 12px; line-height: 1.5;">5.7 Analysis of Categorical Variables 🆎: Housing</div>

In [None]:
# Category counts and bar order (most frequent first)
housing_counts = data['Housing'].value_counts()
order = housing_counts.index

# One figure with two panels: count plot on the left, donut chart on the right
fig, (count_ax, pie_ax) = plt.subplots(1, 2, figsize=(18, 8))
fig.suptitle('Housing', fontweight='heavy', fontsize='16')

# Count plot with the total written above each bar.
# The palette comes straight from `color_mix` rather than the `colors` global:
# `colors` is defined in another cell and later rebound to None, so this cell
# would lose its palette when run on its own or re-run later.
sns.countplot(x='Housing', data=data, palette=color_mix, order=order, alpha=0.85, ax=count_ax)
for rect in count_ax.patches:
    # Format as a whole number (same format as the Purpose cell) so a float
    # bar height is not rendered with a trailing ".0"
    count_ax.text(rect.get_x() + rect.get_width() / 2, rect.get_height() + 10,
                  '{:.0f}'.format(rect.get_height()), horizontalalignment='center', fontsize=12)
count_ax.set_title('Histogram', fontweight='bold', fontsize=14)
count_ax.set_xlabel('Housing', fontweight='bold', fontsize=11)
count_ax.set_ylabel('Total', fontweight='bold', fontsize=11)
count_ax.grid(axis='y', alpha=0.4)

# Pie chart with a white disc in the centre (donut look)
pie_ax.set_title('Pie Chart', fontweight='bold', fontsize=14)
pie_ax.pie(housing_counts, colors=color_mix, labels=order, pctdistance=0.67, autopct='%.2f%%',
           wedgeprops=dict(alpha=0.8, edgecolor='black'), textprops={'fontsize':12})
pie_ax.add_artist(plt.Circle((0, 0), 0.45, fc='white'))

# Lay out once both panels exist, leaving room for the figure title
fig.tight_layout(rect=[0, 0.04, 1, 0.96])

# Print "Housing" Values
print('*' * 30)
print('Housing')
print('*' * 30)
housing_counts

## <div style="font-family: Cursive; background-color: #023e8a; color: #FFFFFF; padding: 12px; line-height: 1.5;">5.8 Analysis of Categorical Variables 🆎: Saving accounts</div>

In [None]:
# Category counts and bar order (most frequent first; value_counts skips NaN)
saving_counts = data['Saving accounts'].value_counts()
order = saving_counts.index

# One figure with two panels: count plot on the left, donut chart on the right
fig, (count_ax, pie_ax) = plt.subplots(1, 2, figsize=(18, 8))
fig.suptitle('Saving accounts', fontweight='heavy', fontsize='16')

# Count plot with the total written above each bar.
# The palette comes straight from `color_mix` rather than the `colors` global:
# `colors` is defined in another cell and later rebound to None, so this cell
# would lose its palette when run on its own or re-run later.
sns.countplot(x='Saving accounts', data=data, palette=color_mix, order=order, alpha=0.85, ax=count_ax)
for rect in count_ax.patches:
    # Format as a whole number (same format as the Purpose cell) so a float
    # bar height is not rendered with a trailing ".0"
    count_ax.text(rect.get_x() + rect.get_width() / 2, rect.get_height() + 10,
                  '{:.0f}'.format(rect.get_height()), horizontalalignment='center', fontsize=12)
count_ax.set_title('Histogram', fontweight='bold', fontsize=14)
count_ax.set_xlabel('Saving accounts', fontweight='bold', fontsize=11)
count_ax.set_ylabel('Total', fontweight='bold', fontsize=11)
count_ax.grid(axis='y', alpha=0.4)

# Pie chart with a white disc in the centre (donut look)
pie_ax.set_title('Pie Chart', fontweight='bold', fontsize=14)
pie_ax.pie(saving_counts, colors=color_mix, labels=order, pctdistance=0.67, autopct='%.2f%%',
           wedgeprops=dict(alpha=0.8, edgecolor='black'), textprops={'fontsize':12})
pie_ax.add_artist(plt.Circle((0, 0), 0.45, fc='white'))

# Lay out once both panels exist, leaving room for the figure title
fig.tight_layout(rect=[0, 0.04, 1, 0.96])

# Print "Saving accounts" Values
print('*' * 30)
print('Saving accounts')
print('*' * 30)
saving_counts

## <div style="font-family: Cursive; background-color: #023e8a; color: #FFFFFF; padding: 12px; line-height: 1.5;">5.9 Analysis of Categorical Variables 🆎: Checking account</div>

In [None]:
# Category counts and bar order (most frequent first; value_counts skips NaN)
checking_counts = data['Checking account'].value_counts()
order = checking_counts.index

# One figure with two panels: count plot on the left, donut chart on the right
fig, (count_ax, pie_ax) = plt.subplots(1, 2, figsize=(18, 8))
fig.suptitle('Checking account', fontweight='heavy', fontsize='16')

# Count plot with the total written above each bar.
# The palette comes straight from `color_mix` rather than the `colors` global:
# `colors` is defined in another cell and later rebound to None, so this cell
# would lose its palette when run on its own or re-run later.
sns.countplot(x='Checking account', data=data, palette=color_mix, order=order, alpha=0.85, ax=count_ax)
for rect in count_ax.patches:
    # Format as a whole number (same format as the Purpose cell) so a float
    # bar height is not rendered with a trailing ".0"
    count_ax.text(rect.get_x() + rect.get_width() / 2, rect.get_height() + 5,
                  '{:.0f}'.format(rect.get_height()), horizontalalignment='center', fontsize=12)
count_ax.set_title('Histogram', fontweight='bold', fontsize=14)
count_ax.set_xlabel('Checking account', fontweight='bold', fontsize=11)
count_ax.set_ylabel('Total', fontweight='bold', fontsize=11)
count_ax.grid(axis='y', alpha=0.4)

# Pie chart with a white disc in the centre (donut look)
pie_ax.set_title('Pie Chart', fontweight='bold', fontsize=14)
pie_ax.pie(checking_counts, colors=color_mix, labels=order, pctdistance=0.67, autopct='%.2f%%',
           wedgeprops=dict(alpha=0.8, edgecolor='black'), textprops={'fontsize':12})
pie_ax.add_artist(plt.Circle((0, 0), 0.45, fc='white'))

# Lay out once both panels exist, leaving room for the figure title
fig.tight_layout(rect=[0, 0.04, 1, 0.96])

# Print "Checking account" Values
print('*' * 30)
print('Checking account')
print('*' * 30)
checking_counts

## <div style="font-family: Cursive; background-color: #023e8a; color: #FFFFFF; padding: 12px; line-height: 1.5;">5.10 Analysis of Categorical Variables 🆎: Purpose</div>

In [None]:
# Category counts and bar order (most frequent first)
purpose_counts = data['Purpose'].value_counts()
order = purpose_counts.index

# One figure with two panels: horizontal count plot on the left, donut chart on the right
fig, (count_ax, pie_ax) = plt.subplots(1, 2, figsize=(18, 8))
fig.suptitle('Purpose', fontweight='heavy', fontsize='16')

# Horizontal count plot with the total written to the right of each bar.
# The palette comes straight from `color_mix` rather than the `colors` global:
# `colors` is defined in another cell and later rebound to None, so this cell
# would lose its palette when run on its own or re-run later.
sns.countplot(y='Purpose', data=data, palette=color_mix, order=order, alpha=0.85, ax=count_ax)
for rect in count_ax.patches:
    width, height = rect.get_width(), rect.get_height()
    x, y = rect.get_xy()
    count_ax.text(x + width + 10, y + height / 2, '{:.0f}'.format(width),
                  horizontalalignment='center', verticalalignment='center')
count_ax.set_title('Histogram', fontweight='bold', fontsize=14)
# Bars are horizontal: counts run along x and categories along y
# (the two axis labels were swapped in the original cell)
count_ax.set_xlabel('Total', fontweight='bold', fontsize=11)
count_ax.set_ylabel('Purpose', fontweight='bold', fontsize=11)
count_ax.grid(axis='x', alpha=0.5)

# Pie chart with a white disc in the centre (donut look)
pie_ax.set_title('Pie Chart', fontweight='bold', fontsize=14)
pie_ax.pie(purpose_counts, colors=color_mix, labels=order, pctdistance=0.67, autopct='%.2f%%',
           wedgeprops=dict(alpha=0.8, edgecolor='black'), textprops={'fontsize':12})
pie_ax.add_artist(plt.Circle((0, 0), 0.45, fc='white'))

# Lay out once both panels exist, leaving room for the figure title
fig.tight_layout(rect=[0, 0.04, 1, 0.96])

# Print "Purpose" Values
print('*' * 30)
print('Purpose')
print('*' * 30)
purpose_counts

## <div style="font-family: Cursive; background-color: #023e8a; color: #FFFFFF; padding: 12px; line-height: 1.5;">6.1 Missing Values Analysis</div>

In [None]:
# Count the missing entries in every column
banner = '*' * 45
print(banner)
print('** Total Missing Values in each Columns **')
print(banner)
data.isna().sum()

In [None]:
# Replace missing entries in both account columns with the literal string "none"
for account_col in ("Saving accounts", "Checking account"):
    data[account_col] = data[account_col].fillna("none")

In [None]:
# Re-count the missing entries per column after the fill step
banner = '*' * 45
print(banner)
print('** Total Missing Values in each Columns after imputation**')
print(banner)
data.isna().sum()

## <div style="font-family: Cursive; background-color: #023e8a; color: #FFFFFF; padding: 12px; line-height: 1.5;">6.2 Transform Data 🔗</div>

In [None]:
# Palette slice for this figure and the three columns compared before/after np.log
color = color_mix[0:7]
panels = [
    # (column, x-axis label, colour)
    ('Age', 'Age', color[0]),
    ('Credit amount', 'Credit', color[2]),
    ('Duration', 'Duration', color[4]),
]

# 2 x 3 grid: raw histograms on the top row, log-transformed ones underneath
fig, axes = plt.subplots(2, 3, figsize = (14, 10))
fig.suptitle('Visualisation of log transformation', fontweight='heavy', fontsize='16')

for idx, (column, x_label, shade) in enumerate(panels):
    raw_ax, log_ax = axes[0, idx], axes[1, idx]

    # Top row: histogram of the raw values
    sns.histplot(data = data, x = data[column], kde = True, color = shade, ax = raw_ax)
    raw_ax.set_title('Histogram Plot', fontweight = 'bold', fontsize = 14)

    # Bottom row: histogram of the natural log of the same column
    sns.histplot(data = data, x = np.log(data[column]), kde = True, color = shade, ax = log_ax)
    log_ax.set_title('After Log Transform', fontweight = 'bold', fontsize = 12)

    # Both panels of a column share labels and grid styling
    for panel_ax in (raw_ax, log_ax):
        panel_ax.set_xlabel(x_label, fontweight = 'regular', fontsize = 11)
        panel_ax.set_ylabel('Total', fontweight = 'regular', fontsize = 11)
        panel_ax.grid(axis = 'both', alpha = 0.2)



## <div style="font-family: Cursive; background-color: #023e8a; color: #FFFFFF; padding: 12px; line-height: 1.5;">6.3 Feature Engineering 🛠</div>

In [None]:
# Apply a one-hot encoder to the categorical features.
# The encoder keeps its default sparse output and the result is densified
# explicitly: the `sparse=False` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and later removed, so passing either name ties this cell to
# one range of releases.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
encoded = OH_encoder.fit_transform(data[categorical]).toarray()

# Rebuild a frame on the original row index so it lines up with `data`;
# column labels stay positional integers, as before
OH_cols = pd.DataFrame(encoded, index=data.index)

# Keep the continuous features
df_cont = data.drop(columns=categorical)

# Add the encoded categoricals back next to the continuous features
df_OH = pd.concat([df_cont, OH_cols], axis=1)

#  Print outcome
print('*' * 40)
print('** All features are now numerical**')
print('*' * 40)

In [None]:
# Show the combined frame of continuous and one-hot encoded columns
banner = '*' * 55
print(banner)
print('** Categorical Features ---> Numerical Features **')
print(banner)
df_OH

# <div style="font-family: Cursive; background-color: #03045e; color: #FFFFFF; padding: 12px; line-height: 1.5;">7. K-means Clustering </div>


In [None]:
# Features used for clustering
num_columns = ["Age","Credit amount", "Duration"]

# Take an explicit copy so that adding the 'cluster' column later in the
# notebook writes to an independent frame instead of a selection of `data`
cluster_data = data[num_columns].copy()

In [None]:
# Log-transform only the clustering features. `cluster_data` gains a 'cluster'
# label column later in the notebook; selecting `num_columns` keeps a re-run of
# this cell from taking the log of those labels as well.
cluster_log = np.log(cluster_data[num_columns])

In [None]:
# Fit the scaler on the log-transformed features, then apply it to the same data
scaler = StandardScaler().fit(cluster_log)
cluster_scaled = scaler.transform(cluster_log)

In [None]:
# Candidate cluster counts and the inertia obtained for each of them
clusters_range = range(3,15)
inertia =[]

for c in clusters_range:
    kmeans = KMeans(n_clusters=c, init='k-means++', random_state=0, n_init=30, max_iter=100)
    clusters = kmeans.fit_predict(cluster_scaled)
    inertia.append(kmeans.inertia_)

# Elbow curve: inertia against number of clusters.
# x and y are passed by keyword because seaborn >= 0.12 accepts only `data`
# positionally. The former `palette=colors` argument is dropped: it has no
# effect without a `hue` variable and made this cell depend on a global that
# another cell rebinds to None.
fig, ax = plt.subplots(figsize=(18, 8))
sns.lineplot(x=list(clusters_range), y=inertia, marker='o', markersize=10,
             markeredgewidth=1, markeredgecolor='black', ax=ax)
ax.set_title('Elbow method', fontweight = 'bold', fontsize = 14)
ax.set_xlabel('Number of Clusters', fontweight = 'bold', fontsize = 11)
ax.set_ylabel('Inertia', fontweight = 'bold', fontsize = 11)
ax.grid(axis = 'both', alpha = 0.2)



In [None]:
from sklearn.metrics import silhouette_score, silhouette_samples

# Silhouette score for every candidate number of clusters
for n_clusters in clusters_range:
    # Same KMeans settings as the elbow search and the final model, including
    # the fixed seed, so the scores are reproducible between runs and describe
    # the configuration that is actually used afterwards
    km = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0, n_init=30, max_iter=100)
    preds = km.fit_predict(cluster_scaled)

    score = silhouette_score(cluster_scaled, preds, metric='euclidean')
    print('*' * 50)
    print ("For n_clusters = {}, silhouette score is {:.2f}".format(n_clusters, score))
    print('*' * 50)

In [None]:
# Final model: three clusters, same settings as the elbow search
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=0, n_init=30, max_iter=100)

# Fit on the scaled features and keep one label per row
clusters = kmeans.fit(cluster_scaled).labels_
cluster_data = cluster_data.assign(cluster=clusters)


In [None]:
# Display the clustering features together with the 'cluster' label assigned to each row
cluster_data

In [None]:
# Pairwise scatter plots of the clustered features, coloured by cluster label.
# seaborn's default palette is used by simply not passing `palette`; the
# original cell rebound the notebook-wide `colors` list to None to get the same
# effect, which stripped the palette from earlier plotting cells whenever they
# were re-run after this one.
fig, (ax1, ax2, ax3) = plt.subplots(1,3, figsize=(20,6))
sns.scatterplot(x="Credit amount",y="Duration", hue="cluster", data=cluster_data, ax=ax1)
sns.scatterplot(x="Age",y="Credit amount", hue="cluster", data=cluster_data, ax=ax2)
sns.scatterplot(x="Age",y="Duration", hue="cluster", data=cluster_data, ax=ax3)
plt.tight_layout()

In [None]:
# Mean of every feature within each cluster, rounded to one decimal place
cluster_means = cluster_data.groupby('cluster').mean()
grouped_km = cluster_means.round(1)
grouped_km