In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, normalize
import scipy.cluster.hierarchy as sch
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score

In [3]:
# Upload the workbook through Colab only when it is not already on disk, so
# "Restart & Run All" neither re-prompts for the file nor fails outside Colab.
from pathlib import Path

uploaded = {}
if not Path('EastWestAirlines.xlsx').exists():
    from google.colab import files
    uploaded = files.upload()

Saving EastWestAirlines.xlsx to EastWestAirlines.xlsx


In [4]:
# Read the 'data' sheet of the workbook into a DataFrame and display it.
airline = pd.read_excel(io='EastWestAirlines.xlsx', sheet_name='data')
airline

Unnamed: 0,ID#,Balance,Qual_miles,cc1_miles,cc2_miles,cc3_miles,Bonus_miles,Bonus_trans,Flight_miles_12mo,Flight_trans_12,Days_since_enroll,Award?
0,1,28143,0,1,1,1,174,1,0,0,7000,0
1,2,19244,0,1,1,1,215,2,0,0,6968,0
2,3,41354,0,1,1,1,4123,4,0,0,7034,0
3,4,14776,0,1,1,1,500,1,0,0,6952,0
4,5,97752,0,4,1,1,43300,26,2077,4,6935,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3994,4017,18476,0,1,1,1,8525,4,200,1,1403,1
3995,4018,64385,0,1,1,1,981,5,0,0,1395,1
3996,4019,73597,0,3,1,1,25447,8,0,0,1402,1
3997,4020,54899,0,1,1,1,500,1,500,1,1401,0


In [None]:
# Print the column names, non-null counts and dtypes of the raw frame.
airline.info()

In [None]:
# Summary statistics for the columns of the raw frame.
airline.describe()

In [None]:
# Number of missing entries in each column
print(airline.isna().sum())

In [None]:
# Number of rows that exactly repeat an earlier row
print(airline.duplicated(keep='first').sum())

In [None]:
# Work on an independent copy so the raw `airline` frame stays untouched;
# the ID column is removed from this working frame in the next cell.

data = airline.copy(deep=True)

In [None]:
# Remove the 'ID#' identifier column before clustering.
# The frame is rebuilt from `airline` instead of being mutated in place, so the
# cell can be re-run safely (an in-place drop raises KeyError the second time
# because the column is already gone).
data = airline.drop(columns=['ID#'])
data

In [None]:
# Rename the 'Award?' column to 'Award' (drops the question mark)
data = data.rename(columns={'Award?': 'Award'})
data

#Standardizing and Normalizing the data

In [None]:
# data_std: every column standardized by StandardScaler.
# data_norm: every row rescaled to unit L2 norm by `normalize`
# (applied to the unscaled features).
scaler = StandardScaler()
data_std = scaler.fit_transform(data)
data_norm = normalize(data, norm='l2', axis=1)

In [None]:
# Display the standardized feature matrix produced by StandardScaler.
data_std

In [None]:
# Display the row-normalized feature matrix produced by `normalize`.
data_norm

#Now that we have the standardized and normalized data, let us apply several clustering algorithms:
#Hierarchical
#K-means
#DBSCAN

In [None]:
# Hierarchical clustering:
# dendrogram of the row-normalized data, built with complete linkage.
sns.set_style('darkgrid')
linkage_matrix = sch.linkage(data_norm, method='complete')
plt.figure(figsize=(10, 7))
dendrogram = sch.dendrogram(linkage_matrix)
plt.show()

In [None]:
# Agglomerative model with 5 clusters.
# The linkage is set explicitly to 'complete' so the model agrees with the
# dendrogram, which is built with method='complete'; left unspecified,
# scikit-learn would use 'ward' and the dendrogram would describe a different
# merge hierarchy than the one being fitted.
hclustering = AgglomerativeClustering(n_clusters=5, linkage='complete')
hclustering

In [None]:
# Fit the agglomerative model on the normalized data and count the rows
# assigned to each cluster.
hc_labels = hclustering.fit_predict(data_norm)
hc_result = pd.DataFrame({'ClusterID': hc_labels})
hc_result.value_counts()

In [None]:
# Separate copy of the features that will carry the hierarchical labels
data1 = data.copy(deep=True)

In [None]:
# Attach the hierarchical cluster label of every row as column 'HCluster'
data1 = data1.assign(HCluster=hclustering.labels_)

In [None]:
# Display the data together with the hierarchical cluster labels ('HCluster')
data1

In [None]:
# Mean of every feature within each hierarchical cluster
data1.groupby(by='HCluster').aggregate(['mean'])

In [None]:
# Balance of every customer plotted against its hierarchical cluster label

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data1['HCluster'], data1['Balance'], c=hclustering.labels_)
ax.set_xlabel('Clusters')
ax.set_ylabel('Balance')
ax.set_title('Scatter Plot for Hierarchical Clustering Distribution')

#K-Means clustering

In [None]:
# Elbow method: record the within-cluster sum of squares (WCSS, exposed by
# KMeans as `inertia_`) for k = 1..10 on the standardized data.

wcss = []
for n_clusters in range(1, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(data_std)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow graph: WCSS against the number of clusters

fig, ax = plt.subplots()
ax.plot(range(1, 11), wcss)
ax.set_xlabel('Clusters')
ax.set_ylabel('WCSS')
ax.set_title('Elbow Graph')

In [None]:
# The elbow graph shows only gradual drops from k = 4 onward,
# so the final model uses 4 clusters.

kmeans = KMeans(n_clusters=4, random_state=0).fit(data_std)
kmeans

In [None]:
# Cluster label assigned by the fitted K-Means model to each row of data_std.
kmeans.labels_

In [None]:
# data2 = the features plus each row's K-Means label in column 'KCluster'

data2 = data.assign(KCluster=kmeans.labels_)
data2

In [None]:
# Mean of every feature within each K-Means cluster

data2.groupby(by='KCluster').aggregate(['mean'])

In [None]:
# Balance of every customer plotted against its K-Means cluster label.
# Both axes are read from data2 (the frame that holds 'KCluster') so they are
# guaranteed to be row-aligned; points are coloured by cluster and the figure
# is labelled the same way as the hierarchical-clustering scatter plot.

plt.figure(figsize=(10, 6))
plt.scatter(data2['KCluster'], data2['Balance'], c=data2['KCluster'])
plt.xlabel('Clusters')
plt.ylabel('Balance')
plt.title('Scatter Plot for K-Means Clustering Distribution')
plt.show()

#DBSCAN Clustering
•Determining Eps and MinPts
•As a rule of thumb, minPts should be greater than 3 and depends on the dimensionality of the data

•Since this data has 11 features, minPts = 2 * dim = 2 * 11 = 22

•To determine Eps, we plot the sorted k-nearest-neighbour distances (the k-distance graph) using NearestNeighbors with n_neighbors = 22 and look for the elbow

In [None]:
# k-distance graph used to choose eps for DBSCAN.
from sklearn.neighbors import NearestNeighbors

# n_neighbors matches the chosen minPts (22). Because the query points are the
# same points the model was fitted on, column 0 of `distances` is each point's
# distance to itself.
neighbors = NearestNeighbors(n_neighbors=22)
neighbors_fit = neighbors.fit(data_std)
distances, indices = neighbors_fit.kneighbors(data_std)

# Plot the distance to the farthest of the 22 neighbours (last column) rather
# than to the single nearest neighbour (column 1): the elbow of the sorted
# k-th neighbour distances is what suggests eps for min_samples = k.
# NOTE(review): re-read eps from this curve before trusting the value used for
# DBSCAN in the next cell.
distances = np.sort(distances[:, -1])
plt.plot(distances)
plt.xlabel('Points sorted by k-distance')
plt.ylabel('k-distance (k = 22)')
plt.title('k-distance Graph')
plt.show()

In [None]:
# eps = 2.5 is taken from the elbow of the k-neighbour distance graph.
# min_samples = 22 follows the minPts = 2 * 11 = 22 rule stated for this data
# and the n_neighbors=22 used to build the k-distance graph.

dbscan = DBSCAN(eps=2.5, min_samples=22)
dbscan.fit(data_std)

In [None]:
# Mean silhouette coefficient over all rows. DBSCAN noise points (label -1)
# are passed in unchanged, so they are scored like a cluster of their own.
silhouette_score(X=data_std, labels=dbscan.labels_)

In [None]:
# 0.63 is a good Silhouette Score for the data
# data3 = the features plus each row's DBSCAN label in column 'DBS'
data3 = data.assign(DBS=dbscan.labels_)

In [None]:
# Number of points assigned to each DBSCAN label

data3['DBS'].value_counts()

In [None]:
# Bar chart of how many points fall under each DBSCAN label

ax = data3.groupby('DBS')['Balance'].count().plot.bar()
ax.set_xlabel('DBSCAN clusters with noise(-1)')
ax.set_ylabel('Count of Datapoints')
ax.set_title('DBSCAN Clustering')
plt.show()