# ***Perform clustering (hierarchical, K-means and DBSCAN) on the airlines data to obtain the optimum number of clusters.***
# ***Draw inferences from the clusters obtained.***


In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize

# **Importing Dataset**

In [None]:
# Upload the dataset when running on Google Colab. Outside Colab the import
# is unavailable, so the upload is skipped and the CSV is expected to already
# be in the working directory.
try:
    from google.colab import files
except ImportError:
    uploaded = None
else:
    uploaded = files.upload()

In [None]:
# Import Dataset
airline=pd.read_csv('EastWestAirlines.csv')
airline

In [None]:
airline.info()

In [None]:
airline1=airline.drop(['ID#'],axis=1)
airline1

# **Hierarchical clustering**

In [None]:
# Normalise the heterogeneous numerical data.
# With norm='l2' and axis=1 every row is rescaled to unit Euclidean length;
# the columns are not scaled independently of each other.
airline1_norm = pd.DataFrame(
    data=normalize(airline1, norm='l2', axis=1),
    columns=airline1.columns,
)
airline1_norm

In [None]:
# Create the dendrogram.
# Ward linkage is used so the tree reflects the same merge criterion as the
# AgglomerativeClustering model (linkage='ward') built from this data.
plt.figure(figsize=(10, 7))
linkage_matrix = sch.linkage(airline1_norm, method='ward')
dendograms = sch.dendrogram(linkage_matrix)

In [None]:
# Create the hierarchical clustering model with 5 clusters.
# Ward linkage only works with Euclidean distances, so no distance argument is
# passed. The former `affinity` keyword is rejected by recent scikit-learn
# releases, and omitting it keeps the cell working on old and new versions.
hclusters = AgglomerativeClustering(n_clusters=5, linkage='ward')
hclusters

In [None]:
y=pd.DataFrame(hclusters.fit_predict(airline1_norm),columns=['clustersid'])
y['clustersid'].value_counts()

In [None]:
# Adding clusters to dataset
airline1['clustersid']=hclusters.labels_
airline1

In [None]:
airline1.groupby('clustersid').agg(['mean']).reset_index()

In [None]:
airline2 = airline1.sort_values('clustersid')
airline2.iloc[:,[0,-1]]

# **K-means Clustering**

In [None]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

In [None]:
airline2=pd.read_csv('EastWestAirlines.csv')
airline2

In [None]:
airline2=airline.drop(['ID#'],axis=1)
airline2

In [None]:
# standarization of data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_df = scaler.fit_transform(airline2.iloc[:,0:])

In [None]:
scaled_df

In [None]:
# How to find the optimum number of clusters
# The K-means algorithm aims to choose centroids that minimise the inertia, or within-cluster sum-of-squares criterion.

In [None]:
kmeans = KMeans(n_clusters=4,random_state=0)
predict=kmeans.fit_predict(scaled_df)

In [None]:
predict

In [None]:
# Elbow method: fit K-means for k = 1..10 and record the within-cluster sum
# of squares (inertia) reached for each k.
k_values = range(1, 11)
wcss = []
for k in k_values:
    # A dedicated name is used so the 4-cluster `kmeans` model fitted earlier
    # is not overwritten by the last model of this sweep.
    elbow_model = KMeans(n_clusters=k, random_state=0)
    elbow_model.fit(scaled_df)
    wcss.append(elbow_model.inertia_)

# Markers and one tick per k make the bend in the curve easier to read.
plt.plot(k_values, wcss, marker='o')
plt.xticks(list(k_values))
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
#Build Cluster algorithm
clusters_new = KMeans(5, random_state=0)
clusters_new.fit(scaled_df)

In [None]:
clusters_new.labels_

In [None]:
#Assign clusters to the data set
airline2['clusterid_new'] = clusters_new.labels_

In [None]:
airline2

In [None]:
#these are standardized values.
clusters_new.cluster_centers_

In [None]:
airline2.groupby('clusterid_new').agg(['mean']).reset_index()

In [None]:
airline2[airline2['clusterid_new']==0]

In [None]:
airline2[airline2['clusterid_new']==1]

In [None]:
airline2[airline2['clusterid_new']==2]

In [None]:
airline2[airline2['clusterid_new']==3]

# **DBSCAN Clustering**

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

In [None]:
airline3=pd.read_csv('EastWestAirlines.csv')
airline3

In [None]:
airline3=airline.drop(['ID#'],axis=1)
airline3

In [None]:
airline3.info()

In [None]:
df=airline3

In [None]:
df.values

In [None]:
stscaler=StandardScaler().fit(df.values)
x=stscaler.transform(df.values)

In [None]:
x

In [None]:
dbscan=DBSCAN(eps=1,min_samples=8)
dbscan.fit(x)

In [None]:
dbscan.labels_

In [None]:
cl=pd.DataFrame(dbscan.labels_,columns=['cluster'])

In [None]:
cl

In [None]:
clustered=pd.concat([df,cl],axis=1)

In [None]:
clustered

In [None]:
clustered[clustered['cluster']==-1]

In [None]:
clustered.mean()

In [None]:
clustered[clustered['cluster']==-0]

In [None]:
clustered[clustered['cluster']==1]