In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

import plotly as py
import plotly.graph_objs as go

from sklearn import preprocessing 
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering 

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the red-wine quality CSV (path is relative to the notebook's working
# directory) into `df`, then preview the first five rows.
data = '../datasets/winequality-red.csv'
df = pd.read_csv(filepath_or_buffer=data)
df.head(n=5)

In [None]:
# Remove every column that is not used for clustering.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises KeyError.
unused_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid',
    'free sulfur dioxide', 'total sulfur dioxide', 'chlorides',
    'pH', 'sulphates', 'quality',
]
df = df.drop(columns=unused_columns, errors='ignore')

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Display pandas' default summary statistics for the columns of df.
df.describe()

In [None]:
# Distribution of each clustering feature, one panel per column.
#
# sns.distplot is deprecated (the warning is hidden by the global
# warnings filter); histplot with kde=True, stat='density' and cut=3 is the
# replacement seaborn documents for the same histogram + KDE picture.
fig, axes = plt.subplots(1, 3, num=1, figsize=(15, 6))
# Spacing is loop-invariant, so set it once for the whole figure.
fig.subplots_adjust(hspace=0.5, wspace=0.5)
for ax, x in zip(axes, ['alcohol', 'residual sugar', 'density']):
    sns.histplot(df[x], bins=15, kde=True, stat='density',
                 kde_kws=dict(cut=3), ax=ax)
    ax.set_title('Distplot of {}'.format(x))
plt.show()

In [None]:
# Correlation heatmap
#
# A heatmap of the raw frame draws one strip per row with a colour scale
# dominated by whichever column has the largest values, so it carries no
# readable information. Plot the pairwise feature correlations instead.
# NOTE(review): assumes a correlation matrix was the intent - confirm.
# A 'cluster' column (added further down) is excluded so that re-running this
# cell later still shows only the measured features.
feature_corr = df.drop(columns='cluster', errors='ignore').corr()

plt.figure(1, figsize = (16 ,8))
sns.heatmap(feature_corr, annot=True, vmin=-1, vmax=1, cmap='coolwarm')
plt.title('Feature correlation')
plt.show()

In [None]:
# Dendrogram
#
# Ward-linkage hierarchy over the feature columns. A 'cluster' column (added
# further down) is excluded so that re-running this cell later does not feed
# the previous labels back into the linkage.
linkage_features = df.drop(columns='cluster', errors='ignore')

plt.figure(1, figsize = (16 ,8))
# no_labels=True: one tick label per sample is unreadable at this row count.
dendrogram = sch.dendrogram(sch.linkage(linkage_features, method  = "ward"),
                            no_labels=True)

plt.title('Dendrogram')
# The x-axis of a dendrogram enumerates the observations and the y-axis is the
# merge distance - not individual feature values.
plt.xlabel('Wine samples')
plt.ylabel('Ward linkage distance')
plt.show()

In [None]:
# Agglomerative (Ward) clustering into 5 groups.
#
# The `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4,
# where passing it raises TypeError. Ward linkage only supports Euclidean
# distance, which is also the default, so the argument is simply omitted; this
# works on both old and new scikit-learn releases.
hc = AgglomerativeClustering(n_clusters = 5, linkage ='ward')

# Fit on the feature columns only: if this cell is re-run after the 'cluster'
# column has been added to df, the old labels must not become an input feature.
# NOTE(review): features are passed unscaled; `preprocessing` is imported but
# unused in the visible cells - confirm whether standardisation was intended.
cluster_features = df.drop(columns='cluster', errors='ignore')
y_hc = hc.fit_predict(cluster_features)
y_hc

In [None]:
# Attach the cluster label of each row. Assigning the array directly is
# positional; wrapping it in a DataFrame would align on index labels and
# silently produce NaN for any row whose index is not 0..n-1.
df['cluster'] = y_hc

In [None]:
# Interactive 3-D scatter of the three features, coloured by cluster label.
trace1 = go.Scatter3d(
    x=df['alcohol'],
    y=df['density'],
    z=df['residual sugar'],
    mode='markers',
    marker={
        'color': df['cluster'],
        'size': 10,
        # Marker outline uses the same per-point colour as the fill.
        'line': {'color': df['cluster'], 'width': 12},
        'opacity': 0.8,
    },
)
data = [trace1]

# Axis titles mirror the columns bound to x / y / z above.
layout = go.Layout(
    title='Agglomerative Clustering',
    scene={
        'xaxis': {'title': 'alcohol'},
        'yaxis': {'title': 'density'},
        'zaxis': {'title': 'residual sugar'},
    },
)

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

In [None]:
# 2-D scatter of the clusters.
#
# Columns are selected by name and the axis labels are taken from the same
# names, so the labels can never disagree with the plotted data. The original
# positional iloc[:, [1, 2]] was labelled 'density' / 'residual sugar'.
# NOTE(review): with the standard UCI column order, positions 1 and 2 are
# density and alcohol, which also matches the title - confirm this pairing.
x_col, y_col = 'density', 'alcohol'
X = df[[x_col, y_col]].values

# One (colour, legend label) pair per cluster id 0..4.
cluster_styles = [
    ('red', 'Cluster A'),
    ('blue', 'Cluster B'),
    ('green', 'Cluster C'),
    ('purple', 'Cluster D'),
    ('orange', 'Cluster E'),
]
for cluster_id, (color, label) in enumerate(cluster_styles):
    members = y_hc == cluster_id
    plt.scatter(X[members, 0], X[members, 1], s=100, c=color, label=label)

plt.title('Hierarchical Clustering of Alcohol Density')
plt.xlabel(x_col)
plt.ylabel(y_col)
# The label= arguments above are only rendered once a legend is requested.
plt.legend()
plt.show()

In [None]:
# Preview the first rows of df; when the cells above have run in order this
# includes the 'cluster' column assigned earlier.
df.head()