<a href="https://colab.research.google.com/github/Christina-Joseph/CE888-7-SP-Data-Science-and-Decision-Making/blob/main/exploration/lab_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.datasets import load_wine

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

* We have loaded the necessary libraries above
* Now let's load the data

In [None]:
# Fetch the wine dataset as pandas objects.
# Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_wine.html
data = load_wine(as_frame=True)

# Build a fresh frame with every feature column plus the label stored in a column called 'y'.
df = data.data.assign(y=data.target)
df.head()

In [None]:
# Correlation between the alcohol and ash columns (Series.corr defaults to Pearson).
alcohol, ash = df['alcohol'], df['ash']
alcohol.corr(ash)

In [None]:
# Average alcohol value over all samples (a Series has a single axis, so no axis argument is needed).
df['alcohol'].mean()

In [None]:
# How many samples and features are there?
# df.shape is (rows, columns); the label column 'y' is not a feature, so it is subtracted.
row, n_columns = df.shape
column = n_columns - 1
print('number of samples:', row, 'number of features:', column)

In [None]:
# Quiz Q1
# How many instances are there for each class?

# Tally how often each label value occurs.
class_labels = df['y']
class_labels.value_counts()

In [None]:
# Calculate the means, medians, and standard deviations of each feature
# describe() reports 'mean' and 'std' rows directly; the median is its '50%' row.
summary_stats = df.describe()
summary_stats

In [None]:
# Plot a boxplot of each feature (NOT THE LABEL!!) (all in one plot -- you can do this with a single line of code).
# Quiz Q2: Is there any feature that is in a clearly different scale than the others?

# Drop the label first so that only the features are drawn. rot=90 keeps the
# feature names on the x-axis readable.
boxplot = df.drop(columns='y').boxplot(figsize=(12, 6), rot=90)

In [None]:
# Let's look at the distributions of values of our features. Draw a histogram for each of the features

# One 50-bin histogram per column on a 20x15 inch canvas. The returned grid of
# axes is bound to a throwaway name so the notebook does not echo it.
_ = df.hist(figsize=(20, 15), bins=50)

Let's do some multivariate analysis

In [None]:
# Calculate the correlations between each pair of variables
correlations = df.corr()

# Remove the target from BOTH axes in a single call. Dropping the row and the
# column from `correlations` in two separate statements and keeping only the
# second result would leave the 'y' row in the matrix.
correlations_no_output = correlations.drop(index='y', columns='y')

# Plot a heatmap of the correlations between pairs of FEATURES (i.e., don't include the target!)
f, ax = plt.subplots(figsize=(10, 7))
ax = sns.heatmap(correlations_no_output, annot=True, center=0, cmap="YlGnBu", ax=ax)

In [None]:
# What is the highest correlation between features?
# Derive the answer from the data instead of hard-coding it: take the
# feature-only correlation matrix, keep just the cells strictly above the
# diagonal (this excludes the self-correlations of 1.0 and the mirrored
# duplicates), and pick the largest remaining value.
feature_corr = df.drop(columns='y').corr()
above_diagonal = np.triu(np.ones(feature_corr.shape, dtype=bool), k=1)
pair_corr = feature_corr.where(above_diagonal).stack()
feature_a, feature_b = pair_corr.idxmax()
print(f'{feature_a} and {feature_b} are the highest correlation ({pair_corr.max():.2f})')



You should have noticed that some of the features are highly correlated. 

For this reason, we can run into trouble when trying to train a classifier due to multicollinearity.

Let's do PCA on this dataset to remove these correlations

In [None]:
# Let's do PCA:
# HINT: I suggest you attempt Q5 of the quiz and then use the function you developed
#       in this and some of the following cells.

# 1. Standardise your data using a StandardScaler. Make sure you're not scaling the 'y'!

# Separate the features from the label so that the label is never scaled.
df_x = df.drop('y', axis=1)
num_features = list(df_x.columns)

# Every remaining column is scaled, so the StandardScaler is applied to the
# feature frame directly. A ColumnTransformer wrapper adds nothing here, and
# that class is not imported in this notebook.
x_processed = StandardScaler().fit_transform(df_x[num_features])
print(x_processed.shape)

# 2. Do PCA with 2 components. How much variance is retained from the original dataset?

pca_n = PCA(n_components=2)
x_pca_n = pca_n.fit_transform(x_processed)
# Running total of the explained-variance ratios: the last entry is the share
# of variance retained by the two components together.
np.cumsum(pca_n.explained_variance_ratio_)

In [None]:
# Plot your two components, using the label from the data to colour the points

# A single full-size axes is used because only one pair of components (PC 1 vs PC 2) is drawn.
fig, ax = plt.subplots(figsize=(10, 7))
# c=df['y'] colours every sample by its class label.
points = ax.scatter(x_pca_n[:, 0], x_pca_n[:, 1], c=df['y'])
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.set_title('First two principal components, coloured by class label')
# legend_elements() builds one legend entry per distinct colour value.
ax.legend(*points.legend_elements(), title='y')
fig.savefig('students_pca_pcs.pdf', dpi=300)
plt.show()

In [None]:
# How many components do you need to keep at least 85% of the variance in the dataset? (Q6)

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance. Dedicated names are used so that this
# check does not overwrite `pca_n` / `x_pca_n` from the other PCA cells.
pca_85 = PCA(n_components=0.85)
x_pca_85 = pca_85.fit_transform(x_processed)
# State the answer to the question explicitly, then show the running total of variance.
print('components needed for 85% of the variance:', pca_85.n_components_)
np.cumsum(pca_85.explained_variance_ratio_)

In [None]:
# How much variance is retained if we do PCA with 3 components? (Q7)

# Project the standardised features onto three principal components.
# The projection is stored in `x_pca_n`, which the clustering cells read.
pca_n = PCA(n_components=3)
x_pca_n = pca_n.fit_transform(x_processed)
# The last entry of the running sum is the total share of variance retained.
pca_n.explained_variance_ratio_.cumsum()

## Clustering

Let's see how many clusters K-Means finds.



In [None]:
# Use the elbow method to select the number of clusters in your data using 3 PCs.

k_values = range(1, 12)
inertias, sil = [], []
for k in k_values:
    # A fixed seed and an explicit n_init make the curves identical on every
    # run and independent of the library's default number of initialisations.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    y_pred = kmeans.fit_predict(x_pca_n)
    inertias.append(kmeans.inertia_)
    # The silhouette score needs at least two clusters, so it is collected from k=2 onwards.
    if k > 1:
        sil.append(silhouette_score(x_pca_n, y_pred))

# Let's plot inertia vs number of clusters
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, inertias, 'o-')
ax.set_xlabel('k')
ax.set_ylabel('Inertia')
ax.set_title('Elbow method: K-Means inertia vs number of clusters')
fig.savefig('elbow_moons.pdf', dpi=300)
plt.show()

In [None]:
# Now use the silhouette score to choose between the two candidate k values from the previous cell

# `sil` is plotted against k = 2..11, one point per value of k.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(range(2, 12), sil, 'o-')
ax.set_xlabel('k')
ax.set_ylabel('Silhouette Score')
fig.savefig('elbow_moons_sil.pdf', dpi=300)
plt.show()

In [None]:
# Visualise the first two pcs (using a scatterplot) using the labels from KMeans with your chosen value of k

# Chosen number of clusters; it also drives the output filename so the two cannot disagree.
n_clusters = 3
# A fixed seed and an explicit n_init keep the cluster assignment (and the
# arbitrary cluster ids) the same on every run.
kmean = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmean.fit(x_pca_n)

# A single full-size axes is used because only PC 1 vs PC 2 is drawn.
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(x_pca_n[:, 0], x_pca_n[:, 1], c=kmean.labels_)
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.set_title(f'K-Means clusters (k={n_clusters}) on the first two principal components')
fig.savefig(f'students_pca_clusters_kmeans={n_clusters}.pdf', dpi=300)
plt.show()

Now we can try to understand our clusters.

In [None]:
# In the original (before PCA and before standardisation) dataframe, add a column called 'cluster' that contains
# the cluster assignment from above

# assign() returns a new frame, so the raw feature table in `data.data` is left untouched.
original = data.data.assign(cluster=kmean.labels_)
original

In [None]:
# Do a boxplot of each feature, separating between clusters

# Every column except the cluster assignment itself gets its own figure,
# with one box per cluster id.
feature_names = [name for name in original.columns if name != 'cluster']
for feature in feature_names:
    boxplot = original.boxplot(column=feature, by='cluster', figsize=(5, 6))

Write your observations here.
For example: is there a clear difference between the clusters in terms of alcohol content? What about flavanoids?

In [None]:
# Feel free to keep playing with the data here!