# Case study

AI Black Belt - Yellow (May 2019).

---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Load dataframe

In [None]:
# Load the case-study dataset from its CSV file into a DataFrame.
csv_path = "data/day1-case-study.csv"
df = pd.read_csv(csv_path)

In [None]:
# Report how many rows the dataframe contains.
n_rows = df.shape[0]
print("Size of the dataframe:", n_rows)

In [None]:
# Preview the first five rows of the dataframe.
df.head(n=5)

## Understanding our data

In [None]:
# Descriptive statistics for the dataframe's columns.
summary_stats = df.describe()
summary_stats

In [None]:
# Compute the correlation matrix (pairwise correlation between columns)
# and draw it as a heatmap.
corr = df.corr()
plt.figure(figsize=(15, 10))
plt.matshow(corr, fignum=1)
# Label both axes with the column names and add a colorbar; without them the
# colours cannot be mapped back to columns or to correlation values.
tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, corr.columns, rotation=90)
plt.yticks(tick_positions, corr.columns)
plt.colorbar()
plt.show()

## Train Machine learning algorithm

Clustering part: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [None]:
from sklearn.cluster import KMeans

Divide the dataset into 4 different clusters

In [None]:
# Build a K-means model with 4 clusters. K-means starts from randomly chosen
# centroids, so the seed is fixed to keep the cluster assignments (and their
# numbering) reproducible across "Restart & Run All".
kmeans = KMeans(n_clusters=4, random_state=42)

Train the clustering model

In [None]:
# Fit the clustering model on the full dataframe.
kmeans.fit(X=df)

Get predictions of the model

In [None]:
# Cluster index predicted by the fitted model for every row of the dataframe.
predictions = kmeans.predict(X=df)

Retrieve the cluster centers

In [None]:
# Coordinates of the fitted cluster centers.
centers = kmeans.cluster_centers_
centers

In [None]:
# Dimensions of the array holding the cluster centers.
np.shape(kmeans.cluster_centers_)

In [None]:
# Count how many rows were assigned to each cluster.
cluster_sizes = pd.Series(predictions).value_counts()
cluster_sizes

## 2D visualization of the results using Principal Component Analysis (PCA)

Explanation of PCA:

https://blog.bioturing.com/2018/06/14/principal-component-analysis-explained-simply/

https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

In [None]:
# Image comes from https://blog.bioturing.com/2018/06/14/principal-component-analysis-explained-simply/
from IPython.display import Image

# Render the illustrative PCA figure at 400x400.
pca_figure_path = 'figures/day1/PCA.png'
Image(filename=pca_figure_path, width=400, height=400)

### Use PCA in our case

In [None]:
from sklearn.decomposition import PCA

# Keep the two leading principal components so the data can be drawn in 2D.
# The seed is fixed because the default solver selection may pick a randomized
# SVD on larger inputs, which would make the projection vary between runs.
pca = PCA(n_components=2, random_state=42)

Train PCA algorithm

In [None]:
# Fit PCA on the dataframe and project it onto the two principal components,
# then wrap the projected coordinates in a DataFrame with named columns.
pc_columns = ['principal component 1', 'principal component 2']
principalComponents = pca.fit_transform(df)
principalDf = pd.DataFrame(principalComponents, columns=pc_columns)

Add the predictions from our K-means model

In [None]:
# Attach the K-means cluster index of each row as a "label" column.
principalDf = principalDf.assign(label=predictions)

### Plot PCA values in a 2D graph

In [None]:
# Scatter plot of the 2D PCA projection, one colour per K-means cluster.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

# Derive the cluster ids from the data instead of hard-coding 4, so the plot
# stays correct if the number of clusters is changed.
targets = sorted(principalDf['label'].unique())
colors = ['r', 'g', 'b', "m"]
if len(targets) > len(colors):
    # More clusters than hand-picked colours: sample distinct colours from a
    # colormap so that zip() below does not silently drop clusters.
    colors = list(plt.get_cmap('gist_rainbow')(np.linspace(0, 1, len(targets))))

for target, color in zip(targets, colors):
    indicesToKeep = principalDf['label'] == target
    ax.scatter(
        principalDf.loc[indicesToKeep, 'principal component 1'],  # X value
        principalDf.loc[indicesToKeep, 'principal component 2'],  # Y value
        color=color,   # One colour per cluster
        s=10,          # Size of circles
        label=target,  # Explicit legend entry (no reliance on handle order)
    )
ax.legend(title='Cluster')
ax.grid()
plt.show()

## The elbow method

Compute the mean Euclidean distance from each point to its nearest cluster center, for different numbers of clusters.

Choose the number of clusters at which the curve shows an "elbow".

In [None]:
from scipy.spatial.distance import cdist

# For each candidate number of clusters k, fit K-means and record the
# distortion: the mean Euclidean distance from every sample to its closest
# cluster center.
distortions = []
X = np.array(df)
for k in range(1, 71, 3):
    print(".", end='')  # progress indicator, one dot per fitted model
    # Fixed seed so the elbow curve is reproducible between runs.
    kmeanModel = KMeans(n_clusters=k, random_state=42)
    kmeanModel.fit(X)
    # Distance of each sample to its nearest center, then a vectorised mean
    # (instead of the Python built-in sum over a NumPy array).
    nearest_center_dist = np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)
    distortions.append(nearest_center_dist.mean())

In [None]:
# Plot the distortion against the number of clusters k to look for the elbow.
k_values = range(1, 71, 3)

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(k_values, distortions, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method showing the optimal k')
plt.show()