# 1. Loading the Data

In [None]:
# Import the necessary libraries.
# NOTE(review): `math` does not appear to be used in the visible cells.
import math

# Libraries for making plots.
# IPython magic: render matplotlib figures inline, below the cell that draws them.
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Libraries for calculations
import pandas as pd
import numpy as np
# Seed NumPy's global random number generator so that code drawing from it
# produces repeatable results from run to run.
np.random.seed(2600)

In [None]:
# Location of the CSV on GitHub; the `?raw=true` query requests the raw file
# rather than the GitHub web page.
DATA_URL = "https://github.com/DecodedCo/ibm-advanced-analytics/blob/master/data.csv?raw=true"

# Read the remote CSV into a DataFrame.
data = pd.read_csv(DATA_URL)

In [None]:
# (rows, columns) of the loaded table.
# Uses the print() call form so the cell also runs on Python 3.
print(data.shape)

In [None]:
# First rows of the table, preceded by a blank line.
# The newline is formatted into the string (instead of being passed as a
# separate print argument) so that no separator space is inserted before the
# header row, and so the cell runs on Python 3.
print('\n{}'.format(data.head()))

In [None]:
# Show all the available columns.
# Uses the print() call form so the cell also runs on Python 3.
print(data.columns)

In [None]:
# The `failures` column is the variable we are interested in; its summary
# statistics give a basic understanding of how it is distributed.
# Uses the print() call form so the cell also runs on Python 3.
print(data.failures.describe())

<br/>
<br/>
<br/>
<br/>
<br/>
---
# 2.1 Exploratory Visualisation

In [None]:
SELECTED_COLUMN = 'Vacuum'


# Plotting code.
# Two histograms of the selected column, side by side: rows with
# failures == 0 on the left, rows with failures == 1 (dark red) on the right.
# Both panels get the same x-range, taken from the full column, so the two
# distributions can be compared directly.
column_values = data[SELECTED_COLUMN]
shared_xlim = [column_values.min(), column_values.max()]

plt.figure(figsize=(15,5))
ax1 = plt.subplot(1, 2, 1)
column_values[data.failures == 0].hist(ax=ax1)
ax1.set_xlim(shared_xlim)
ax2 = plt.subplot(1, 2, 2)
column_values[data.failures == 1].hist(ax=ax2, color='darkred')
ax2.set_xlim(shared_xlim)

<br/>
<br/>
<br/>
<br/>
<br/>
---
# 2.2 Exploratory Visualisation

In [None]:
VARIABLE_ONE = 'Temp-1'
VARIABLE_TWO = 'Vacuum'
COLOR_VARIABLE = 'Humidity-1'


# Scatter of two variables for every row, coloured by a third variable, with
# the rows where failures == 1 drawn on top as unfilled, black-edged markers.
all_rows_style = dict(s=12, c=COLOR_VARIABLE, alpha=0.2, cmap=plt.get_cmap('coolwarm'))
failure_style = dict(s=24, alpha=0.9, c='none', edgecolors='black')
failed_rows = data[data.failures == 1]

plt.figure(figsize=(12,8))
ax1 = plt.subplot(1, 1, 1)
data.plot.scatter(x=VARIABLE_ONE, y=VARIABLE_TWO, ax=ax1, **all_rows_style)
failed_rows.plot.scatter(x=VARIABLE_ONE, y=VARIABLE_TWO, ax=ax1, **failure_style)

<br/>
<br/>
<br/>
<br/>
<br/>
---
# 2.3 Exploratory Visualisation

In [None]:
# Pairwise correlations between the columns of `data`, drawn as a
# hierarchically clustered heat map.
correlation_matrix = data.corr()
sns.clustermap(correlation_matrix)

<br/>
<br/>
<br/>
<br/>
<br/>
---
# 3.1 Dimensionality Reduction: PCA

In [None]:
# Turn the table into NumPy arrays that the models can work with:
#   X - every column except the first one
#   y - the `failures` column
# NOTE(review): this presumes `failures` is the first column — confirm.
X = data.iloc[:, 1:].values
y = data['failures'].values

In [None]:
# Load the PCA Library. This one is available on scikit-learn
from sklearn.decomposition import PCA

# Build a 3-component PCA and fit it on the feature matrix.
pca = PCA(n_components=3).fit(X)

# Project the original data onto the fitted components.
X_pca = pca.transform(X)

# Store each component as its own column in the original data-frame.
for component_index, column_name in enumerate(['pca-1', 'pca-2', 'pca-3']):
    data[column_name] = X_pca[:, component_index]

In [None]:
SELECTED_COLUMN_FOR_COLOR = 'Vacuum'


# Two views of the PCA projection (component 1 vs 2, and component 1 vs 3).
# Every row is coloured by the selected column; rows where failures == 1 are
# drawn on top as unfilled, black-edged markers.
all_rows_style = dict(c=SELECTED_COLUMN_FOR_COLOR, s=12, alpha=0.2, cmap=plt.get_cmap('coolwarm'))
failure_style = dict(s=24, alpha=0.9, c='none', edgecolors='black')
failed_rows = data[data.failures == 1]

plt.figure(figsize=(16,5))
ax1 = plt.subplot(1, 2, 1)
data.plot.scatter(x='pca-1', y='pca-2', ax=ax1, **all_rows_style)
failed_rows.plot.scatter(x='pca-1', y='pca-2', ax=ax1, **failure_style)
ax2 = plt.subplot(1, 2, 2)
data.plot.scatter(x='pca-1', y='pca-3', ax=ax2, **all_rows_style)
failed_rows.plot.scatter(x='pca-1', y='pca-3', ax=ax2, **failure_style)

<br/>
<br/>
<br/>
<br/>
<br/>
---
# 3.2 Dimensionality Reduction: t-SNE

In [None]:
import time

from sklearn.manifold import TSNE

# Embed the feature matrix into two dimensions with t-SNE.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
# switch the keyword if this line raises a deprecation warning or TypeError.
tsne = TSNE(n_components=2, verbose=1, perplexity=100, n_iter=500)

# Start the clock right before the fit so that only the embedding itself is
# timed (not the library import above).
time_start = time.time()
tsne_results = tsne.fit_transform(X)

# Uses the print() call form so the cell also runs on Python 3.
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [None]:
# Keep the two t-SNE coordinates as columns of the data-frame for plotting.
data['tsne-1.1'] = tsne_results[:, 0]
data['tsne-2.1'] = tsne_results[:, 1]

In [None]:
SELECTED_COLUMN_FOR_COLOR = 'Vacuum'


# The t-SNE embedding for every row, coloured by the selected column, with
# the rows where failures == 1 drawn on top as unfilled, black-edged markers.
all_rows_style = dict(s=10, c=SELECTED_COLUMN_FOR_COLOR, alpha=0.2, cmap=plt.get_cmap('coolwarm'))
failure_style = dict(s=24, alpha=0.9, c='none', edgecolors='black')
failed_rows = data[data.failures == 1]

plt.figure(figsize=(12,8))
ax1 = plt.subplot(1, 1, 1)
data.plot.scatter(x='tsne-1.1', y='tsne-2.1', ax=ax1, **all_rows_style)
failed_rows.plot.scatter(x='tsne-1.1', y='tsne-2.1', ax=ax1, **failure_style)

<br/>
<br/>
<br/>
<br/>
<br/>
---
# 4.1 Models: Decision Tree

In [None]:
from sklearn import tree
from sklearn.metrics import accuracy_score

# Create and fit the decision tree (at most 4 levels deep).
clf = tree.DecisionTreeClassifier(max_depth=4)
clf = clf.fit(X, y)

# Get the predicted values
y_pred = clf.predict(X)

# How good is it? Let's check the accuracy.
# Note: predictions are made on the same rows the tree was fitted on, so this
# is the accuracy on the training data.
accuracy = accuracy_score(y, y_pred)
# Uses the print() call form so the cell also runs on Python 3.
print('Accuracy: {}'.format(accuracy))

In [None]:
from sklearn.metrics import confusion_matrix

# Count matrix of true class versus predicted class, with both axes in the
# order given by `class_labels`.
class_labels = [0, 1]
cnf_matrix = confusion_matrix(y, y_pred, labels=class_labels)

# Optional: uncomment to turn the counts into per-row fractions.
# cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]

# Annotated heat map of the integer counts.
heatmap_options = dict(annot=True, fmt="d", linewidths=.5)
sns.heatmap(cnf_matrix, **heatmap_options)
plt.ylabel('True label')
plt.xlabel('Predicted label')

In [None]:
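# Optional: export the fitted decision tree with graphviz and render it to a
# file. Requires the `graphviz` package; uncomment the lines below to use it.
# import graphviz

# dot_data = tree.export_graphviz(clf, feature_names=data.columns[1:25], out_file=None)
# graph = graphviz.Source(dot_data)
# graph.render("tree-visualisation")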

<br/>
<br/>
<br/>
<br/>
<br/>
---
# 4.2 Models: Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import StandardScaler

# Standardise the features before training the network. The result is kept
# under its own name instead of overwriting `X`, so re-running this cell (or
# an earlier cell that uses `X`) always starts from the same unscaled matrix.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Multi-layer perceptron with two hidden layers (50 and 20 units).
nn_clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(50, 20), max_iter=1000)
nn_clf.fit(X_scaled, y)

# Get the predicted values
y_pred = nn_clf.predict(X_scaled)

# How good is it? Let's check the accuracy.
# Note: predictions are made on the same rows the network was fitted on, so
# this is the accuracy on the training data.
accuracy = accuracy_score(y, y_pred)

# Uses the print() call form so the cell also runs on Python 3.
print('Accuracy: {}'.format(accuracy))

<br/>
<br/>
<br/>
<br/>
<br/>
---
# 5. DEMO

In [None]:
# Imported here so the DEMO section carries its own dependencies, as its
# model cells do.
from sklearn.preprocessing import StandardScaler

# Rebuild the feature matrix and the target from the data-frame.
# Earlier cells append derived 'pca-*' and 'tsne-*' columns to `data`; those
# are left out so the demo models are trained on the original columns only.
DERIVED_COLUMN_PREFIXES = ('pca-', 'tsne-')
feature_columns = [
    column for column in data.columns[1:]
    if not str(column).startswith(DERIVED_COLUMN_PREFIXES)
]
X_raw = data[feature_columns].values
y = data.failures.values

# Standardise the features. `X` (used by the following cells) is derived from
# `X_raw` rather than from itself, so the cell can be re-run safely.
scaler = StandardScaler().fit(X_raw)
X = scaler.transform(X_raw)

In [None]:
from sklearn import tree
# Imported here as well so this DEMO cell carries all of its dependencies.
from sklearn.metrics import accuracy_score

# Create and fit the decision tree (at most 4 levels deep).
clf = tree.DecisionTreeClassifier(max_depth=4)
clf = clf.fit(X, y)

# Get the predicted values
y_pred = clf.predict(X)

# How good is it? Let's check the accuracy.
# Note: predictions are made on the same rows the tree was fitted on, so this
# is the accuracy on the training data.
accuracy = accuracy_score(y, y_pred)

# Uses the print() call form so the cell also runs on Python 3.
print('Accuracy: {}'.format(accuracy))

In [None]:
from sklearn.neural_network import MLPClassifier
# Imported here as well so this DEMO cell carries all of its dependencies.
from sklearn.metrics import accuracy_score

# Multi-layer perceptron with two hidden layers (50 and 20 units).
nn_clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(50, 20), max_iter=1000)
nn_clf.fit(X, y)

# Get the predicted values
y_pred = nn_clf.predict(X)

# How good is it? Let's check the accuracy.
# Note: predictions are made on the same rows the network was fitted on, so
# this is the accuracy on the training data.
accuracy = accuracy_score(y, y_pred)

# Uses the print() call form so the cell also runs on Python 3.
print('Accuracy: {}'.format(accuracy))