# Import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the seaborn theme before overriding individual matplotlib settings below.
sns.set(style="darkgrid")
from IPython.display import display
# Notebook-wide matplotlib defaults: figure size plus legend/axes font sizes.
params = {
    'figure.figsize': (12, 9),
    'axes.titlesize': 16,
    'axes.labelsize': 16,
    'legend.fontsize': 16,
    'legend.handlelength': 2,
}
plt.rcParams.update(params)
# Keras models and layers
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout
# Convenience Libraries
from functools import partial
from sklearn.metrics import roc_curve, auc

In [None]:
import plotly
import plotly.offline as py
from plotly.offline import init_notebook_mode, download_plotlyjs
import plotly.graph_objs as go
import cufflinks as cf
# Enable plotly rendering inside the notebook.
init_notebook_mode(connected=True)
# SECURITY: never commit plotly credentials to the notebook. If online mode is ever
# needed, read the username/API key from environment variables, and rotate any key
# that was previously published here.
# Configure cufflinks for offline plotting with the 'pearl' theme.
cf.set_config_file(offline=True, world_readable=True, theme='pearl')

# Import MNIST hand-written digits database

In [None]:
# Load MNIST; the data is returned already split into training and test sets,
# each as an (images, labels) pair.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Check the priors for the 10 categories.

In [None]:
# Count how many training samples belong to each class (the class priors).
# np.unique returns the sorted distinct labels together with their counts, which
# avoids building 60000 Python tuples just to run a groupby over them.
class_labels, class_counts = np.unique(y_train, return_counts=True)
# Same layout as before: 'x' holds the class label, 'y' the number of samples.
y_df = pd.DataFrame({'x': class_labels, 'y': class_counts}, columns=['x', 'y'])

# A plain list of traces replaces the deprecated go.Data wrapper.
data = [go.Bar(x=y_df.x, y=y_df.y, orientation='v')]
layout = go.Layout(
    xaxis=dict(title='Classes'),
    yaxis=dict(title='Count'),
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

# Check the data shape and visualize it as an image

In [None]:
# Single-argument print(...) behaves the same on Python 2 and Python 3,
# unlike the Python 2-only print statement.
print("Shape of training set: {}".format(x_train.shape))

The images are provided as `28 x 28` px arrays, and there are 60000 samples in the training set.

In [None]:
# Pre-bind the gray colormap so an image can be shown with a single call.
plot_gray_image = partial(plt.imshow, cmap='gray')
# Show the first training sample.
plot_gray_image(x_train[0])

### Reshape each `28 x 28` px image by flattening it to an array of 784 values, and convert the pixel values to float.

In [None]:
# Flatten every image into one feature vector per sample and convert to float32.
# -1 lets NumPy infer the flattened length (784 for 28 x 28 images) instead of
# hard-coding it, so the cell also works for other image sizes.
x_train = x_train.reshape(x_train.shape[0], -1).astype('float32')
x_test = x_test.reshape(x_test.shape[0], -1).astype('float32')
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Shape of training set after reshaping: {}".format(x_train.shape))

### Normalize the pixel values between 0 and 1

In [None]:
# Scale pixel intensities to the [0, 1] range (assumes raw intensities in [0, 255]).
# The division is in place, so re-running this cell would otherwise shrink the data
# a second time without any error; the guards make the cell safe to re-run.
if x_train.max() > 1.0:
    x_train /= 255.0
if x_test.max() > 1.0:
    x_test /= 255.0

### Convert `y` categorical values to one-hot vector

In [None]:
num_classes = 10
# Turn the integer class vectors into binary (one-hot) class matrices,
# applying the same conversion to the training and the test targets.
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)

# Building the model

In [None]:
# One fully connected hidden layer (256 ReLU units) followed by a softmax output
# layer with one unit per class; the input width is taken from the training data.
model = Sequential([
    Dense(256, activation='relu', input_shape=(x_train.shape[1],), name="Hidden1"),
    Dense(num_classes, activation='softmax', name='Softmax_output'),
])
model.summary()

# Compile model

In [None]:
# Categorical cross-entropy matches the one-hot encoded targets; accuracy is
# tracked as an additional metric during training and evaluation.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train model

In [None]:
# Train for 5 epochs in mini-batches of 100 samples, holding out 10% of the
# training data for validation. The returned object's `.history` is used in the
# next section to plot the accuracy curves.
history = model.fit(x_train, y_train,
                    validation_split=0.1,
                    batch_size=100,
                    epochs=5,
                    verbose=1)

# Plot training and validation accuracy

In [None]:
history_df = pd.DataFrame.from_dict(history.history)
# Depending on the Keras version the metric is reported as 'acc' or 'accuracy';
# pick whichever key is present so the cell does not fail with a KeyError.
acc_key = 'acc' if 'acc' in history_df.columns else 'accuracy'
ax = history_df[[acc_key, 'val_' + acc_key]].plot()
# Label the figure so it can be read on its own; the trailing ';' hides the repr.
ax.set(xlabel='Epoch', ylabel='Accuracy', title='Training vs. validation accuracy');

# Evaluate

In [None]:
# Evaluate on the held-out test set; score[0] is the loss and score[1] the
# accuracy metric requested at compile time.
score = model.evaluate(x_test, y_test, verbose=0)
# Format into a single string so the output is identical on Python 2 and 3
# (a multi-argument print(...) prints a tuple under Python 2).
print('Test loss: {}'.format(score[0]))
print('Test accuracy: {}'.format(score[1]))

# Statistics

In [None]:
# Predicted class probabilities for every test sample (one column per class).
y_pred = model.predict(x_test)

# Build one-vs-rest ROC statistics for every class. Iterating over num_classes
# (rather than a hard-coded 10) keeps this cell correct when classes change.
stats_list = []
for class_idx in range(num_classes):
    # ROC curve of "this class vs. all others" from the one-hot targets.
    fpr, tpr, _ = roc_curve(y_test[:, class_idx], y_pred[:, class_idx])
    # The scalar AUC and class index are broadcast to every point of the curve.
    stats_list.append(pd.DataFrame({'fpr': fpr,
                                    'tpr': tpr,
                                    'auc': auc(fpr, tpr),
                                    'class': class_idx},
                                   columns=['fpr', 'tpr', 'auc', 'class']))
# Long-format table: one row per ROC point, tagged with its class and AUC.
stats = pd.concat(stats_list, ignore_index=True)

In [None]:
# Preview the first rows of the per-class ROC table.
stats.head()

In [None]:
# One ROC trace per class. Grouping by the scalar key 'class' (not a one-element
# list) keeps `key` a plain integer across pandas versions.
data = [
    go.Scatter(x=grp.fpr, y=grp.tpr, name='class %d' % key)
    for key, grp in stats.groupby('class')
]
# Edit the layout; the x-axis is zoomed to FPR in [0, 0.2], where the curves differ.
layout = dict(title='Receiver Operating Characteristic',
              xaxis=dict(title='False Positive Rate',
                         range=[0, 0.2]),
              yaxis=dict(title='True Positive Rate'))

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='styled-line')

In [None]:
# AUC per class: select the 'auc' column first, then average within each class.
stats.groupby('class')['auc'].mean()

---
# Assignment

Do the following for each of the questions below:
```
Plot the ROC curves and display the AUC for the modified dataset. Write a short blurb on the changes you observe in the ROC curves and AUC.
```

0. Make the model overfit, where you observe training accuracy increase while your test/validation accuracy continuously decreases per epoch. You can change hyperparameters/parameters/model/data etc. Then add `dropout` layers to the model to see if it helps to prevent overfitting. Call this `model0`
1. Remove half the samples of one of the classes (your choice) from the training set, thereby changing the priors, while keeping all the samples of that class "as is" in the test set. (Make sure that when you take out the image data from `X_train`, you also remove the corresponding `y_train` targets.) Call this model `model1`.
2. Now remove all the image data from a single class from the training set, while still keeping the class in the test dataset. Fit, evaluate, generate statistics on this model. Call this model `model2`.
3. Now remove all the samples of the class removed in `step 2` from the test dataset and then calculate the statistics. How do the new statistics differ from the statistics in `step 2`?
4. Change the target in the output layer from a one-hot encoded vector to integer values. NOTE: to successfully train, you will need to change the model. Write about what you changed and why the change was necessary. Call this `model3`.
5. Reduce the size of the dataset, while keeping the percentage of priors more-or-less constant. Call this `model4`

```
NOTE: For each question create a new section in the jupyter notebook.
```

# Extra Credit
1. Modify the model/do hyperparameter search to get the best accuracy for this dataset. You can get really creative with the model.