# Module 4 Unit 2
## Fitting neural networks in Python

### Classification problem

In [None]:
# Import libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np

Now that the required libraries have been imported, it is time to import and clean the data.

In [None]:
# Load the data.
# "?" marks a missing entry in this file, so it is parsed as NaN while reading.
# Columns that contain "?" can then be read as numbers instead of falling back
# to object (string) dtype, which keeps later numeric operations well defined.
df = pd.read_csv('WISDM_transformed.csv', delimiter=",", na_values="?")

# Report how many entries are missing, then drop every row that contains one
print("Number of null values in the data set = %s" % df.isnull().sum().sum())
df = df.dropna()

In [None]:
# Explore the size of the data set: df.shape is a (rows, columns) tuple
df.shape

In [None]:
# Inspect the feature names and the kind of values they hold.
# df.head() / df.tail() only show the first / last few records; when the data
# is sorted, a random sample gives a more representative picture.
# The fixed random_state makes the same ten rows appear on every run.
df.sample(n=10, random_state=0)

In [None]:
# List every distinct label found in the class column
class_labels = df['class'].unique()
print(class_labels)

For the features, choose all the columns except the `id`, `UNIQUE_ID`, `user`, and `class` variables. The `class` variable will be the response variable.

In [None]:
# Features (X): the columns at positions 3 to 45 (the slice end is exclusive)
X = df.iloc[:, 3:46]
# Response (y): the class label, kept as a one-column DataFrame for now
y = df[["class"]]

In [None]:
# Turn the one-column response into a contiguous, flattened 1-D NumPy array
y = np.asanyarray(y).ravel()

In [None]:
# Hold out part of the data for testing.
# test_size=0.25 spells out the scikit-learn default used when no size is given;
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

Note that the features have widely varying scales: `X0` is a proportion between 0 and 1,
`YPeak` can be in the thousands, and `ZSTANDDEV` is a standard deviation.
Since neural networks are much more sensitive to features with high (absolute) values,
you should scale all your input data to have a mean of 0 and a standard deviation of 1.

In [None]:
# Standardise the features.
# The scaler learns its statistics from the training set only, so nothing
# about the test set leaks into the preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply that same training-set transformation to both partitions.
# NOTE: X_train and X_test are overwritten here, so re-running this cell on its
# own would refit the scaler on already-scaled data; re-run from the split cell.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Next, you fit an MLP classifier with two hidden layers of 5 nodes each.

In [None]:
# Baseline model: two hidden layers with 5 nodes each.
# random_state fixes the random initialisation so the result is reproducible.
reg = MLPClassifier(max_iter=2000, hidden_layer_sizes=(5, 5), random_state=1)
reg.fit(X_train, y_train)

# Predict the classes of the held-out test set
y_pred = reg.predict(X_test)

# Accuracy before model parameter optimisation.
# accuracy_score is declared as (y_true, y_pred); passing the arguments by name
# keeps the true labels and the predictions in their documented roles.
accuracy_score(y_true=y_test, y_pred=y_pred)

Can you get a better accuracy? More nodes on each layer could lead to overfitting.
Fewer nodes could make the model too simple to capture the patterns in the data (underfitting).
Use cross-validation to find the optimal number of nodes per layer.

In [None]:
# Cross-validate every combination of 3 to 6 nodes on the two hidden layers
# and record the mean validation score of each combination.
# Note this will take some time: 16 configurations are fitted, 2 folds each.
validation_scores = {}
print("Nodes |Validation")
print("      | score")

node_counts = range(3, 7)
for first_layer in node_counts:
    for second_layer in node_counts:
        layer_sizes = (first_layer, second_layer)

        candidate = MLPClassifier(
            hidden_layer_sizes=layer_sizes, max_iter=2000, random_state=1
        )

        # Only the training data is used; the test set stays untouched
        fold_scores = cross_val_score(estimator=candidate, X=X_train, y=y_train, cv=2)
        validation_scores[layer_sizes] = fold_scores.mean()
        print(layer_sizes, ": %0.5f" % validation_scores[layer_sizes])

In [None]:
# Visualise the validation scores using a 3D surface plot
from mpl_toolkits.mplot3d import Axes3D  # importing this registers the '3d' projection on older Matplotlib
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter

fig = plt.figure()
# Passing keyword arguments to Figure.gca() was deprecated in Matplotlib 3.4
# and later removed, so the 3D axes are created with add_subplot() instead.
ax = fig.add_subplot(111, projection='3d')

# Prepare the data.
# px holds the first-layer node count and py the second-layer node count of
# each grid point; pz is indexed the same way, i.e. pz[r][c] is the score
# of the configuration (px[r][c], py[r][c]).
px, py = np.meshgrid(np.arange(3,7), np.arange(3,7))
pz = np.array([[validation_scores[(i,j)] for i in range(3,7)] for j in range(3,7)])

# Customize the z-axis: keep the 0.76-0.81 window, but widen it when a score
# falls outside so that the surface is never clipped
ax.set_zlim(min(0.76, pz.min()), max(0.81, pz.max()))

# Label the axes so that the figure can be read on its own
ax.set_xlabel("Nodes in first hidden layer")
ax.set_ylabel("Nodes in second hidden layer")
ax.set_zlabel("Validation score")

# Plot the surface
surf = ax.plot_surface(px, py, pz)
plt.show()

In [None]:
# Check scores.
# max() over the dict with key=validation_scores.get returns the configuration
# with the highest score in a single pass; on ties the first one inserted wins,
# which is the same entry the original "[...][0]" selection picked.
optimal_hidden_layer_size = max(validation_scores, key=validation_scores.get)
print("The highest validation score is: %0.4f" % validation_scores[optimal_hidden_layer_size])
print("This corresponds to nodes", optimal_hidden_layer_size )

In [None]:
# Refit on the whole training set with the layer sizes chosen by cross-validation.
# max_iter is set explicitly: the fit does not converge fully without changing it.
clf = MLPClassifier(
    hidden_layer_sizes=optimal_hidden_layer_size,
    random_state=1,
    max_iter=2000,
)
clf.fit(X_train, y_train)

In [None]:
# Predict the classes of the held-out test set with the tuned model
y_pred = clf.predict(X_test)

# Accuracy of the tuned model.
# accuracy_score is declared as (y_true, y_pred); passing the arguments by name
# keeps the true labels and the predictions in their documented roles.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Draw a response curve: how does the predicted class change as RESULTANT
# varies while every other feature is held at its median value?

# Work on a float copy so that the medians and the min/max below are numeric
# even if a column was loaded with object dtype; X itself is left unchanged
X_design = X.astype(float)

# One-row frame holding the median of every feature, in the column order of X
X_design_vec = X_design.median().to_frame().transpose()

# Evenly spaced RESULTANT values spanning the observed range
min_resultant = X_design["RESULTANT"].min()
max_resultant = X_design["RESULTANT"].max()
seq = np.linspace(start=min_resultant, stop=max_resultant, num=50)

# One row per RESULTANT value: repeat the median row, then overwrite the
# RESULTANT column in a single assignment
to_predict = pd.concat([X_design_vec] * len(seq), ignore_index=True)
to_predict["RESULTANT"] = seq

# Scale with the scaler fitted on the training data, then predict
to_predict = scaler.transform(to_predict)
predictions = clf.predict(to_predict)

# Plot the predicted class against the RESULTANT value
fig, ax = plt.subplots()
ax.plot(seq, predictions)
ax.set_xlabel("Resultant")
ax.set_ylabel("Class")
ax.set_title("Response vs Resultant")
plt.show()

The predicted class is plotted against `RESULTANT`, the average resultant acceleration, while all other features are held at their median values. Does this make intuitive sense?

Continue to the small group discussion to discuss the outcomes of this model.