# Data Visualization and Manipulation

In [None]:
# Standard library
import sys
# Numerical stack
import numpy as np
from numpy import set_printoptions
# Print NumPy floats with three digits of precision
set_printoptions(precision=3)
import pandas as pd
# Plotting
import matplotlib.pyplot as plt
# Default size (width, height) applied to every matplotlib figure
plt.rcParams["figure.figsize"] = (16,12)
import seaborn as sns
# Explicit display() lets one cell render several objects
from IPython.display import display

# Load diabetes data from CSV file

In [None]:
# Location of the Pima Indians diabetes CSV, relative to the notebook
filename = "../data/pima-indians-diabetes.data.csv"
# Column names are supplied explicitly; the last column is the class label
columns = [
    'pregnant',
    'plasma_glucose',
    'blood_pressure',
    'skin_fold',
    'serum_insulin',
    'bmi',
    'pedigree',
    'age',
    'class',
]
# Passing names= already implies header=None; it is spelled out for clarity
data = pd.read_csv(filename, header=None, names=columns)

In [None]:
# Show the first rows and a reproducible random sample of five rows
display(data.head())
display(data.sample(5, random_state=1))
# print() with a single argument behaves identically on Python 2 and Python 3,
# unlike the Python 2-only print statement
print("Data Rows: {}, Cols: {}".format(data.shape[0], data.shape[1]))

In [None]:
# Print a summary of the frame: column dtypes and non-null counts
data.info()

In [None]:
# Descriptive statistics per column; the bare expression is the cell output
data.describe()

# `groupby` operation to aggregate rows by group

In [None]:
# Build the grouping once and reuse it for both aggregations
by_class = data.groupby('class')
for aggregated in (by_class.sum(), by_class.mean()):
    display(aggregated)

# `apply` to manipulate columns

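You can pass any function to `apply`. The return type depends on whether that function aggregates: an aggregating function (e.g. `np.mean`) yields one value per column, while an element-wise function (e.g. `np.sin`) yields a frame of the same shape.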

In [None]:
# np.mean is applied to each column as a whole
column_means = data.apply(np.mean)
display(column_means)
# np.sin is applied to each column too; only the first rows are shown
sine_values = data.apply(np.sin)
display(sine_values.head())

# Correlation

In [None]:
# Pairwise Pearson correlation between all columns
corr = data.corr(method='pearson')
# Bare expression so the notebook renders the correlation matrix
corr

In [None]:
# Cubehelix colormap; annot=True writes each coefficient inside its cell
palette = sns.cubehelix_palette(as_cmap=True)
sns.heatmap(corr, cmap=palette, annot=True)

# Skewness

In [None]:
# Skewness per column: positive means a longer right tail, negative a longer left tail
data.skew()

In [None]:
# Right Skew
# Histogram with a density estimate of the 'pedigree' column.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (histplot/displot
# replace it) -- confirm the installed seaborn version before changing this call.
sns.distplot(data['pedigree'])

In [None]:
# Left Skew
# Histogram with a density estimate of the 'blood_pressure' column.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (histplot/displot
# replace it) -- confirm the installed seaborn version before changing this call.
sns.distplot(data['blood_pressure'])

In [None]:
# Three features plus the label, which is used to colour the points
pair_columns = ['plasma_glucose', 'blood_pressure', 'serum_insulin', 'class']
sns.pairplot(data[pair_columns], hue="class")

# Feature Selection

### Univariate Selection

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
array = data.values
# The first eight columns are the features, the ninth is the class label
X, Y = array[:, :8], array[:, 8]
# Score every feature against the label with the chi-squared statistic
test = SelectKBest(score_func=chi2, k=3)
fit = test.fit(X, Y)
print(fit.scores_)

### Recursive Feature Elimination

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [None]:
model = LogisticRegression() # Model does not matter much
# Select top 5 features.
# n_features_to_select is passed by keyword: newer scikit-learn releases no
# longer accept it positionally, and the keyword form works on old ones too.
rfe = RFE(model, n_features_to_select=5)
fit = rfe.fit(X, Y)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

### Feature importance using Extra-Trees (a randomized tree ensemble, similar to a random forest)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# fit() returns the estimator itself, so construction and fitting are chained
model = ExtraTreesClassifier().fit(X, Y)
# One importance score per feature column of X
print(model.feature_importances_)

# Statistical Learning Techniques

### Extratrees Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
split = train_test_split(X, Y, test_size=0.2, random_state=1234)
X_train, X_test, Y_train, Y_test = split
# model = LogisticRegression()
model = ExtraTreesClassifier(max_depth=100).fit(X_train, Y_train)
# score() returns the mean accuracy on the held-out rows
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result*100.0))

---
# Basic Neural Network

## Cross-entropy Loss or Negative Log Likelihood

$$L = -\frac{1}{N}\sum_{n=1}^{N}\sum_{i=1}^{C} y_{n,i} \log \hat{y}_{n,i}$$


### Gradients

---

$$\delta_{2} = \hat{y} - y$$

$$\delta_{1} = \delta_{2} W_{2}^{T}\odot a_{1} \odot (1 - a_{1})$$

---

$$ \frac{\partial L}{\partial W_{2}} = a^{T}_{1} \delta_{2}$$

$$ \frac{\partial L}{\partial b_{2}} = \delta_{2}$$

$$ \frac{\partial L}{\partial W_{1}} = x^{T} \delta_{1}$$

$$ \frac{\partial L}{\partial b_{1}} = \delta_{1}$$

In [None]:
class NeuralNetwork(object):
    """Single-hidden-layer classifier trained with plain gradient descent.

    The network is ``input -> sigmoid hidden layer -> softmax output`` and is
    fitted against the cross-entropy loss, optionally with L2 weight decay on
    the two weight matrices (biases are never regularised).
    """

    def __init__(self, n_features=10, n_output=10, n_hidden=100,
                 learning_rate=0.001, reg_lambda=None):
        """Create a network with normally distributed random parameters.

        Args:
            n_features: number of input features per record.
            n_output: number of output classes.
            n_hidden: number of units in the hidden layer.
            learning_rate: step size of each gradient-descent update.
            reg_lambda: L2 regularisation strength; ``None`` or ``0`` disables it.
        """
        ### Network Dimensions
        self.n_output = n_output
        self.n_features = n_features
        self.n_hidden = n_hidden

        ### Initialize weights
        self.w_hid = np.random.randn(self.n_features, self.n_hidden)
        self.b_hid = np.random.randn(self.n_hidden)
        self.w_out = np.random.randn(self.n_hidden, self.n_output)
        self.b_out = np.random.randn(self.n_output)

        ### Hyper parameters
        self.learning_rate = learning_rate
        self.reg_lambda = reg_lambda

    def _one_hotize(self, y, k):
        """Return a ``(k, n)`` one-hot matrix for the ``n`` class labels in ``y``.

        ``y`` may be a scalar or an array of any shape; it is flattened and
        cast to int so that float-typed labels such as ``1.0`` are accepted.
        """
        labels = np.asarray(y).ravel().astype(int)
        onehot = np.zeros((k, labels.size))
        # Column j gets a single 1.0 in the row named by label j
        onehot[labels, np.arange(labels.size)] = 1.0
        return onehot

    def _sigmoid(self, x):
        """Element-wise logistic function."""
        return 1 / (1 + np.exp(-x))

    def _softmax(self, x):
        """Row-wise softmax of a 2-d array.

        The row maximum is subtracted first; this leaves the result unchanged
        mathematically but keeps ``np.exp`` from overflowing on large logits.
        """
        shifted = x - x.max(axis=1, keepdims=True)
        exp_x = np.exp(shifted)
        return exp_x / exp_x.sum(axis=1, keepdims=True)

    def _forward(self, inputs):
        """Return ``(hidden activations, class probabilities)`` for 2-d ``inputs``."""
        A = self._sigmoid(inputs.dot(self.w_hid) + self.b_hid)
        Y = self._softmax(A.dot(self.w_out) + self.b_out)
        return A, Y

    def train(self, input_list, target_list):
        """Run one gradient-descent step and return the cross-entropy loss.

        Args:
            input_list: one record (1-d) or a batch of records (2-d).
            target_list: the integer class label(s) matching ``input_list``.

        Returns:
            The mean cross-entropy (non-negative) measured before the update.
        """
        ### Convert inputs list to 2d array
        inputs = np.array(input_list, ndmin=2)
        targets = self._one_hotize(target_list, self.n_output).T

        ### Forward
        A, Y = self._forward(inputs)

        ### Calculate Loss ###
        # Negative log likelihood; probabilities are clipped so that an
        # underflowed 0.0 cannot turn the loss into inf/nan.
        loss = -np.sum(targets * np.log(np.clip(Y, 1e-12, 1.0))) / len(targets)

        ### Backward
        # Total Error
        delta2 = Y - targets
        delta1 = delta2.dot(self.w_out.T) * A * (1 - A)

        dw_out = A.T.dot(delta2)
        db_out = delta2.sum(axis=0)
        dw_hid = inputs.T.dot(delta1)
        db_hid = delta1.sum(axis=0)

        ### Add L2 regularization terms (b1 and b2 don't have regularization terms)
        # Use the instance's own setting rather than a module-level name
        if self.reg_lambda:
            dw_out += self.reg_lambda * self.w_out
            dw_hid += self.reg_lambda * self.w_hid

        ### Update Weights
        self.w_hid -= self.learning_rate * dw_hid
        self.b_hid -= self.learning_rate * db_hid
        self.w_out -= self.learning_rate * dw_out
        self.b_out -= self.learning_rate * db_out

        return loss

    def inference(self, input_list):
        """Return the class-probability matrix, one row per input record."""
        inputs = np.array(input_list, ndmin=2)
        return self._forward(inputs)[1]

    def get_accuracy(self, input_list, target_labels):
        """Return the fraction of records whose arg-max class equals its label."""
        preds = np.argmax(self.inference(input_list), axis=1)
        # Flatten so list, Series and single-column inputs compare element-wise
        labels = np.asarray(target_labels).ravel()
        accuracy = np.sum(labels == preds) * 1.0 / len(labels)
        return accuracy
    

In [None]:
### Load Data ###
filename = "../data/pima-indians-diabetes.data.csv"
columns = ['pregnant', 'plasma_glucose', 'blood_pressure', 'skin_fold', 'serum_insulin', 
           'bmi', 'pedigree', 'age', 'class']
data = pd.read_csv(filename, names=columns)

#### Normalize Data ###
# Standardise every feature to zero mean / unit variance and remember the
# (mean, std) pair so values can be mapped back to the original scale.
feature_columns = ['pregnant', 'plasma_glucose', 'blood_pressure', 'skin_fold', 
                   'serum_insulin', 'bmi', 'pedigree', 'age']
scaled_features = {}
for col in feature_columns:
    mean, std = data[col].mean(), data[col].std()
    scaled_features[col] = [mean, std]
    # Plain column assignment replaces the column (and its dtype); writing
    # floats into an integer column through .loc[:, col] is a deprecated
    # dtype-incompatible in-place set in recent pandas.
    data[col] = (data[col] - mean)/std

#### Drop Less "Important" columns ###
data = data.drop(['pregnant', 'blood_pressure', 'skin_fold', 
                  'serum_insulin', 'pedigree'], axis=1)
    
### Train/Test Split ###
# X keeps every remaining column except the last; y is a one-column frame ('class')
X = data.iloc[:,:-1]
y = data.iloc[:,-1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

### Hyper parameters ###
epochs = 1000
n_features = X_train.shape[1]
n_output = 2
n_hidden = 100
learning_rate = 0.001
reg_lambda = 0.001

### Stats ###
# Per-epoch history of accuracy on both splits and of the mean batch loss
training_stats = {'training_acc': [], 'validation_acc': [], 'loss': []}

network = NeuralNetwork(n_features, n_output, n_hidden, learning_rate, reg_lambda)
for e in range(epochs):
    # Go through a random batch of 128 records from the training data set.
    # The batch holds index *labels* (drawn with replacement), so rows are
    # selected with the label-based .loc indexer; the old .ix indexer has been
    # removed from pandas.
    batch = np.random.choice(X_train.index, size=128)
    loss = 0
    n = 0
    for record, target in zip(X_train.loc[batch].values, y_train.loc[batch, 'class']):
        loss += network.train(record, target)
        n += 1
    loss = loss/n
    training_stats['loss'].append(loss)

    # Printing out the training progress
    train_acc = network.get_accuracy(X_train, list(y_train['class']))
    val_acc = network.get_accuracy(X_test, list(y_test['class']))
    # "\r" rewinds to the start of the line so the progress text updates in place
    sys.stdout.write("\rProgress: " + str(100 * e/float(epochs))[:4] \
                     + "% ... Training acc: {}%".format(str(train_acc * 100)[:5]) \
                     + " ... Validation acc: {}%".format(str(val_acc * 100)[:5]))
    training_stats['training_acc'].append(train_acc)
    training_stats['validation_acc'].append(val_acc)
# print() with a single argument works on both Python 2 and Python 3
print("\n-------")

In [None]:
# One row per epoch, one column per tracked statistic
stats_df = pd.DataFrame.from_dict(training_stats)
display(stats_df.head())
# Plot only the two accuracy curves; the loss column is left out
accuracy_columns = ['training_acc', 'validation_acc']
stats_df[accuracy_columns].plot()

# Now Keras

In [None]:
from keras.models import Sequential
from keras.layers import Dense
import numpy

In [None]:
def one_hotize(df, n_classes=2):
    """One-hot encode the ``'class'`` column of ``df``.

    Args:
        df: DataFrame with a ``'class'`` column of class indices. The values
            may be stored as floats (e.g. ``1.0``); they are cast to int.
        n_classes: width of the encoding. Defaults to 2, the binary case.

    Returns:
        Integer array of shape ``(len(df), n_classes)`` with a single 1 per row.
    """
    labels = np.asarray(df['class']).astype(int)
    onehot = np.zeros((len(labels), n_classes), dtype=int)
    # Row i gets its 1 in the column named by label i
    onehot[np.arange(len(labels)), labels] = 1
    return onehot

In [None]:
filename = "../data/pima-indians-diabetes.data.csv"
columns = ['pregnant', 'plasma_glucose', 'blood_pressure', 'skin_fold', 'serum_insulin', 
           'bmi', 'pedigree', 'age', 'class']
data = pd.read_csv(filename, names=columns)

#### Normalize Data ###
# Standardise every feature to zero mean / unit variance and remember the
# (mean, std) pair so values can be mapped back to the original scale.
feature_columns = ['pregnant', 'plasma_glucose', 'blood_pressure', 'skin_fold', 
                   'serum_insulin', 'bmi', 'pedigree', 'age']
scaled_features = {}
for col in feature_columns:
    mean, std = data[col].mean(), data[col].std()
    scaled_features[col] = [mean, std]
    # Plain column assignment replaces the column (and its dtype); writing
    # floats into an integer column through .loc[:, col] is a deprecated
    # dtype-incompatible in-place set in recent pandas.
    data[col] = (data[col] - mean)/std

#### Drop Less "Important" columns ###
data = data.drop(['skin_fold', 'serum_insulin', 'age'], axis=1)

# X keeps every remaining column except the last; Y is a one-column frame ('class')
X = data.iloc[:,:-1]
Y = data.iloc[:,-1:]
Y_onehot = one_hotize(Y)
n_features = X.shape[1]
n_output = 2
n_hidden = 20

In [None]:

# create model: two ReLU hidden layers followed by a softmax output layer.
# kernel_initializer is the Keras 2 name of the old `init` argument; the
# `epochs` argument used in fit() below already requires Keras 2.
model = Sequential()
model.add(Dense(n_hidden, input_dim=n_features, kernel_initializer='uniform', activation='relu'))
model.add(Dense(n_hidden, kernel_initializer='uniform', activation='relu'))
model.add(Dense(n_output, kernel_initializer='uniform', activation='softmax'))
# A softmax output trained on one-hot targets pairs with categorical
# cross-entropy. For two classes it equals the binary loss numerically, and
# it stays correct if n_output is ever raised above 2.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# The last 20% of the rows are held out for validation
training = model.fit(X, Y_onehot, validation_split=0.20, epochs=100, batch_size=5, verbose=1)

In [None]:
history_df = pd.DataFrame.from_dict(training.history)
# Keras names the accuracy series 'acc'/'val_acc' in older releases and
# 'accuracy'/'val_accuracy' in newer ones; plot whichever pair is present.
accuracy_columns = [name for name in ('acc', 'val_acc', 'accuracy', 'val_accuracy')
                    if name in history_df.columns]
history_df[accuracy_columns].plot()