In [768]:
import time

import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import metrics
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Location of the diabetes CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
DIABETES_CSV = '/Users/ruyuliu/Desktop/DeepLearningSELF/PYTORCH_NOTEBOOKS/Assignment1/diabetes.csv'

diabetes = pd.read_csv(DIABETES_CSV)
diabetes.head()

In [None]:
# Names of the continuous feature columns, in the order they are fed to the model.
cont_cols = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
# Column holding the class labels.
y_col = ['Outcome']

# Feature matrix: one row per sample, one column per entry of cont_cols,
# stored as a float32 tensor.
feature_columns = [diabetes[name].values for name in cont_cols]
conts = torch.tensor(np.stack(feature_columns, axis=1), dtype=torch.float)

# Labels as a 1-D tensor (the single-column frame is flattened).
label_values = diabetes[y_col].values
y = torch.tensor(label_values).flatten()

In [None]:
class TabularModel(nn.Module):
    """Feed-forward network for purely continuous tabular features.

    The input is batch-normalised and then passed through one
    ``Linear -> ReLU -> BatchNorm1d -> Dropout`` group per entry of ``layers``,
    followed by a final ``Linear`` projection to ``out_sz`` raw scores
    (no softmax is applied inside the model).

    Args:
        n_cont: number of continuous input features.
        out_sz: number of output units.
        layers: sizes of the hidden layers, in order. May be empty, in which
            case the model is input batch-norm followed by a single Linear.
        p: dropout probability applied after every hidden layer.
    """

    def __init__(self,n_cont,out_sz,layers,p=0.5):
        super().__init__()

        # Normalise the raw inputs so features of different magnitudes are comparable.
        self.bn_cont = nn.BatchNorm1d(n_cont)

        layerlist = []
        n_in = n_cont

        # One Linear/ReLU/BatchNorm/Dropout group per requested hidden width.
        for width in layers:
            layerlist.append(nn.Linear(n_in, width))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(width))
            layerlist.append(nn.Dropout(p))
            n_in = width

        # Output layer. Sized from n_in (the width of whatever precedes it)
        # rather than layers[-1], so an empty `layers` no longer raises IndexError.
        layerlist.append(nn.Linear(n_in, out_sz))

        # Chain everything into a single callable module.
        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cont):
        """Return the raw output scores for a batch of continuous features.

        Args:
            x_cont: float tensor whose last dimension has ``n_cont`` entries.

        Returns:
            Tensor of raw scores with ``out_sz`` entries per sample.
        """
        x_cont = self.bn_cont(x_cont)
        x_cont = self.layers(x_cont)

        return x_cont

In [None]:
# Fix the RNG so the initial weights are reproducible, then build a network
# with one 256-unit hidden layer, two output scores and 40% dropout.
torch.manual_seed(33)
model = TabularModel(n_cont=conts.shape[1], out_sz=2, layers=[256], p=0.4)
model

In [None]:
# Number of rows used overall and the share (20%) held out for testing.
batch_size = 768
test_size = int(batch_size*0.2)

# Rows before `split_at` are used for training, the rest (up to batch_size)
# for testing. The split is positional; rows are not shuffled.
split_at = batch_size - test_size

con_train, con_test = conts[:split_at], conts[split_at:batch_size]
y_train, y_test = y[:split_at], y[split_at:batch_size]

In [None]:
start_time = time.time()

epochs = 1500
losses = []

# Learning-rate schedule: start at base_lr and halve at each milestone epoch.
base_lr = 0.01
lr_milestones = [20, 40, 60, 80]
lr_gamma = 0.5

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=base_lr)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=lr_milestones, gamma=lr_gamma, last_epoch=-1)

# Preview the schedule on a throwaway optimizer/scheduler pair. Stepping the
# real scheduler here would leave the real optimizer at the fully decayed
# learning rate before training has even started.
preview_optimizer = torch.optim.Adam([torch.zeros(1, requires_grad=True)], lr=base_lr)
preview_scheduler = torch.optim.lr_scheduler.MultiStepLR(preview_optimizer, milestones=lr_milestones, gamma=lr_gamma)

lr_list_1 = []
for epoch in range(100):
    # optimizer.step() goes first so PyTorch does not warn about call order;
    # the dummy parameter has no gradient, so this changes nothing.
    preview_optimizer.step()
    preview_scheduler.step()
    lr_list_1.append(preview_optimizer.state_dict()['param_groups'][0]['lr'])
plt.plot(range(100), lr_list_1, color='y', label='lr')
plt.legend()
plt.show()


# Dropout and BatchNorm must be in training mode while fitting.
model.train()

for i in range(1, epochs + 1):
    # Full-batch forward pass over the whole training split.
    y_pred = model(con_train)
    loss = criterion(y_pred, y_train)
    losses.append(loss.item())

    # Report only every 50th epoch to save screen space.
    if i%50 == 1:
        print(f'epoch: {i:3}  loss: {loss.item():10.8f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Advance the schedule once per epoch, after the optimizer update.
    scheduler.step()

print(f'epoch: {i:3}  loss: {loss.item():10.8f}') # print the last line
print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

In [None]:
# Training-loss curve: one recorded loss value per epoch.
loss_ax = plt.gca()
loss_ax.plot(range(epochs), losses)
loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('Cross Entropy Loss');

In [None]:
# TO EVALUATE THE ENTIRE TEST SET
# Switch to evaluation mode first: otherwise Dropout keeps zeroing activations
# and BatchNorm normalises with the test batch's own statistics, which makes
# the reported loss and the predictions noisy and non-reproducible.
model.eval()
with torch.no_grad():  # no gradients are needed for evaluation
    y_val = model(con_test)
    loss = criterion(y_val, y_test)
print(f'CE Loss: {loss:.8f}')

In [None]:
# Number of test samples, taken from the data instead of a hard-coded 153 so
# the cell stays correct if the split size changes.
rows = len(y_test)

# Predicted class per test sample = index of the larger of the two output scores.
y_valuation = y_val.argmax(dim=1).tolist()

correct = 0
print(f'{"MODEL OUTPUT":26} ARGMAX  Y_TEST')
for i in range(rows):
    print(f'{str(y_val[i]):26} {y_val[i].argmax():^7}{y_test[i]:^7}')
    if y_valuation[i] == y_test[i].item():
        correct += 1
print(f'\n{correct} out of {rows} = {100*correct/rows:.2f}% correct')

In [None]:
# Score the network's predictions against the true test labels.
#   Accuracy  - share of all predictions that are correct.
#   Precision - share of predicted positives that are truly positive.
#   Recall    - share of true positives that were predicted positive.
y_true = y_test.numpy()
nn_scorers = (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
)
for score_label, score_fn in nn_scorers:
    print(score_label, score_fn(y_true, y_valuation))

In [None]:
# Baseline: linear-kernel SVM on the same train/test split as the network.

# scikit-learn works on NumPy arrays. X_train / X_test were never defined
# anywhere in the notebook, so build them from the tensors of the split cell.
X_train, X_test = con_train.numpy(), con_test.numpy()
y_train_np, y_test_np = y_train.numpy(), y_test.numpy()

# Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

# Train the model using the training set
clf.fit(X_train, y_train_np)

# Predict the response for the test set
y_pred = clf.predict(X_test)


# Accuracy: share of all predictions that are correct.
print("Accuracy:",metrics.accuracy_score(y_test_np, y_pred))

# Precision: share of predicted positives that are truly positive.
print("Precision:",metrics.precision_score(y_test_np, y_pred))

# Recall: share of true positives that were predicted positive.
print("Recall:",metrics.recall_score(y_test_np, y_pred))

In [None]:
# KNN baseline (10 neighbours) on the same train/test split as the network.

# scikit-learn works on NumPy arrays. X_train / X_test were never defined
# anywhere in the notebook, so build them from the tensors of the split cell.
X_train, X_test = con_train.numpy(), con_test.numpy()
y_train_np, y_test_np = y_train.numpy(), y_test.numpy()

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train_np)
y_pred = knn.predict(X_test)

# Accuracy: share of all predictions that are correct.
print("Accuracy:",metrics.accuracy_score(y_test_np, y_pred))

# Precision: share of predicted positives that are truly positive.
print("Precision:",metrics.precision_score(y_test_np, y_pred))

# Recall: share of true positives that were predicted positive.
print("Recall:",metrics.recall_score(y_test_np, y_pred))

In [None]:
# 4x4 grid of pairwise feature scatter plots, coloured by Outcome.
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(10,7))

# (x column index, y column index) into diabetes.columns, one pair per subplot.
plots = [(0,1),(0,2),(0,3),(0,4),(0,5),(0,6),(1,2),(1,3),(1,4),(1,5),(1,6),(2,3),(2,4),(2,5),(2,6),(3,4)]
colors = ['b', 'r']
labels = ['no','yes']

for ax, (x_idx, y_idx) in zip(axes.flat, plots):
    # Dedicated names: the original reused `x` / `y` here, which overwrote the
    # label tensor `y` and broke any later re-run of the train/test split.
    x_name = diabetes.columns[x_idx]
    y_name = diabetes.columns[y_idx]
    for outcome, color in enumerate(colors):
        subset = diabetes[diabetes['Outcome'] == outcome]
        ax.scatter(subset[x_name], subset[y_name], color=color)
    # Axis labels only need to be set once per subplot.
    ax.set(xlabel=x_name, ylabel=y_name)

# Lay out after the axis labels exist so they are taken into account.
fig.tight_layout()
fig.legend(labels=labels, loc=3, bbox_to_anchor=(1.0,0.85))
plt.show()