# ASSIGNMENT 1 - CSE 574 - INTRODUCTION TO MACHINE LEARNING

__PART 1 - Logistic Regression__

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load the dataset: every column except the last is a feature, the last
# column is the label.
data = pd.read_csv("diabetes.csv")
data.head()  # preview; only displays when it is the last expression of a notebook cell
x = data[data.columns[:-1]]
y = data[data.columns[-1]]

# 60/20/20 split: hold out 40% first, then halve that hold-out into the
# validation and test sets (x_test / y_test briefly carry the full hold-out).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=42
)
x_val, x_test, y_val, y_test = train_test_split(
    x_test, y_test, test_size=0.5, random_state=42
)
print(x_train.shape, x_test.shape, x_val.shape, y_train.shape, y_test.shape, y_val.shape)

# Convert to NumPy and lay the data out one column per sample: feature
# matrices become (n_features, n_samples) ...
x_train, x_test, x_val = (
    frame.values.transpose() for frame in (x_train, x_test, x_val)
)
# ... and each label vector becomes a (1, n_samples) row matching its
# (already transposed) feature matrix.
y_train, y_test, y_val = (
    labels.values.reshape(1, features.shape[1])
    for labels, features in ((y_train, x_train), (y_test, x_test), (y_val, x_val))
)

# Sanity check of the final array shapes.
print(x_train.shape, x_test.shape, x_val.shape, y_train.shape, y_test.shape, y_val.shape)


def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)), applied element-wise.

    Works on anything that supports unary minus and ``np.exp`` (Python
    scalars and NumPy arrays); the result has the same shape as ``x``.
    """
    decay = np.exp(-x)
    return 1 / (1 + decay)
    

def model(x, y, learning_rate, iterations):
    """Fit a logistic-regression classifier with batch gradient descent.

    Args:
        x: Feature matrix laid out as (n_features, n_samples).
        y: Labels broadcastable against the (1, n_samples) activations;
            the callers in this file pass a (1, n_samples) array of 0/1.
        learning_rate: Step size applied to every gradient update.
        iterations: Number of full-batch updates to perform.

    Returns:
        A tuple ``(W, B, cost_list)``: the (n_features, 1) weight matrix,
        the scalar bias, and the cross-entropy cost recorded at each
        iteration (computed before that iteration's update).
    """
    # Take the dimensions from the data actually passed in. Reading them
    # from the global x_train gave wrong weight shapes and a wrong 1/m
    # scale for any other dataset.
    n = x.shape[0]
    m = x.shape[1]

    W = np.zeros((n, 1))
    B = 0
    cost_list = []

    # Keeps log() finite when the sigmoid saturates to exactly 0 or 1.
    # Only the reported cost is clipped; the gradients use the raw A.
    eps = 1e-15

    for i in range(iterations):
        Z = np.dot(W.transpose(), x) + B
        A = sigmoid(Z)

        # Cost function (binary cross-entropy averaged over the batch).
        A_safe = np.clip(A, eps, 1 - eps)
        cost = -(1 / m) * np.sum(y * np.log(A_safe) + (1 - y) * np.log(1 - A_safe))

        # Gradient descent step.
        dW = (1 / m) * np.dot((A - y), x.transpose())
        dB = (1 / m) * np.sum(A - y)

        W = W - learning_rate * dW.transpose()
        B = B - learning_rate * dB

        cost_list.append(cost)

        # Progress report roughly every tenth of the run.
        if i % (iterations / 10) == 0:
            print("cost after ", i, " iterations is: ", cost)
    return W, B, cost_list

# Hyperparameters for the gradient-descent run.
learning_rate = 0.00023
iterations = 1000000

# Train on the training split; cost_list holds one cost value per iteration.
W, B, cost_list = model(
    x_train,
    y_train,
    learning_rate=learning_rate,
    iterations=iterations,
)

# Cost curve over the whole run.
plt.plot(np.arange(iterations), cost_list)
plt.show()

def accuracy(X, Y, W, B):
    """Print the classification accuracy (in percent) of weights W, B on X, Y.

    Predictions are the sigmoid activations thresholded at 0.5. ``Y`` must
    be two-dimensional; its second axis length is used as the sample count.
    The value is printed rounded to two decimals and nothing is returned.
    """
    probabilities = sigmoid(np.dot(W.T, X) + B)

    # Hard 0/1 predictions: strictly above 0.5 counts as the positive class.
    predictions = np.array(probabilities > 0.5, dtype='int64')

    # With 0/1 labels, |prediction - label| is 1 exactly for each mistake.
    mistakes = np.sum(np.absolute(predictions - Y))
    sample_count = Y.shape[1]

    acc = (1 - mistakes / sample_count) * 100
    print(round(acc, 2), "%")

# Report accuracy on each split in turn, then show the learned parameters.
for heading, features, labels in (
    ("The accuracy of training dataset is: ", x_train, y_train),
    ("The accuracy of validation dataset is: ", x_val, y_val),
    ("The accuracy of test dataset is: ", x_test, y_test),
):
    print(heading)
    accuracy(features, labels, W, B)

print(W)
print(B)

__PART 2 - Neural Networks__

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt

# Load the dataset; 'Outcome' is the label and every other column is a feature.
data = pd.read_csv("diabetes.csv")
properties = list(data.columns.values)
properties.remove('Outcome')
X = data[properties]
y = data['Outcome']

# 60/20/20 split: hold out 40% first, then halve that hold-out into the
# validation and test sets (X_test / y_test briefly carry the full hold-out).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.5, random_state=42
)
print(X_train.shape, X_test.shape, X_val.shape, y_train.shape, y_test.shape, y_val.shape)

# Replace each pandas object with its underlying NumPy array.
X_train, y_train, X_test, y_test, X_val, y_val = (
    part.values for part in (X_train, y_train, X_test, y_test, X_val, y_val)
)

# Small fully connected binary classifier: one hidden ReLU layer of 12 units
# and a single sigmoid output, with L1 penalties on all kernels and biases.
# The input width is taken from the training data instead of being
# hard-coded, so the network follows the number of feature columns.
# NOTE: this rebinds the name `model`, which Part 1 uses for its training
# function.
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(X_train.shape[1],)),
    keras.layers.Dense(
        12,
        kernel_regularizer=regularizers.L1(0.001),
        bias_regularizer=regularizers.L1(0.001),
        activation=tf.nn.relu,
    ),
    keras.layers.Dense(
        1,
        kernel_regularizer=regularizers.L1(0.001),
        bias_regularizer=regularizers.L1(0.001),
        activation=tf.nn.sigmoid,
    ),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# batch_size=1 performs one weight update per sample; the validation split
# is evaluated at the end of every epoch and recorded in `history`.
history = model.fit(X_train, y_train, epochs=100, batch_size=1, validation_data=(X_val, y_val))

# Final metrics of the fitted network on each split.
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

# Evaluate on the training split itself; this previously passed the
# validation arrays, so the "training" numbers were validation numbers.
train_loss, train_acc = model.evaluate(X_train, y_train)
print("\n Training set Accuracy:", train_acc)
print("\n Training set loss:", train_loss)

val_loss, val_acc = model.evaluate(X_val, y_val)
print("\n Validation Accuracy:", val_acc)
print("\n Validation loss:", val_loss)

# Learning curves recorded by model.fit: loss per epoch ...
loss_train = history.history['loss']
loss_val = history.history['val_loss']
# Number the epochs from the recorded history so the x-axis always matches
# the number of epochs that were run.
epochs = range(1, len(loss_train) + 1)
plt.plot(epochs, loss_train, 'g', label='Training loss')
plt.plot(epochs, loss_val, 'b', label='Validation Loss')
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# ... and accuracy per epoch.
acc_train = history.history['accuracy']
acc_val = history.history['val_accuracy']
epochs = range(1, len(acc_train) + 1)
plt.plot(epochs, acc_train, 'g', label='Training Accuracy')
plt.plot(epochs, acc_val, 'b', label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
# This figure shows accuracy, so label the axis accordingly (was 'Loss').
plt.ylabel('Accuracy')
plt.legend()
plt.show()

__PART 3 - Implementing Dropout Regularization__