In [1]:
from random import randrange

import numpy as np
import pandas as pd

In [3]:
# Load the Iris CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell only
# runs where the file exists at exactly this location.
df = pd.read_csv("C:/Users/byeongwan/OneDrive/desktop/archive/Iris.csv")

In [4]:
# Display the first rows as a quick check of the loaded columns.
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


# k-NN

In [5]:
# Distances
def euclidian(p1, p2):
    """Return the Euclidean (L2) distance between two points.

    Coordinates are read as ``p1[i]`` and ``p2[i]`` for every ``i`` in
    ``range(len(p1))``, so ``p2`` must provide at least as many entries as
    ``p1``.
    """
    squared_sum = sum(np.square(p1[i] - p2[i]) for i in range(len(p1)))
    return np.sqrt(squared_sum)

def manhattan(p1, p2):
    """Return the Manhattan (L1) distance between two points.

    Sums ``abs(p1[i] - p2[i])`` over ``i`` in ``range(len(p1))``; an empty
    ``p1`` yields ``0``.
    """
    return sum(abs(p1[i] - p2[i]) for i in range(len(p1)))

def minkowski(p1, p2, q):
    """Return the Minkowski distance of order ``q`` between two points.

    Computes ``(sum_i |p1[i] - p2[i]| ** q) ** (1 / q)`` for ``i`` in
    ``range(len(p1))``. With ``q=1`` this equals the Manhattan distance and
    with ``q=2`` the Euclidean distance.

    Raises
    ------
    ZeroDivisionError
        If ``q`` is 0 (from the ``1 / q`` exponent).
    """
    total = 0
    for i in range(len(p1)):
        total = total + abs(p1[i] - p2[i]) ** q
    # Take the q-th root of the sum directly. The previous version applied a
    # square root first and then the 1/q power, which returned sum**(1/(2q)).
    return total ** (1 / q)

In [6]:
# kNN Function
def kNN(X_train, y_train, X_test, k, dist='euclidian', q=2):
    """Classify each row of ``X_test`` by majority vote of its ``k`` nearest training rows.

    Parameters
    ----------
    X_train : array-like, shape (n_train, n_features)
        Training features (NumPy array, DataFrame or nested list).
    y_train : array-like, length n_train
        Numeric class labels; each label is converted with ``int()`` before
        voting, so any integer-valued labels are accepted.
    X_test : array-like, shape (n_test, n_features)
        Points to classify.
    k : int
        Number of neighbours that vote; must satisfy ``1 <= k <= n_train``.
    dist : {'euclidian', 'manhattan', 'minkowski'}
        Distance metric.
    q : number
        Order of the Minkowski distance (only used when ``dist='minkowski'``).

    Returns
    -------
    list of int
        Predicted label for every test row. Vote ties go to the smallest label.

    Raises
    ------
    ValueError
        For an unknown ``dist``, inconsistent input shapes, or ``k`` out of range.
    """
    train = np.asarray(X_train, dtype=float)
    test = np.asarray(X_test, dtype=float)
    labels = np.asarray(y_train)

    # Each metric maps a (n_train, n_features) difference matrix to n_train distances.
    metrics = {
        'euclidian': lambda diff: np.sqrt(np.square(diff).sum(axis=1)),
        'manhattan': lambda diff: np.abs(diff).sum(axis=1),
        'minkowski': lambda diff: (np.abs(diff) ** q).sum(axis=1) ** (1 / q),
    }
    if dist not in metrics:
        # Previously an unknown metric left every distance at zero and
        # silently produced meaningless predictions.
        raise ValueError(
            "Unknown dist %r; expected one of %s" % (dist, sorted(metrics))
        )
    if train.ndim != 2 or test.ndim != 2 or train.shape[1] != test.shape[1]:
        raise ValueError("X_train and X_test must be 2-D with the same number of columns")
    if len(labels) != train.shape[0]:
        raise ValueError("y_train must contain one label per row of X_train")
    if not 1 <= k <= len(labels):
        raise ValueError("k must be between 1 and the number of training rows")

    row_distances = metrics[dist]
    pred = []
    for point in test:
        # Broadcasting gives the distance from this point to every training row at once.
        nearest = np.argsort(row_distances(train - point))[:k]

        # Count the votes of the k closest neighbours for whatever labels occur.
        votes = {}
        for label in labels[nearest]:
            label = int(label)
            votes[label] = votes.get(label, 0) + 1

        # max() keeps the first maximum, so iterating sorted labels resolves
        # ties towards the smallest label.
        pred.append(max(sorted(votes), key=votes.get))

    return pred

# Logistic Regression

In [7]:
# Sigmoid Function 
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated with ``np.exp``."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [8]:
# Cost Function
def J(h, y):
    """Mean binary cross-entropy between predicted probabilities and 0/1 targets.

    Parameters
    ----------
    h : array-like
        Predicted probabilities.
    y : array-like
        Targets, combined with ``h`` element-wise.

    Returns
    -------
    float
        ``mean(-y * log(h) - (1 - y) * log(1 - h))``.
    """
    # Keep probabilities strictly inside (0, 1): a saturated sigmoid returns
    # exactly 0.0 or 1.0, and log(0) would turn the cost into inf or NaN.
    eps = 1e-15
    h = np.clip(h, eps, 1 - eps)
    return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

In [9]:
# Gradient Descent Function
def gradientdescent(X, y, lmd, alpha, num_iter, print_cost):
    """Fit logistic-regression weights with batch gradient descent.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_params)
        Design matrix; column 0 is treated as the intercept and is not regularised.
    y : ndarray, shape (n_samples,)
        Binary targets.
    lmd : float
        Regularisation strength (scaled by ``1 / y.size``).
    alpha : float
        Learning rate.
    num_iter : int
        Number of gradient steps.
    print_cost : bool
        When true, print iteration, cost and theta every 100 iterations.

    Returns
    -------
    theta : ndarray, shape (n_params,)
        Weights after the last step (all zeros when ``num_iter`` is 0).
    costs : list
        Cost recorded at iterations 0, 100, 200, ...
    """
    # Start from all-zero weights.
    theta = np.zeros(X.shape[1])
    costs = []

    for i in range(num_iter):
        h = sigmoid(np.dot(X, theta))

        # Regularisation term; the intercept (first weight) is excluded.
        reg = lmd / y.size * theta
        reg[0] = 0

        gradient = np.dot(X.T, (h - y)) / y.size + reg
        theta = theta - alpha * gradient

        # The cost is only consumed every 100th iteration, so evaluate it only
        # then. It is computed from the hypothesis before this step's update,
        # while the printed theta is the value after the update.
        if i % 100 == 0:
            cost = J(h, y)
            if print_cost:
                print('Number of Iterations: ', i, 'Cost : ', cost, 'Theta: ', theta)
            costs.append(cost)

    return theta, costs

In [10]:
# Predict Function 
def predict(X_test, theta):
    """Return ``sigmoid(X_test . theta)``: the model's probability for each row of ``X_test``."""
    return sigmoid(np.dot(X_test, theta))

In [11]:
# Main Logistic Function
def logistic(X_train, y_train, X_test, lmd=0, alpha=0.1, num_iter=30000, print_cost = False):
    """Train one-vs-rest logistic regression and predict train and test labels.

    Parameters
    ----------
    X_train, X_test : 2-D array-like
        Feature matrices without an intercept column; a column of ones is
        prepended here.
    y_train : array-like, length n_train
        Class labels.
    lmd, alpha, num_iter, print_cost
        Forwarded unchanged to ``gradientdescent``.

    Returns
    -------
    dict
        ``"costs"`` (one cost history per class, in sorted class order),
        ``"Y_prediction_test"`` and ``"Y_prediction_train"`` (predicted class
        labels), plus the ``"learning_rate"``, ``"num_iterations"`` and
        ``"lambda"`` that were used.
    """
    def _with_intercept(X):
        # Prepend a column of ones so the first weight acts as the intercept.
        return np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)

    X_train = _with_intercept(X_train)
    X_test = _with_intercept(X_test)

    # Sorted unique labels give a deterministic class order. Iterating a set
    # gave an arbitrary order, and argmax positions were returned as labels.
    y_train = np.asarray(y_train)
    classes = np.unique(y_train)

    thetas = []
    allCosts = []
    for c in classes:
        # One-vs-rest: the current class becomes 1, every other class 0.
        ynew = np.array(y_train == c, dtype=int)
        theta_c, costs_c = gradientdescent(X_train, ynew, lmd, alpha, num_iter, print_cost)
        thetas.append(theta_c)
        allCosts.append(costs_c)

    def _predict_labels(X):
        # One row of probabilities per class; the most probable class wins.
        probabilities = np.zeros((len(classes), len(X)))
        for row, theta_c in enumerate(thetas):
            probabilities[row, :] = predict(X, theta_c)
        # Map the winning row back to its actual class label.
        return classes[np.argmax(probabilities, axis=0)]

    d = {"costs": allCosts,
         "Y_prediction_test": _predict_labels(X_test),
         "Y_prediction_train": _predict_labels(X_train),
         "learning_rate": alpha,
         "num_iterations": num_iter,
         "lambda": lmd}

    return d

# Logistic Regression from Neural Network Perspective

In [12]:
# Sigmoid Function
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``.

    Same formula as the ``sigmoid`` defined earlier in the notebook; this
    later definition shadows the earlier one.
    """
    exp_neg_z = np.exp(-z)
    return 1 / (1 + exp_neg_z)

# Select initial values zero
def initialize_with_zeros(dim):
    """Return zero-initialised parameters: a ``(dim, 1)`` weight column and the bias ``0``."""
    weights = np.zeros((dim, 1))
    bias = 0
    return weights, bias

In [None]:
def propagate(w, b, X, Y):
    """Run one forward/backward pass of logistic regression.

    Parameters
    ----------
    w : ndarray
        Weights; used as ``w.T . X``.
    b : scalar
        Bias added to every activation.
    X : ndarray
        Inputs; ``X.shape[1]`` is taken as the number of examples ``m``.
    Y : ndarray
        0/1 targets, broadcast against the activations.

    Returns
    -------
    grads : dict
        ``{"dw": ..., "db": ...}``, the gradients of the cost w.r.t. ``w`` and ``b``.
    cost : float
        Cross-entropy cost averaged over the ``m`` examples.
    """
    m = X.shape[1]

    # Forward pass: activations for every example.
    A = sigmoid(np.dot(w.T, X) + b)

    # Clip only for the cost: a saturated activation of exactly 0 or 1 would
    # make log() return -inf and turn the cost into inf/NaN. The gradients
    # below keep using the unclipped activations.
    eps = 1e-15
    A_safe = np.clip(A, eps, 1 - eps)
    cost = -1/m*np.sum(Y*np.log(A_safe)+(1-Y)*np.log(1-A_safe))

    # Backward pass.
    dw = 1/m*np.dot(X, (A-Y).T)
    db = 1/m*np.sum(A-Y)

    grads = {"dw": dw,
             "db": db}

    return grads, cost

In [None]:
def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost = False):
    """Run gradient descent on ``w`` and ``b`` using ``propagate``.

    Parameters
    ----------
    w, b
        Initial weights and bias; they are not modified in place.
    X, Y
        Passed unchanged to ``propagate`` on every iteration.
    num_iterations : int
        Number of update steps.
    learning_rate : float
        Step size of each update.
    print_cost : bool
        When true, print the cost every 100 iterations.

    Returns
    -------
    params : dict
        ``{"w": ..., "b": ...}`` after the last update.
    grads : dict
        ``{"dw": ..., "db": ...}`` from the last iteration; both are ``None``
        when ``num_iterations`` is 0.
    costs : list
        Cost recorded at iterations 0, 100, 200, ...
    """
    costs = []
    # Defined up front so that zero iterations returns cleanly instead of
    # failing on unbound names when the result dictionaries are built.
    dw = None
    db = None

    for i in range(num_iterations):
        # Cost and gradients at the current parameters.
        grads, cost = propagate(w, b, X, Y)
        dw = grads["dw"]
        db = grads["db"]

        # Gradient-descent update.
        w = w - learning_rate * dw
        b = b - learning_rate * db

        # Record (and optionally print) the cost every 100 iterations.
        if i % 100 == 0:
            costs.append(cost)
            if print_cost:
                print ("Cost after iteration %i: %f" %(i, cost))

    params = {"w": w,
              "b": b}

    grads = {"dw": dw,
             "db": db}

    return params, grads, costs

In [None]:
def predict_nn(w, b, X):
    """Return the predicted probabilities ``sigmoid(w.T . X + b)``.

    Parameters
    ----------
    w : ndarray
        Weights; reshaped to ``(X.shape[0], 1)`` before use.
    b : scalar
        Bias.
    X : ndarray
        Inputs whose first axis matches the number of weights.

    Returns
    -------
    ndarray
        Probabilities (not thresholded class labels).
    """
    # The unused ``m`` and ``Y_prediction`` locals were removed; they never
    # influenced the returned value.
    w = w.reshape(X.shape[0], 1)
    return sigmoid(np.dot(w.T, X) + b)

In [None]:
def model(X_train, Y_train, X_test, Y_test, num_iterations = 30000, learning_rate = 0.1, print_cost = False):
    """Train one-vs-all logistic regression (weights + bias form) and predict labels.

    Parameters
    ----------
    X_train, X_test : DataFrame or 2-D array-like, shape (n_samples, n_features)
        Feature tables with one row per example; transposed internally.
    Y_train : Series or array-like, length n_train
        Class labels of the training rows.
    Y_test : any
        Accepted for interface compatibility; not used by the computation.
    num_iterations, learning_rate, print_cost
        Forwarded to ``optimize``.

    Returns
    -------
    dict
        ``"costs"`` (one cost history per class, in sorted class order),
        ``"Y_prediction_test"`` and ``"Y_prediction_train"`` (predicted class
        labels), plus ``"learning_rate"`` and ``"num_iterations"``.
    """
    # Convert to NumPy and put examples in columns, as propagate/predict_nn expect.
    X_train = np.asarray(X_train).T
    X_test = np.asarray(X_test).T
    Y_train = np.asarray(Y_train).reshape(1, -1)

    # Every per-class run starts from the same zero parameters.
    w, b = initialize_with_zeros(X_train.shape[0])

    # Take the classes from the Y_train argument. The previous code read an
    # undefined global ``y_train`` here and in the loop below, so it only
    # worked when such a variable happened to exist in the kernel.
    classes = np.unique(Y_train)
    param_w = []
    param_b = []
    allCosts = []
    for c in classes:
        # One-vs-all: the current class becomes 1, every other class 0.
        ynew = np.array(Y_train == c, dtype = int)
        parameters, _, costs = optimize(w, b, X_train, ynew, num_iterations, learning_rate, print_cost = print_cost)

        allCosts.append(costs)
        param_w.append(parameters["w"])
        param_b.append(parameters["b"])

    def _predict_labels(X):
        # One row of probabilities per class; the most probable class wins.
        probabilities = np.zeros((len(classes), X.shape[1]))
        for row in range(len(classes)):
            probabilities[row, :] = predict_nn(param_w[row], param_b[row], X)
        # Map the winning row back to its actual class label.
        return classes[np.argmax(probabilities, axis=0)]

    d = {"costs": allCosts,
         "Y_prediction_test": _predict_labels(X_test),
         "Y_prediction_train": _predict_labels(X_train),
         "learning_rate": learning_rate,
         "num_iterations": num_iterations}

    return d

In [13]:
def cross_validation_split(dataset, folds):
    """Randomly partition the rows of ``dataset`` into ``folds`` equally sized folds.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows to distribute. The frame itself is not modified.
    folds : int
        Number of folds; must be at least 1.

    Returns
    -------
    list of ndarray
        ``folds`` arrays, each holding ``dataset.shape[0] // folds`` rows.
        Rows left over when the row count is not divisible by ``folds`` are
        not assigned to any fold.

    Raises
    ------
    ValueError
        If ``folds`` is smaller than 1.
    """
    if folds < 1:
        raise ValueError("folds must be a positive integer")

    fold_size = dataset.shape[0] // folds
    # Track unassigned rows by position. Positional access stays correct when
    # index labels repeat, and avoids copying the frame after every draw.
    remaining = list(range(dataset.shape[0]))

    dataset_split = []
    for _ in range(folds):
        fold = []
        while len(fold) < fold_size:
            # Draw one of the rows that has not been used yet; popping it
            # guarantees that no row is selected twice.
            position = remaining.pop(randrange(len(remaining)))
            fold.append(dataset.iloc[position].values.tolist())
        dataset_split.append(np.asarray(fold))

    return dataset_split

In [14]:
def kfoldCV(dataset, f=5, k=5, model="logistic", n_features=4):
    """Estimate accuracy with ``f``-fold cross-validation.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Numeric table whose first ``n_features`` columns are features and whose
        column at position ``n_features`` is the class label.
    f : int
        Number of folds (at least 2, so a training set remains).
    k : int
        Number of neighbours; only used when ``model="knn"``.
    model : {"logistic", "knn"}
        Classifier to evaluate. ``"logistic"`` uses the defaults of ``logistic``.
    n_features : int
        Number of leading feature columns (default 4, the previous fixed layout).

    Returns
    -------
    list of float
        Accuracy on each held-out fold.

    Raises
    ------
    ValueError
        If ``model`` is not recognised or ``f`` is smaller than 2.
    """
    # Fail fast: these cases used to surface later as unbound-variable errors.
    if model not in ("logistic", "knn"):
        raise ValueError("model must be 'logistic' or 'knn', got %r" % (model,))
    if f < 2:
        raise ValueError("f must be at least 2 so that a training set remains")

    data = cross_validation_split(dataset, f)
    result = []
    for i in range(f):
        # Train on every fold except the i-th, which is held out for testing.
        cv = np.concatenate([data[j] for j in range(f) if j != i], axis=0)
        X_cv, y_cv = cv[:, 0:n_features], cv[:, n_features]
        X_val, y_val = data[i][:, 0:n_features], data[i][:, n_features]

        if model == "logistic":
            # Uses logistic()'s default alpha and num_iter; pass them here to change.
            test = logistic(X_cv, y_cv, X_val)['Y_prediction_test']
        else:
            test = kNN(X_cv, y_cv, X_val, k)

        # Fraction of held-out rows whose prediction matches the true label.
        acc = (np.asarray(test) == y_val).sum()
        result.append(acc / len(test))

    return result