In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')


In [2]:
glass_df = pd.read_csv("glass.csv")
concrete_df = pd.read_csv("Concrete_Data_Yeh.csv")
concrete_df["age"] = concrete_df["age"].astype(float)


In [3]:
def normalization(df): # min-max normalization
    normalized_df = df.copy(deep=True)  # create a new dataFrame independent from the original one.
    for column_name in normalized_df.columns:
        
        if column_name != "Type":
            column_max = normalized_df[column_name].max()
            column_min = normalized_df[column_name].min()
            for i in range(len(normalized_df[column_name])):
                normalized_df[column_name][i] = (normalized_df[column_name][i] - column_min) / (column_max - column_min)
            
    return normalized_df

In [4]:
normalized_glass_df = normalization(glass_df)
normalized_concrete_df = normalization(concrete_df)

In [5]:
def cross_validation(df): # k-fold cross validation with k=5

    indices = np.arange(df.shape[0])
    np.random.shuffle(indices)

    bucket_1 = []
    bucket_2 = []
    bucket_3 = []
    bucket_4 = []
    bucket_5 = []

    for index, item in enumerate(indices):          # 5erli sub arrayler olarak da ele alınabilir ama galiba en hızlısı bu
        if index % 5 == 0:
            bucket_1.append(item)
        elif index % 5 == 1:
            bucket_2.append(item)
        elif index % 5 == 2:
            bucket_3.append(item)
        elif index % 5 == 3:
            bucket_4.append(item)
        else:
            bucket_5.append(item)
    
    df_subset_1 = df.iloc[bucket_1,:]
    df_subset_2 = df.iloc[bucket_2,:]
    df_subset_3 = df.iloc[bucket_3,:]
    df_subset_4 = df.iloc[bucket_4,:]
    df_subset_5 = df.iloc[bucket_5,:]

    return [df_subset_1, df_subset_2, df_subset_3, df_subset_4, df_subset_5]

In [6]:
glass_folds = cross_validation(glass_df)
normalized_glass_folds = cross_validation(normalized_glass_df)

concrete_folds = cross_validation(concrete_df)
normalized_concrete_folds = cross_validation(normalized_concrete_df)

In [7]:
from math import sqrt
from statistics import mode
from statistics import StatisticsError

def knn_classification(train_df, test_df, weighted=False, k=3):
    dropped_train_df = train_df.drop(list(test_df.index.values)).to_numpy()
    test_df = test_df.to_numpy()
    correct_guesses = 0
    wrong_guesses = 0
    
    for i in range(test_df.shape[0]):
        test_row = test_df[i]
        list_of_distances = list()
        for j in range(dropped_train_df.shape[0]):
            train_row = dropped_train_df[j]
            difference = abs(test_row[:-1] - train_row[:-1])
            distance = np.sqrt(sum(np.square(difference)))
            list_of_distances.append([distance, train_row[-1]])
        list_of_distances.sort(key=lambda x: x[0]) # sort by euclidean distances ascending



        if weighted:
            freqs = [0 for i in range(20)]
            guess = -1
            for i in list_of_distances[:k]:
                try:
                    freqs[int(i[1])] += (1 / i[0])
                except ZeroDivisionError: # no distance between the test point and the training point
                    guess = freqs[int(i[1])]
                    break
            guess = np.argmax(freqs) if guess == -1 else guess


        else: # if uniform weighted
            neighbors = list()
            for i in range(k):
                neighbors.append(list_of_distances[i][1])
                try:
                    guess = mode(neighbors)
                except StatisticsError: # all values are distinct in the list
                    guess = neighbors[0]

        if test_row[-1] == guess:
            correct_guesses += 1
        else:
            wrong_guesses += 1

    accuracy = 100*correct_guesses/(correct_guesses + wrong_guesses)
    print("Accuracy for KNN with k={} is {:.2f}%".format(k, accuracy))
    return accuracy

knn_classification(glass_df, glass_folds[0], False, 1)


Accuracy for KNN with k=1 is 60.47%


60.46511627906977

In [8]:


def knn_regression(train_df, test_df, weighted=False, k=3):
    dropped_train_df = train_df.drop(list(test_df.index.values)).to_numpy()
    test_df = test_df.to_numpy()
    
    error = 0
    attempts = 0
    for i in range(test_df.shape[0]):
        test_row = test_df[i]
        list_of_distances = list()
        for j in range(dropped_train_df.shape[0]):
            train_row = dropped_train_df[j]
            difference = abs(test_row[:-1] - train_row[:-1])
            distance = np.sqrt(sum(np.square(difference)))
            list_of_distances.append([distance, train_row])

        list_of_distances.sort(key=lambda x: x[0]) # sort by euclidean distances ascending  


        if weighted:
            
            guess = 0
            total = 0
            weights = 0
            for item in list_of_distances[:k]:
                try:
                    if item[0] == 0:
                        val = 1
                    else:
                        val = item[0]
                    weight = 1/val
                    pull = item[1][-1] * weight
                    total += pull
                    weights += weight

                except ZeroDivisionError: # no distance between the test point and the training point
                    guess = test_row[-1]
                    weight = 0
                    break
            guess = total / weights if guess == 0 else guess
            # print("guess: ", guess)

            
        else: # not weighted
            guess = 0
            for item in list_of_distances[:k]:           
                guess += item[1][-1]
            
            guess /= k
        error += abs(guess - test_row[-1])
        attempts += 1

    mae = error/attempts
    
    print("Mean Absolute Error for KNN with k={}: {}".format(k, mae))
    return mae



knn_regression(concrete_df, concrete_folds[0], False, 3)



Mean Absolute Error for KNN with k=3: 7.096278317152106


7.096278317152106

In [9]:
concrete_folds[0]

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
133,286.3,200.9,0.0,144.7,11.2,1004.6,803.7,28.0,67.70
710,250.2,166.8,0.0,203.5,0.0,977.6,694.1,28.0,36.96
223,166.1,0.0,163.3,176.5,4.5,1058.6,780.1,100.0,33.54
97,375.0,93.8,0.0,126.6,23.4,852.1,992.6,7.0,45.70
243,229.7,0.0,118.2,195.2,6.1,1028.1,757.6,100.0,40.86
...,...,...,...,...,...,...,...,...,...
427,190.3,0.0,125.2,166.6,9.9,1079.0,798.9,14.0,19.42
365,214.9,53.8,121.9,155.6,9.6,1014.3,780.6,14.0,38.60
342,297.2,0.0,117.5,174.8,9.5,1022.8,753.5,56.0,51.96
656,200.0,133.0,0.0,192.0,0.0,965.4,806.2,3.0,11.41


In [10]:
from math import sqrt
from statistics import mode
from statistics import StatisticsError

def knn(train_df, test_df, k, weighted=False, op="classification"):
    dropped_train_df = train_df.drop(list(test_df.index.values))
    correct_guesses = 0
    wrong_guesses = 0

    error = 0
    attempts = 0
    for i in range(test_df.shape[0]):
        test_row = test_df.iloc[i]
        list_of_distances = list()
        for j in range(dropped_train_df.shape[0]):
            train_row = dropped_train_df.iloc[j]
            difference = abs(test_row.iloc[:-1] - train_row.iloc[:-1])
            distance = sqrt(difference.pow(2).sum())
            
            if op == "classification":
                list_of_distances.append([distance, train_row.iloc[-1]])
            else:
                list_of_distances.append([distance, train_row.iloc[1:-1]])

        list_of_distances.sort(key=lambda x: x[0]) # sort by euclidean distances ascending
        
        
        if weighted: 
            if op == "classification": # distance weighted voting
                freqs = [0 for i in range(20)]
                guess = -1
                for i in list_of_distances[:k]:
                    try:
                        freqs[int(i[1])] += (1 / i[0])
                    except ZeroDivisionError: # no distance between the test point and the training point
                        guess = freqs[int(i[1])]
                        break
                guess = np.argmax(freqs) if guess == -1 else guess
            else: # weighted average
                guess = 0
                total = 0
                weights = 0
                for i in list_of_distances[:k]:
                    try:
                        weight = 1/i[0]
                        pull = i[1][-1] * weight
                        total += pull
                        weights += weight
                    except ZeroDivisionError: # no distance between the test point and the training point
                        guess = test_row.iloc[-1]
                        break
                guess = total / weights if guess == 0 else guess
        else:
            if op == "classification":
                neighbors = list()
                for i in range(k):
                    neighbors.append(list_of_distances[i][1])
                try:
                    guess = mode(neighbors)
                except StatisticsError: # all values are distinct in the list
                    guess = neighbors[0]
            else:
                guess = 0
                for i in list_of_distances[:k]:
                    guess += i[1][-1]
                guess /= k

        if op == "classification":
            if test_row.iloc[-1] == guess:
                correct_guesses += 1
            else:
                wrong_guesses += 1
        else:
            error += abs(guess - test_row.iloc[-1])
            attempts += 1

    if op == "classification":        
        accuracy = 100*correct_guesses/(correct_guesses + wrong_guesses)
        print("Accuracy for KNN with k={} is {:.2f}%".format(k, accuracy))
        return accuracy
    else:
        mae = error/attempts
        print("Mean Absolute Error for KNN with k={}: {}".format(k, mae))
        return mae

In [11]:
k_vals = [1, 3, 5, 7, 9]

In [12]:
for k in k_vals:
    knn_classification_accuracies = list()
    print("KNN Accuracies for Glass Classification without min-max normalization, k={}".format(k))
    for index, i in enumerate(glass_folds):
        print("Fold #{}:".format(index+1))
        knn_classification_accuracies.append(knn_classification(glass_df, i, k=k))
    print("Average KNN Accuracy for Glass Classification without min-max normalization, k={}: {:.2f}%\n-----\n\n".format(k, sum(knn_classification_accuracies) / len(glass_folds)))

KNN Accuracies for Glass Classification without min-max normalization, k=1
Fold #1:
Accuracy for KNN with k=1 is 60.47%
Fold #2:
Accuracy for KNN with k=1 is 69.77%
Fold #3:
Accuracy for KNN with k=1 is 83.72%
Fold #4:
Accuracy for KNN with k=1 is 69.77%
Fold #5:
Accuracy for KNN with k=1 is 71.43%
Average KNN Accuracy for Glass Classification without min-max normalization, k=1: 71.03%
-----


KNN Accuracies for Glass Classification without min-max normalization, k=3
Fold #1:
Accuracy for KNN with k=3 is 60.47%
Fold #2:
Accuracy for KNN with k=3 is 67.44%
Fold #3:
Accuracy for KNN with k=3 is 79.07%
Fold #4:
Accuracy for KNN with k=3 is 62.79%
Fold #5:
Accuracy for KNN with k=3 is 69.05%
Average KNN Accuracy for Glass Classification without min-max normalization, k=3: 67.76%
-----


KNN Accuracies for Glass Classification without min-max normalization, k=5
Fold #1:
Accuracy for KNN with k=5 is 62.79%
Fold #2:
Accuracy for KNN with k=5 is 58.14%
Fold #3:
Accuracy for KNN with k=5 is 72.

In [13]:
for k in k_vals:
    knn_classification_accuracies = list()
    print("KNN Accuracies for Glass Classification with min-max normalization, k={}".format(k))
    for index, i in enumerate(normalized_glass_folds):
        print("Fold #{}:".format(index+1))
        knn_classification_accuracies.append(knn_classification(normalized_glass_df, i, k=k))
    print("Average KNN Accuracy for Glass Classification with min-max normalization, k={}: {:.2f}%\n-----\n\n".format(k, sum(knn_classification_accuracies) / len(normalized_glass_folds)))

KNN Accuracies for Glass Classification with min-max normalization, k=1
Fold #1:
Accuracy for KNN with k=1 is 67.44%
Fold #2:
Accuracy for KNN with k=1 is 62.79%
Fold #3:
Accuracy for KNN with k=1 is 62.79%
Fold #4:
Accuracy for KNN with k=1 is 76.74%
Fold #5:
Accuracy for KNN with k=1 is 64.29%
Average KNN Accuracy for Glass Classification with min-max normalization, k=1: 66.81%
-----


KNN Accuracies for Glass Classification with min-max normalization, k=3
Fold #1:
Accuracy for KNN with k=3 is 67.44%
Fold #2:
Accuracy for KNN with k=3 is 60.47%
Fold #3:
Accuracy for KNN with k=3 is 58.14%
Fold #4:
Accuracy for KNN with k=3 is 81.40%
Fold #5:
Accuracy for KNN with k=3 is 69.05%
Average KNN Accuracy for Glass Classification with min-max normalization, k=3: 67.30%
-----


KNN Accuracies for Glass Classification with min-max normalization, k=5
Fold #1:
Accuracy for KNN with k=5 is 67.44%
Fold #2:
Accuracy for KNN with k=5 is 58.14%
Fold #3:
Accuracy for KNN with k=5 is 65.12%
Fold #4:
Ac

In [14]:
for k in k_vals:
    knn_classification_accuracies = list()
    print("Weighted KNN Accuracies for Glass Classification without min-max normalization, k={}".format(k))
    for index, i in enumerate(glass_folds):
        print("Fold #{}:".format(index+1))
        knn_classification_accuracies.append(knn_classification(glass_df, i, weighted=True, k=k))
    print("Average Weighted KNN Accuracy for Glass Classification without min-max normalization, k={}: {:.2f}%\n-----\n\n".format(k, sum(knn_classification_accuracies) / len(glass_folds)))

Weighted KNN Accuracies for Glass Classification without min-max normalization, k=1
Fold #1:
Accuracy for KNN with k=1 is 60.47%
Fold #2:
Accuracy for KNN with k=1 is 69.77%
Fold #3:
Accuracy for KNN with k=1 is 83.72%
Fold #4:
Accuracy for KNN with k=1 is 69.77%
Fold #5:
Accuracy for KNN with k=1 is 71.43%
Average Weighted KNN Accuracy for Glass Classification without min-max normalization, k=1: 71.03%
-----


Weighted KNN Accuracies for Glass Classification without min-max normalization, k=3
Fold #1:
Accuracy for KNN with k=3 is 55.81%
Fold #2:
Accuracy for KNN with k=3 is 65.12%
Fold #3:
Accuracy for KNN with k=3 is 79.07%
Fold #4:
Accuracy for KNN with k=3 is 62.79%
Fold #5:
Accuracy for KNN with k=3 is 69.05%
Average Weighted KNN Accuracy for Glass Classification without min-max normalization, k=3: 66.37%
-----


Weighted KNN Accuracies for Glass Classification without min-max normalization, k=5
Fold #1:
Accuracy for KNN with k=5 is 60.47%
Fold #2:
Accuracy for KNN with k=5 is 60.

In [15]:
for k in k_vals:
    knn_classification_accuracies = list()
    print("Weighted KNN Accuracies for Glass Classification with min-max normalization, k={}".format(k))
    for index, i in enumerate(normalized_glass_folds):
        print("Fold #{}:".format(index+1))
        knn_classification_accuracies.append(knn_classification(normalized_glass_df, i, k=k, weighted=True))
    print("Average Weighted KNN Accuracy for Glass Classification with min-max normalization, k={}: {:.2f}%\n-----\n\n".format(k, sum(knn_classification_accuracies) / len(normalized_glass_folds)))

Weighted KNN Accuracies for Glass Classification with min-max normalization, k=1
Fold #1:
Accuracy for KNN with k=1 is 67.44%
Fold #2:
Accuracy for KNN with k=1 is 62.79%
Fold #3:
Accuracy for KNN with k=1 is 62.79%
Fold #4:
Accuracy for KNN with k=1 is 76.74%
Fold #5:
Accuracy for KNN with k=1 is 64.29%
Average Weighted KNN Accuracy for Glass Classification with min-max normalization, k=1: 66.81%
-----


Weighted KNN Accuracies for Glass Classification with min-max normalization, k=3
Fold #1:
Accuracy for KNN with k=3 is 67.44%
Fold #2:
Accuracy for KNN with k=3 is 62.79%
Fold #3:
Accuracy for KNN with k=3 is 58.14%
Fold #4:
Accuracy for KNN with k=3 is 81.40%
Fold #5:
Accuracy for KNN with k=3 is 71.43%
Average Weighted KNN Accuracy for Glass Classification with min-max normalization, k=3: 68.24%
-----


Weighted KNN Accuracies for Glass Classification with min-max normalization, k=5
Fold #1:
Accuracy for KNN with k=5 is 67.44%
Fold #2:
Accuracy for KNN with k=5 is 62.79%
Fold #3:
Ac

In [16]:
for k in k_vals:
    mae_list = list()
    print("KNN Mean Absolute Error for Concrete Strength Estimation without min-max normalization, k={}".format(k))
    for index, i in enumerate(concrete_folds):
        print("Fold #{}:".format(index+1))
        mae_list.append(knn_regression(concrete_df, i, k=k))
    print("Average KNN Mean Absolute Error for Concrete Strength Estimation without min-max normalization, k={}: {:.2f}".format(k, sum(mae_list) / len(concrete_folds)))

KNN Mean Absolute Error for Concrete Strength Estimation without min-max normalization, k=1
Fold #1:
Mean Absolute Error for KNN with k=1: 6.4492233009708695
Fold #2:
Mean Absolute Error for KNN with k=1: 6.494174757281551
Fold #3:
Mean Absolute Error for KNN with k=1: 6.61470873786408
Fold #4:
Mean Absolute Error for KNN with k=1: 6.034417475728156
Fold #5:
Mean Absolute Error for KNN with k=1: 6.485922330097087
Average KNN Mean Absolute Error for Concrete Strength Estimation without min-max normalization, k=1: 6.42
KNN Mean Absolute Error for Concrete Strength Estimation without min-max normalization, k=3
Fold #1:
Mean Absolute Error for KNN with k=3: 7.096278317152106
Fold #2:
Mean Absolute Error for KNN with k=3: 7.0896925566343025
Fold #3:
Mean Absolute Error for KNN with k=3: 6.880550161812296
Fold #4:
Mean Absolute Error for KNN with k=3: 5.940420711974113
Fold #5:
Mean Absolute Error for KNN with k=3: 6.205404530744334
Average KNN Mean Absolute Error for Concrete Strength Estim

In [20]:
for k in k_vals:
    mae_list = list()
    print("KNN Mean Absolute Error for Concrete Strength Estimation with min-max normalization, k={}".format(k))
    for index, i in enumerate(normalized_concrete_folds):
        print("Fold #{}:".format(index+1))
        mae_list.append(knn_regression(normalized_concrete_df, i, k=k))
    print("Average KNN Mean Absolute Error for Concrete Strength Estimation with min-max normalization, k={}: {:.2f}".format(k, sum(mae_list) / len(normalized_concrete_folds)))

KNN Mean Absolute Error for Concrete Strength Estimation with min-max normalization, k=1
Fold #1:
Mean Absolute Error for KNN with k=1: 0.07847241288805619
Fold #2:
Mean Absolute Error for KNN with k=1: 0.08745967795583107
Fold #3:
Mean Absolute Error for KNN with k=1: 0.08895705150457019
Fold #4:
Mean Absolute Error for KNN with k=1: 0.08198120179346167
Fold #5:
Mean Absolute Error for KNN with k=1: 0.0886583025009041
Average KNN Mean Absolute Error for Concrete Strength Estimation with min-max normalization, k=1: 0.09
KNN Mean Absolute Error for Concrete Strength Estimation with min-max normalization, k=3
Fold #1:
Mean Absolute Error for KNN with k=3: 0.08418271182654978
Fold #2:
Mean Absolute Error for KNN with k=3: 0.08259946305813348
Fold #3:
Mean Absolute Error for KNN with k=3: 0.08836318202764697
Fold #4:
Mean Absolute Error for KNN with k=3: 0.08204893436109437
Fold #5:
Mean Absolute Error for KNN with k=3: 0.08995207517669934
Average KNN Mean Absolute Error for Concrete Stren

In [21]:
for k in k_vals:
    mae_list = list()
    print("Weighted KNN Mean Absolute Error for Concrete Strength Estimation without min-max normalization, k={}".format(k))
    for index, i in enumerate(concrete_folds):
        print("Fold #{}:".format(index+1))
        mae_list.append(knn_regression(concrete_df, i, k=k, weighted=True))
    print("Average Weighted KNN Mean Absolute Error for Concrete Strength Estimation without min-max normalization, k={}: {:.2f}".format(k, sum(mae_list) / len(concrete_folds)))

Weighted KNN Mean Absolute Error for Concrete Strength Estimation without min-max normalization, k=1
Fold #1:
Mean Absolute Error for KNN with k=1: 6.4492233009708695
Fold #2:
Mean Absolute Error for KNN with k=1: 6.494174757281553
Fold #3:
Mean Absolute Error for KNN with k=1: 6.61470873786408
Fold #4:
Mean Absolute Error for KNN with k=1: 6.034417475728157
Fold #5:
Mean Absolute Error for KNN with k=1: 6.485922330097088
Average Weighted KNN Mean Absolute Error for Concrete Strength Estimation without min-max normalization, k=1: 6.42
Weighted KNN Mean Absolute Error for Concrete Strength Estimation without min-max normalization, k=3
Fold #1:
Mean Absolute Error for KNN with k=3: 5.870810301241852
Fold #2:
Mean Absolute Error for KNN with k=3: 6.023926463276856
Fold #3:
Mean Absolute Error for KNN with k=3: 5.653263599236023
Fold #4:
Mean Absolute Error for KNN with k=3: 5.207232334906761
Fold #5:
Mean Absolute Error for KNN with k=3: 5.183949921063675
Average Weighted KNN Mean Absolut

In [22]:
for k in k_vals:
    mae_list = list()
    print("Weighted KNN Mean Absolute Error for Concrete Strength Estimation with min-max normalization, k={}".format(k))
    for index, i in enumerate(normalized_concrete_folds):
        print("Fold #{}:".format(index+1))
        mae_list.append(knn_regression(normalized_concrete_df, i, k=k, weighted=True))
    print("Average Weighted KNN Mean Absolute Error for Concrete Strength Estimation with min-max normalization, k={}: {:.2f}".format(k, sum(mae_list) / len(normalized_concrete_folds)))

Weighted KNN Mean Absolute Error for Concrete Strength Estimation with min-max normalization, k=1
Fold #1:
Mean Absolute Error for KNN with k=1: 0.07847241288805619
Fold #2:
Mean Absolute Error for KNN with k=1: 0.08745967795583107
Fold #3:
Mean Absolute Error for KNN with k=1: 0.08895705150457019
Fold #4:
Mean Absolute Error for KNN with k=1: 0.08198120179346167
Fold #5:
Mean Absolute Error for KNN with k=1: 0.0886583025009041
Average Weighted KNN Mean Absolute Error for Concrete Strength Estimation with min-max normalization, k=1: 0.09
Weighted KNN Mean Absolute Error for Concrete Strength Estimation with min-max normalization, k=3
Fold #1:
Mean Absolute Error for KNN with k=3: 0.07395215784882497
Fold #2:
Mean Absolute Error for KNN with k=3: 0.07763995425161115
Fold #3:
Mean Absolute Error for KNN with k=3: 0.07876425994728875
Fold #4:
Mean Absolute Error for KNN with k=3: 0.07466680998818515
Fold #5:
Mean Absolute Error for KNN with k=3: 0.08411407518465046
Average Weighted KNN Me