# k-Nearest Neighbors Algorithm (k-NN)
> Name: Debatreya Das <br>
> Roll No. 12212070 <br>
> CS A4 <br>
> ML Lab 14

## Loading Dataset

In [5]:
import pandas as pd
# Load the climate dataset
data = pd.read_csv('kNN.csv')

# Inspect the data (first few rows and columns)
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2096 entries, 0 to 2095
Data columns (total 30 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Year         2096 non-null   int64  
 1   Mo           2096 non-null   int64  
 2   Globe        2096 non-null   float64
 3   Land         2096 non-null   float64
 4   Ocean        2096 non-null   float64
 5   NH           2096 non-null   float64
 6   Land.1       2096 non-null   float64
 7   Ocean.1      2096 non-null   float64
 8   SH           2096 non-null   float64
 9   Land.2       2096 non-null   float64
 10  Ocean.2      2096 non-null   float64
 11  Trpcs        2096 non-null   float64
 12  Land.3       2096 non-null   float64
 13  Ocean.3      2096 non-null   float64
 14  NoExt        2096 non-null   float64
 15  Land.4       2096 non-null   float64
 16  Ocean.4      2096 non-null   float64
 17  SoExt        2096 non-null   float64
 18  Land.5       2096 non-null   float64
 19  Ocean.

In [6]:
data.head()

Unnamed: 0,Year,Mo,Globe,Land,Ocean,NH,Land.1,Ocean.1,SH,Land.2,...,NoPol,Land.6,Ocean.6,SoPol,Land.7,Ocean.7,USA48,USA49,AUST,Troposphere
0,1978,12,-0.48,-0.51,-0.47,-0.44,-0.46,-0.42,-0.52,-0.62,...,-0.39,-0.68,-0.06,-0.45,-0.38,-0.49,-1.29,-1.15,-1.29,Lower
1,1979,1,-0.47,-0.64,-0.41,-0.64,-0.86,-0.5,-0.31,-0.13,...,-0.46,-0.95,0.1,-0.16,-0.15,-0.16,-3.22,-2.42,0.92,Lower
2,1979,2,-0.43,-0.56,-0.39,-0.47,-0.57,-0.41,-0.39,-0.53,...,-2.01,-2.3,-1.66,-0.8,-1.25,-0.58,-1.76,-1.84,-0.3,Lower
3,1979,3,-0.38,-0.51,-0.33,-0.46,-0.51,-0.44,-0.3,-0.53,...,-0.56,-0.47,-0.65,-0.52,-1.25,-0.18,-0.7,-0.39,0.23,Lower
4,1979,4,-0.4,-0.57,-0.34,-0.47,-0.62,-0.37,-0.34,-0.46,...,-0.84,-0.81,-0.88,-0.26,0.26,-0.51,-0.72,-0.46,-1.12,Lower


In [7]:
# Standard Scaling the data except the last column

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data.iloc[:, :-1] = scaler.fit_transform(data.iloc[:, :-1])



  1.72487303]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.iloc[:, :-1] = scaler.fit_transform(data.iloc[:, :-1])
  0.15139785]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.iloc[:, :-1] = scaler.fit_transform(data.iloc[:, :-1])


## Split the Data

In [8]:
from sklearn.model_selection import train_test_split

# Separate features (X) and labels (y)
X = data.drop(columns=['Troposphere'])
y = data['Troposphere']  # 'Troposphere' is the target column

# Split the data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print('Training set size:', X_train.shape)
print('Testing set size:', X_test.shape)

Training set size: (1676, 29)
Testing set size: (420, 29)


## Define the Distance Metric (Euclidean Distance)

In [9]:
import numpy as np

def euclidean_distance(point1, point2):
    # Ensure both points are numeric and align them by their indices
    point1 = point1.astype(np.float64)
    point2 = point2.astype(np.float64)
    return np.sqrt(np.sum((point1 - point2) ** 2))

## Implement the k-NN Algorithm

In [10]:
from collections import Counter

def knn_predict(X_train, y_train, test_point, k=3):
    distances = []

    # Make sure test_point is numeric
    test_point = test_point.astype(np.float64)

    # Calculate the distance from the test point to all training points
    for i in range(len(X_train)):
        train_point = X_train.iloc[i]

        # Calculate the Euclidean distance between the test point and the training point
        distance = euclidean_distance(train_point, test_point)
        distances.append((distance, y_train.iloc[i]))

    # Sort distances in ascending order and select the top k
    distances = sorted(distances)[:k]

    # Extract the labels of the k closest neighbors
    neighbors_labels = [label for _, label in distances]

    # Return the most common label among the neighbors
    most_common_label = Counter(neighbors_labels).most_common(1)[0][0]
    return most_common_label

## Make Predictions and Evaluate

In [11]:
# Set k for k-NN
k = 3

# Lists to store correct and incorrect predictions
correct_predictions = []
incorrect_predictions = []

# Iterate over each test point and make predictions
for i in range(len(X_test)):
    test_point = X_test.iloc[i]
    true_label = y_test.iloc[i]
    predicted_label = knn_predict(X_train, y_train, test_point, k=k)
    
    # Check if prediction is correct
    if predicted_label == true_label:
        correct_predictions.append((test_point.values, true_label, predicted_label))
    else:
        incorrect_predictions.append((test_point.values, true_label, predicted_label))

# Print correct predictions
print('\nCorrect Predictions:')
for test_point, true_label, predicted_label in correct_predictions:
    print(f'Test Point: {test_point}, True Label: {true_label}, Predicted Label: {predicted_label}')

# Print incorrect predictions
print('\nIncorrect Predictions:')
for test_point, true_label, predicted_label in incorrect_predictions:
    print(f'Test Point: {test_point}, True Label: {true_label}, Predicted Label: {predicted_label}')


Correct Predictions:
Test Point: [ 0.37657811  0.73046701  0.21868798  0.30678708  0.14964042  0.05281444
  0.22615161 -0.06572138  0.30638608  0.27490325  0.31082554  0.12777038
  0.13590801  0.10356504 -0.02345534  0.20107267 -0.25437218  0.35294224
  0.29935381  0.3195761  -0.35600431 -0.26485826 -0.42694427  0.21484671
  0.43936851  0.06251316  0.50475179  0.48349922  0.4003211 ], True Label: Mid, Predicted Label: Mid
Test Point: [ 0.45588957  1.30953617 -0.96300727 -1.12480004 -0.813154   -0.58022346
 -0.42311644 -0.61508137 -1.11274506 -1.99440934 -0.82311131  0.12777038
  0.11487403  0.12476144 -0.81639705 -0.49682949 -1.06090049 -1.46667881
 -2.39716599 -1.0551179  -0.51454323 -0.54841997 -0.45166304 -2.12080132
 -2.51138411 -1.98519279 -0.24536738 -1.01743184 -0.36280747], True Label: Tropopause, Predicted Label: Tropopause
Test Point: [ 1.40762717  0.73046701 -0.45200392 -0.31446771 -0.46012937  0.08033782
 -0.1105059   0.20895861 -0.85730146 -0.55787201 -0.91033722  0.29786

In [13]:
# check the accuracy of the model
accuracy = len(correct_predictions) / len(X_test)
print(f'\nAccuracy: {accuracy:.2f}')


Accuracy: 0.42


In [12]:
! pandoc kNN.ipynb -o kNN_12212070.docx