In [43]:
import pandas as pd
import numpy as np

# List1
Name = [2,3,4,6,7,8]

# List2
Age = [3,3,4,7,8]

labels = [1,1,1,2,2]

# get the list of tuples from two lists.
# and merge them by using zip().
list_of_tuples = list(zip(Name, Age,labels))

# Assign data to tuples.
list_of_tuples

df = pd.DataFrame(list_of_tuples,
                  columns=['X1', 'X2','Labels'])
# Print data.
print(df)



   X1  X2  Labels
0   2   3       1
1   3   3       1
2   4   4       1
3   6   7       2
4   7   8       2


In [44]:

X = df.iloc[:, :-1]  # Features (all columns except the last)
y = df.iloc[:, -1]   # Target (the last column)
print(X)
print(y)

   X1  X2
0   2   3
1   3   3
2   4   4
3   6   7
4   7   8
0    1
1    1
2    1
3    2
4    2
Name: Labels, dtype: int64


In [45]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Normalize features to ensure equal weight in distance calculation
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(y_test)

1    1
Name: Labels, dtype: int64


In [46]:
# import numpy as np

# def euclidean_distance(x1, x2):
#     # Calculate the Euclidean distance between two points
#     return np.sqrt(np.sum((x1 - x2) ** 2))

# def most_common_label(arr):
#     # Boyer-Moore Voting Algorithm for finding the most common label
#     vote = 0
#     candidate = None
    
#     for i in range(len(arr)):
#         if vote == 0:
#             candidate = arr[i]
#             vote = 1
#         else:
#             if arr[i] == candidate:
#                 vote += 1
#             else:
#                 vote -= 1
#     return candidate

# def weights(distance):
#     # Avoid division by zero by adding a small epsilon
#     return 1 / (distance + 1e-5)

# def knn_algorithm(X_train, y_train, X_test, k):
#     distances = []

#     # Step 1: Calculate the distance between X_test and each X_train
#     for i in range(len(X_train)):
#         distance = euclidean_distance(X_train[i], X_test)
#         distances.append((distance, y_train.iloc[i]))
    
#     print("Euclidean distances are:")
#     print(distances)
    
#     # Step 2: Sort distances by the first element (distance)
#     distances.sort(key=lambda x: x[0])
    
#     # Step 3: Select the k-nearest neighbors
#     k_nearest_neighbors = distances[:k]
    
#     print("K-nearest neighbors:")
#     print(k_nearest_neighbors)
    
#     # Step 4: Extract the labels of the k-nearest neighbors
#     K_nearest_label = [label for _, label in k_nearest_neighbors]
    
#     print("K-nearest labels:")
#     print(K_nearest_label)
    
#     # Step 5: Use the Boyer-Moore Voting Algorithm to find the most common label
#     most_common_label_result = most_common_label(K_nearest_label)
    
#     print("Most common label (Boyer-Moore):")
#     print(most_common_label_result)
    
#     return most_common_label_result


In [47]:
import numpy as np
from collections import Counter

def euclidean_distance(x1, x2):
    # Calculate the Euclidean distance between two points
    return np.sqrt(np.sum((x1 - x2) ** 2))

def weights(distance):
    # Avoid division by zero by adding a small epsilon
    return 1 / (distance + 1e-5)

def weighted_knn_algorithm(X_train, y_train, X_test, k):
    distances = []

    # Step 1: Calculate the distance between X_test and each X_train
    for i in range(len(X_train)):
        distance = euclidean_distance(X_train[i], X_test)
        distances.append((distance, y_train.iloc[i]))
    
    print("Euclidean distances are:")
    print(distances)
    
    # Step 2: Sort distances by the first element (distance)
    distances.sort(key=lambda x: x[0])
    
    # Step 3: Select the k-nearest neighbors
    k_nearest_neighbors = distances[:k]
    
    # Step 4: Compute weights for the k-nearest neighbors
    weights_list = [weights(dist[0]) for dist in k_nearest_neighbors]
    
    print("K-nearest neighbors with weights:")
    print(k_nearest_neighbors)
    
    # Step 5: Compute weighted votes for each label
    weighted_votes = Counter()

    for i, (distance, label) in enumerate(k_nearest_neighbors):
        weighted_votes[label] += weights_list[i]
    
    print("Weighted votes:")
    print(weighted_votes)
    
    # Step 6: Find the label with the highest weighted vote
    most_common_label_result = weighted_votes.most_common(1)[0][0]
    
    return most_common_label_result


In [48]:
correct_predictions = 0

for i in range(len(X_test)):
    predicted_label = weighted_knn_algorithm(X_train, y_train, X_test[i], 3)
    print(predicted_label)
    actual_label = y_test.iloc[i]
    
    if predicted_label == actual_label:
        correct_predictions += 1


Euclidean distances are:
[(3.2144954460250608, 2), (0.7197016060085817, 1), (0.539163866017192, 1), (2.500528485366859, 2)]
K-nearest neighbors with weights:
[(0.539163866017192, 1), (0.7197016060085817, 1), (2.500528485366859, 2)]
Weighted votes:
Counter({1: 3.2441347271158483, 2: 0.39991386089516157})
1


In [50]:
print("number of correct predictions {}".format(correct_predictions))
print("number of total predictions {}".format(len(X_test)))

# Calculate and print the accuracy
accuracy = correct_predictions / len(X_test)
print(f"Accuracy: {accuracy * 100:.2f}%")

number of correct predictions 1
number of total predictions 1
Accuracy: 100.00%


In Step 5, you are computing weighted votes for each label based on the inverse of the distance (weight). The label that has the highest sum of weights will be selected as the predicted label.

Let's break this down with an example:

Example Scenario:
Suppose we have the following scenario:

Test point: X_test
Training set: X_train = [[1, 2], [2, 3], [3, 4], [4, 5]]
Labels: y_train = [0, 1, 1, 0]
Number of neighbors (k): 3
After computing distances, the nearest neighbors and their labels are:
Neighbor 1: Distance = 0.2, Label = 1
Neighbor 2: Distance = 0.5, Label = 0
Neighbor 3: Distance = 0.8, Label = 1
Step 5: Compute Weighted Votes
Weights: You compute the weight for each neighbor as 1 / distance:

Weight for Neighbor 1 = 1 / 0.2 = 5
Weight for Neighbor 2 = 1 / 0.5 = 2
Weight for Neighbor 3 = 1 / 0.8 = 1.25
Weighted Votes Calculation: Now, we accumulate the weights for each label:

python
Copy code
weighted_votes = Counter()

# For Neighbor 1 (Label = 1):
weighted_votes[1] += 5  # So, weighted_votes = {1: 5}

# For Neighbor 2 (Label = 0):
weighted_votes[0] += 2  # Now, weighted_votes = {1: 5, 0: 2}

# For Neighbor 3 (Label = 1):
weighted_votes[1] += 1.25  # Now, weighted_votes = {1: 6.25, 0: 2}
Now we have the weighted votes:

Label 1 has a total weight of 6.25 (from Neighbor 1 and Neighbor 3).
Label 0 has a total weight of 2 (from Neighbor 2).
Step 6: Find the Label with the Highest Weighted Vote
After calculating the total weighted votes, you can now choose the label that has the highest weight sum:

In this case, Label 1 has the highest weight (6.25), so it will be the predicted label.
In code:

python
Copy code
most_common_label_result = weighted_votes.most_common(1)[0][0]  # Returns label 1
So, the predicted label for X_test will be 1, because it received the highest sum of weighted votes.

Summary of Steps:
Compute the weights for each neighbor (using 1/distance).
Accumulate the weights for each label (so labels with closer neighbors get higher weight).
Return the label with the highest total weight as the prediction.
This method ensures that closer neighbors have a larger influence on the final prediction compared to farther neighbors, making the voting process weighted instead of simple majority voting.