<a href="https://colab.research.google.com/github/Akanksha200008/Advance-Mathematical-Statistics/blob/main/Project5__k_nearest_neighbors_classification_to_classify_binary_events.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from datetime import datetime
from scipy.spatial import distance

# Load data
data_url = "https://mth522.wordpress.com/wp-content/uploads/2024/03/us-violence-brief-1.xls"
violence_data = pd.read_excel(data_url)

# Assemble 'date_recorded' from the 'year', 'month' and 'day' columns in one
# vectorized call instead of constructing a datetime object row by row.
violence_data['date_recorded'] = pd.to_datetime(violence_data[['year', 'month', 'day']])
violence_data.drop(columns=['year', 'month', 'day'], inplace=True)

# Express 'date_recorded' as seconds since the Unix epoch (stored as float so
# the column can hold the scaled values written below).
violence_data['time_in_seconds'] = (
    violence_data['date_recorded'] - pd.Timestamp('1970-01-01')
).dt.total_seconds()

# Scale all three distance features with MinMaxScaler. Time must be scaled
# together with latitude/longitude: raw epoch seconds differ by at least 86400
# per day, which would swamp coordinates confined to [0, 1] and make the
# Euclidean distance depend on time alone. After this step 'time_in_seconds'
# holds the scaled value, not raw seconds.
# NOTE(review): the scaler is fitted on all rows, including the 2023 test
# period; fit on the training rows only if strict train/test isolation matters.
feature_columns = ['latitude', 'longitude', 'time_in_seconds']
location_scaler = MinMaxScaler()
violence_data[feature_columns] = location_scaler.fit_transform(violence_data[feature_columns])

# Split data into training and testing sets based on date. Explicit copies make
# the later column assignment act on independent frames rather than on slices
# of 'violence_data' (avoids pandas' SettingWithCopy ambiguity).
training_data = violence_data[violence_data['date_recorded'] < '2023-01-01'].copy()
testing_data = violence_data[violence_data['date_recorded'] >= '2023-01-01'].copy()

# Replace the column outright so the integer dtype actually takes effect.
testing_data['fatalities'] = testing_data['fatalities'].astype(int)

# Define Euclidean distance function using scipy
def calculate_distance(point1, point2):
    """Return the Euclidean distance between two records.

    Both arguments must support lookup by the keys 'latitude', 'longitude'
    and 'time_in_seconds'; those three values form the vector for each point.
    """
    feature_keys = ('latitude', 'longitude', 'time_in_seconds')
    first_vector = np.array([point1[key] for key in feature_keys])
    second_vector = np.array([point2[key] for key in feature_keys])
    return distance.euclidean(first_vector, second_vector)

# Implement custom K-Nearest Neighbors function
def knn_classifier(train_set, test_set, neighbors_count, label_column=0):
    """Classify each row of ``test_set`` by majority vote of its nearest training rows.

    Distances are Euclidean over the 'latitude', 'longitude' and
    'time_in_seconds' columns. The feature columns are extracted into NumPy
    arrays once and all distances for a test row are computed in a single
    broadcast operation, instead of one DataFrame row lookup per
    (test row, training row) pair.

    Args:
        train_set: DataFrame holding the three feature columns and the labels.
        test_set: DataFrame holding the three feature columns.
        neighbors_count: Number of nearest training rows that vote (k).
        label_column: Label column of ``train_set``, given either as an
            integer position or as a column name. Defaults to position 0.

    Returns:
        A list with one predicted label per row of ``test_set``, in row order.
        When several labels are equally frequent among the neighbours, the
        smallest label wins (``np.unique`` sorts, ``np.argmax`` takes the first).

    Raises:
        ValueError: If ``neighbors_count`` is below 1, or if ``train_set`` is
            empty while ``test_set`` is not.
    """
    if neighbors_count < 1:
        raise ValueError("neighbors_count must be at least 1")

    feature_columns = ['latitude', 'longitude', 'time_in_seconds']
    test_features = test_set[feature_columns].to_numpy(dtype=float)
    if len(test_features) == 0:
        return []
    if len(train_set) == 0:
        raise ValueError("train_set must contain at least one row")
    train_features = train_set[feature_columns].to_numpy(dtype=float)

    # Integer -> positional column, anything else -> column name.
    if isinstance(label_column, (int, np.integer)):
        train_labels = train_set.iloc[:, label_column].to_numpy()
    else:
        train_labels = train_set[label_column].to_numpy()

    predictions_list = []
    for test_point in test_features:
        # Distance from this test row to every training row at once.
        distance_values = np.linalg.norm(train_features - test_point, axis=1)
        nearest_indices = np.argsort(distance_values)[:neighbors_count]
        unique_labels, label_counts = np.unique(
            train_labels[nearest_indices], return_counts=True
        )

        # Predict the most frequent class among the neighbours.
        predictions_list.append(unique_labels[np.argmax(label_counts)])

    return predictions_list

k_options = [1, 3, 5, 7, 9, 11]

# Score the classifier on the held-out rows once per candidate k
for neighbors_count in k_options:
    # Predict, coercing every label to a plain Python int in the same pass
    predicted_values = [
        int(pred)
        for pred in knn_classifier(training_data, testing_data, neighbors_count)
    ]
    model_accuracy = accuracy_score(testing_data['fatalities'], predicted_values)
    print(f"Model accuracy for k={neighbors_count}: {model_accuracy:}")

    # Share of test rows whose predicted label equals the recorded one
    correct_classification_fraction = (
        np.array(predicted_values) == testing_data['fatalities'].values
    ).mean()
    print(f"Fraction of correctly classified data points for k={neighbors_count}: {correct_classification_fraction}")


Model accuracy for k=1: 0.8341013824884793
Fraction of correctly classified data points for k=1: 0.8341013824884793
Model accuracy for k=3: 0.8341013824884793
Fraction of correctly classified data points for k=3: 0.8341013824884793
Model accuracy for k=5: 0.8341013824884793
Fraction of correctly classified data points for k=5: 0.8341013824884793
Model accuracy for k=7: 0.8341013824884793
Fraction of correctly classified data points for k=7: 0.8341013824884793
Model accuracy for k=9: 0.8341013824884793
Fraction of correctly classified data points for k=9: 0.8341013824884793
Model accuracy for k=11: 0.8341013824884793
Fraction of correctly classified data points for k=11: 0.8341013824884793
