In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load data from Excel file
file_path = r"C:\Users\ashwi\Downloads\us-violence-brief-1.xls"
data = pd.read_excel(file_path)

# Preprocessing data
# Convert dates to datetime and drop original date columns
data['date'] = pd.to_datetime(data[['year', 'month', 'day']])
data.drop(['year', 'month', 'day'], axis=1, inplace=True)

# Scale latitude and longitude
scaler = StandardScaler()
data[['latitude', 'longitude']] = scaler.fit_transform(data[['latitude', 'longitude']])

# Convert datetime to timestamp (int64) in seconds
data['timestamp'] = data['date'].astype('int64') // 10**9  

# Split data into training and testing sets based on year
train_data = data[data['date'].dt.year < 2023]
test_data = data[data['date'].dt.year == 2023]

# Define distance function (Euclidean distance)
def euclidean_distance(x1, x2):
    coords_1 = np.array([x1['latitude'], x1['longitude'], x1['timestamp']])
    coords_2 = np.array([x2['latitude'], x2['longitude'], x2['timestamp']])
    return np.sqrt(np.sum((coords_1 - coords_2) ** 2))

# Implement KNN algorithm
def knn(train_data, test_data, k):
    predictions = []
    for _, test_row in test_data.iterrows():
        # Calculate distances to each training point
        distances = [
            (euclidean_distance(test_row, train_row), train_row['fatalities'])
            for _, train_row in train_data.iterrows()
        ]
        
        # Get the k nearest neighbors
        neighbors = sorted(distances, key=lambda x: x[0])[:k]
        
        # Count occurrences in the k nearest neighbors and predict the most common class
        counts = np.bincount([int(neighbor[1]) for neighbor in neighbors], minlength=2)
        prediction = np.argmax(counts)
        predictions.append(prediction)
    return predictions

# Choose odd values of k for KNN
k_values = [1, 3, 5, 7, 9]

# Evaluate KNN for each k value
for k in k_values:
    predictions = knn(train_data, test_data, k)
    accuracy = accuracy_score(test_data['fatalities'], predictions)
    print(f"Accuracy for k={k}: {accuracy:}")
    
    # Calculate fraction of correctly classified 2023 locations
    correctly_classified = sum(pred == true for pred, true in zip(predictions, test_data['fatalities']))
    fraction_correctly_classified = correctly_classified / len(test_data)
    print(f"Fraction of correctly classified 2023 locations for k={k}: {fraction_correctly_classified:}")


Accuracy for k=1: 0.8341013824884793
Fraction of correctly classified 2023 locations for k=1: 0.8341013824884793
Accuracy for k=3: 0.8341013824884793
Fraction of correctly classified 2023 locations for k=3: 0.8341013824884793
Accuracy for k=5: 0.8341013824884793
Fraction of correctly classified 2023 locations for k=5: 0.8341013824884793
Accuracy for k=7: 0.8341013824884793
Fraction of correctly classified 2023 locations for k=7: 0.8341013824884793
Accuracy for k=9: 0.8341013824884793
Fraction of correctly classified 2023 locations for k=9: 0.8341013824884793
