In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Define Euclidean distance function
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two rows.

    Each row is read by key and must provide 'latitude', 'longitude' and
    'timestamp'; those three values form the vector used for the distance.
    """
    features = ('latitude', 'longitude', 'timestamp')
    point_a = np.array([row1[name] for name in features])
    point_b = np.array([row2[name] for name in features])
    difference = point_a - point_b
    return np.linalg.norm(difference)

# Split data into training and testing sets based on the year
def split_data(data):
    """Split ``data`` into training and testing frames by calendar year.

    Rows whose 'date' falls before 2023 form the training set; rows dated
    in 2023 form the test set. Rows from later years appear in neither.

    Returns:
        A ``(train_data, test_data)`` tuple.

    Raises:
        KeyError: if ``data`` has no 'date' column.
    """
    if 'date' not in data.columns:
        raise KeyError("The 'date' column is missing from the dataset.")

    years = data['date'].dt.year
    is_train = years < 2023
    is_test = years == 2023
    return data[is_train], data[is_test]

# Implement KNN algorithm
def knn_predict(train_data, test_instance, k):
    """Predict the 'fatalities' label of ``test_instance`` by k-nearest-neighbour vote.

    Distances are Euclidean over the 'latitude', 'longitude' and 'timestamp'
    values. The k closest training rows vote with their 'fatalities' value
    (truncated to ``int``), and the most frequent label wins. On a tie the
    smallest label is returned.

    Args:
        train_data: DataFrame with 'latitude', 'longitude', 'timestamp' and
            'fatalities' columns.
        test_instance: Mapping or Series providing 'latitude', 'longitude'
            and 'timestamp'.
        k: Number of neighbours to consult; must be positive.

    Returns:
        The winning label as a NumPy integer. Labels may be any non-negative
        integers, not only 0 and 1.

    Raises:
        ValueError: if ``k`` is not positive, or a neighbour's label is
            negative or cannot be converted to ``int``.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer.")

    features = ['latitude', 'longitude', 'timestamp']
    train_points = train_data[features].to_numpy(dtype=float)
    query = np.array([test_instance[name] for name in features], dtype=float)

    # One vectorised pass over the training matrix replaces a Python-level
    # iterrows() loop that rebuilt two small arrays for every training row.
    distances = np.linalg.norm(train_points - query, axis=1)

    # A stable sort keeps equally distant rows in their original order, the
    # same tie behaviour as sorting (distance, label) pairs by distance.
    nearest = np.argsort(distances, kind='stable')[:k]
    neighbor_labels = train_data['fatalities'].to_numpy()[nearest]

    # bincount grows to fit the largest label, so labels >= 2 no longer
    # overflow a fixed two-slot table. minlength=2 keeps the result defined
    # (label 0) when there are no neighbours at all.
    votes = np.bincount(
        np.array([int(label) for label in neighbor_labels], dtype=np.intp),
        minlength=2,
    )
    return np.argmax(votes)

# Load data from Excel file
def load_data(file_path):
    """Read the Excel workbook at ``file_path`` into a DataFrame.

    Args:
        file_path: Path handed to ``pandas.read_excel``.

    Returns:
        The DataFrame produced by ``pandas.read_excel``.

    Raises:
        FileNotFoundError: if the file does not exist; the message names the
            offending path.
        Exception: for any other failure while reading the file.

    Both re-raised exceptions are chained to the original error so the
    underlying cause stays visible in the traceback.
    """
    try:
        return pd.read_excel(file_path)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"File not found at the path: {file_path}") from err
    except Exception as err:
        raise Exception(f"An error occurred while loading the file: {err}") from err

# Evaluate KNN for multiple k values
def evaluate_knn(train_data, test_data, k_values):
    """Score the KNN classifier on ``test_data`` for every k in ``k_values``.

    For each k, every test row is classified with ``knn_predict`` and the
    predictions are compared against the 'fatalities' column. Both figures
    are printed and stored.

    Returns:
        A dict mapping k to ``(accuracy, fraction_correctly_classified)``.
        If evaluation fails for some k, the error is printed and that k is
        left out of the dict.
    """
    results = {}
    for k in k_values:
        try:
            predictions = []
            for _, test_row in test_data.iterrows():
                predictions.append(knn_predict(train_data, test_row, k))

            actual = test_data['fatalities']
            accuracy = accuracy_score(actual, predictions)

            # Element-wise comparison of the prediction list with the label column.
            matches = predictions == actual
            fraction_correctly_classified = sum(matches) / len(test_data)

            results[k] = (accuracy, fraction_correctly_classified)
            print(f"Accuracy for k={k}: {accuracy:.2f}")
            print(f"Fraction of correctly classified 2023 locations for k={k}: {fraction_correctly_classified:.2f}")
        except Exception as e:
            print(f"An error occurred during KNN evaluation for k={k}: {e}")
    return results

# Preprocess data
def preprocess_data(data):
    """Build the 'date' and 'timestamp' columns and standardise the coordinates.

    The frame is modified in place and also returned:
      * 'year', 'month' and 'day' are combined into a 'date' column and then
        dropped;
      * 'latitude' and 'longitude' are replaced by their StandardScaler output;
      * 'timestamp' is derived from 'date' as whole seconds.

    All required columns are checked, and every derived value is computed,
    before the frame is touched. A failure therefore never leaves ``data``
    half-modified.

    Raises:
        KeyError: if any required column is missing (the message lists them),
            or if a KeyError occurs while deriving the new values.
        Exception: for any other failure during preprocessing.
    """
    required = ('year', 'month', 'day', 'latitude', 'longitude')
    missing = [name for name in required if name not in data.columns]
    if missing:
        raise KeyError(
            f"Preprocessing failed due to missing column: {', '.join(missing)}"
        )

    try:
        dates = pd.to_datetime(data[['year', 'month', 'day']])
        scaled_coords = StandardScaler().fit_transform(data[['latitude', 'longitude']])
        # NOTE(review): assumes 'date' has nanosecond resolution, so dividing
        # by 10**9 yields seconds — confirm for the pandas version in use.
        timestamps = dates.astype('int64') // 10**9
    except KeyError as e:
        raise KeyError(f"Preprocessing failed due to missing column: {e}") from e
    except Exception as e:
        raise Exception(f"An error occurred during preprocessing: {e}") from e

    # Nothing below is expected to fail, so the caller's frame is only
    # mutated once every derived value is ready.
    data['date'] = dates
    data.drop(['year', 'month', 'day'], axis=1, inplace=True)
    data[['latitude', 'longitude']] = scaled_coords
    # NOTE(review): present-day epoch seconds are on the order of 1e9 while
    # the standardised coordinates are on the order of 1. An unscaled
    # timestamp will dominate any Euclidean distance over these three
    # columns — confirm this is intended.
    data['timestamp'] = timestamps
    return data

# Main program
if __name__ == "__main__":
    # Excel workbook to analyse (hard-coded absolute path on the author's machine).
    file_path = r"/Users/charishyadavali/Downloads/us-violence-brief-1.xls"

    try:
        # Read the workbook, then derive the date/timestamp columns and
        # standardise the coordinates.
        raw_data = load_data(file_path)
        data = preprocess_data(raw_data)

        # Rows before 2023 train the model; rows from 2023 test it.
        train_data, test_data = split_data(data)

        # Odd neighbour counts from 1 through 9.
        k_values = list(range(1, 10, 2))

        # Prints the per-k scores and keeps them in `results`.
        results = evaluate_knn(train_data, test_data, k_values)
    except Exception as e:
        print(f"An error occurred in the main program: {e}")


Accuracy for k=1: 0.83
Fraction of correctly classified 2023 locations for k=1: 0.83
Accuracy for k=3: 0.83
Fraction of correctly classified 2023 locations for k=3: 0.83
Accuracy for k=5: 0.83
Fraction of correctly classified 2023 locations for k=5: 0.83
Accuracy for k=7: 0.83
Fraction of correctly classified 2023 locations for k=7: 0.83
Accuracy for k=9: 0.83
Fraction of correctly classified 2023 locations for k=9: 0.83
