In [32]:
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import csv
import sys

K = 9
z_score = False
np.set_printoptions(suppress=True)

# Get rid of Year, Month, Day columns
# Find a way to add the columns to a list and assign numbers to them
# No need to z-score because data is categorical
# Remember to shuffle and get 2/3 for training and 1/3 for validation
# Get rid of columns that have empty cells OFFSET TO PROTECT PRIVACY

In [33]:
np.set_printoptions(suppress=True)
# np.set_printoptions(threshold=np.inf)
# np.set_printoptions(linewidth=np.inf)

data_matrix = np.genfromtxt('crime_shortened.csv', delimiter=',', dtype=str)
# data_matrix = np.genfromtxt('crime.csv', delimiter=',', dtype=str)

# Removes first row
data_matrix = np.delete(data_matrix, (0), axis=0)

# Removes Year, Month, Day columns
data_matrix = np.delete(data_matrix, (1, 2, 3), axis=1)

type = []
hundred_block = []
neighborhood = []


for row in range(0, len(data_matrix)):
    for col in range(0, len(data_matrix[0])):
        if data_matrix[row][0] not in type:
            type.append(data_matrix[row][0])

        if data_matrix[row][3] not in hundred_block:
            hundred_block.append(data_matrix[row][3])

        if data_matrix[row][4] not in neighborhood:
            neighborhood.append(data_matrix[row][4])

for row in range(0, len(data_matrix)):
    for col in range(0, len(data_matrix[0])):
        current = data_matrix[row][col]

        if data_matrix[row][col] in type:
            data_matrix[row][col] = type.index(current)
        
        elif data_matrix[row][col] in hundred_block:
            data_matrix[row][col] = hundred_block.index(current)

        elif data_matrix[row][col] in neighborhood:
            data_matrix[row][col] = neighborhood.index(current)
        
        
        if data_matrix[row][col] == '0':
            data_matrix[row][col] = '0.0000001'


data_matrix = data_matrix.astype(float)
print(data_matrix)

# Shuffles observations
np.random.shuffle(data_matrix)

[[      0.0000001       17.               0.0000001  ... 5455338.57
       49.2507732     -123.101053  ]
 [      1.              23.              16.         ... 5458710.94
       49.28113596    -123.0618933 ]
 [      2.              10.              50.         ... 5458476.5
       49.27897786    -123.1233611 ]
 ...
 [      3.              11.               0.0000001  ... 5459036.23
       49.28401453    -123.1216657 ]
 [      5.               0.0000001        0.0000001  ... 5458940.06
       49.28312638    -123.1419082 ]
 [      7.              14.              14.         ...       0.0000001
        0.0000001        0.0000001 ]]


In [34]:
threshold = round(2/3 * len(data_matrix))

# Training gets first 2/3
X_train = data_matrix[0:threshold, :]
#Validation gets remaining
X_valid = data_matrix[threshold:len(data_matrix), :]

print(np.shape(X_train))
print(np.shape(X_valid))

# Y data is removed from the last column
Y_train = X_train[:, 0]
Y_valid = X_valid[:, 0]

X_train = np.delete(X_train, (0), axis=1)
X_valid = np.delete(X_valid, (0), axis=1)

(3333, 9)
(1666, 9)


In [35]:
def myKNN(my_X_train, my_X_valid, my_Y_train, my_Y_valid):
    my_Y_hat = []
    for valid in my_X_valid:
        
        neighbors = []
        neighbors_index = []

        for train in my_X_train:
            # Computes distance for each feature
            subtracted_arr = np.absolute(valid - train)
            # Computes total distance
            distance = np.sum(subtracted_arr)
            # Adds this distance to an array
            neighbors = np.append(neighbors, distance)

        # Gets index of K smallest numbers
        neighbors_index = np.argsort(neighbors)[:K]
        

        # Gets most consistent Y value (while converting float to int)
        temp = []
        for n in neighbors_index:
            temp = np.append(temp, my_Y_train[n])
        
        temp = temp.astype(int)
        counts = np.bincount(temp)
        most_common = np.argmax(counts)
        my_Y_hat = np.append(my_Y_hat, most_common)

    return my_Y_hat

In [36]:
# Prints validation accuracy
score = 0

Y_hat = myKNN(X_train, X_valid, Y_train, Y_valid)

for i in range(0, Y_valid.size):
# If it's equal to X_train, adds score
    if Y_hat[i] == Y_valid[i]:
        score += 1

# Finds accuracy and print
accuracy = score / Y_valid.size * 100
print("The validation accuracy is %.2f percent." %(accuracy))

The validation accuracy is 43.52 percent.


In [37]:
# Prints confusion matrix
confusion_matrix = np.zeros((np.size(Y_hat), np.size(Y_hat)))

Y_valid = Y_valid.astype(int)
Y_hat = Y_hat.astype(int)
confusion_matrix = confusion_matrix.astype(int)

for i in range(0, Y_valid.size):
    h = Y_hat[i] - 1
    t = Y_valid[i] - 1

    confusion_matrix[t][h] += 1

print(confusion_matrix)

[[105   2  28 ...   0   0  14]
 [ 11  38  95 ...   0   0   9]
 [ 43  69 385 ...   0   0  19]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [ 13   7  63 ...   0   0  11]]
