In [3]:
import os
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import datetime
import sqlite3
import sys
import traceback
import numpy as np
import Data.database_handler as dbHandler
from torchvision import transforms, utils
import datetime as dt
sys.path.append('..')
#%run Map_grid/map.ipynb import CalculateGrid

#Connecting to the SQLite database
dataAmount = 1700000
dbPath = r'Data\datasetNY.db'
gridSize = 5
chunkAmount = 17000
chunkSize = dataAmount / chunkAmount
data = dbHandler.get_n_data_datetime_converted(dbPath, dataAmount)

class AccidentDataset(Dataset):
    def __init__(self, transform=None):
        self.coordinates = data
        self.coordinates = pd.DataFrame(self.coordinates, columns=['datetime', 'latitude', 'longitude'])
        
        #split into 500 chunks using numpy
        self.coordinates = np.array_split(self.coordinates, chunkAmount)

        #process each chunk and merge it back into one dataframe
        self.grids = []
        grid_lower_lat, grid_lower_long = 40.54, -74.15
        grid_upper_lat, grid_upper_long = 40.91, -73.70
        grid_lat_step = (grid_upper_lat - grid_lower_lat) / gridSize
        grid_long_step = (grid_upper_long - grid_lower_long) / gridSize
        for i in range(len(self.coordinates)-1):
            grid = np.zeros((gridSize, gridSize))
            for index, row in self.coordinates[i].iterrows():
                coordinates = row['latitude'], row['longitude']
                for j in range(gridSize):
                    for k in range(gridSize):
                        lat_lower = grid_lower_lat + j * grid_lat_step
                        lat_upper = grid_lower_lat + (j + 1) * grid_lat_step
                        long_lower = grid_lower_long + k * grid_long_step
                        long_upper = grid_lower_long + (k + 1) * grid_long_step
                        if lat_lower <= float(coordinates[0]) < lat_upper and long_lower <= float(coordinates[1]) < long_upper:
                            grid[j][k] += 1
                            break
            self.grids.append(grid/chunkSize)
        self.grids = np.array(self.grids)
        self.transform = transform      

    def __len__(self):
        return len(self.grids)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        grid = self.grids[idx]
        grid = torch.from_numpy(grid).float()

        max_index = np.argmax(grid)
        max_index = np.array(max_index)
        #get indicies of the highest value and nearest neighbours that are within 10% of the highest value
        Jonasu_wa_kanji_o_motte_imasu = max_index.item() // gridSize, max_index.item() % gridSize
        max_value = grid[Jonasu_wa_kanji_o_motte_imasu[0]][Jonasu_wa_kanji_o_motte_imasu[1]].item()
        amount = 0
        indicies = np.empty((0,2), int)
        for i in range(gridSize):
            for j in range(gridSize):
                if grid[i][j].item() >= max_value * 0.9:
                    indicies = np.append(indicies, i * gridSize + j)
                    amount += 1

        indicies = torch.from_numpy(indicies).reshape(-1, 1).flatten().long()
        indicies = torch.nn.functional.pad(indicies, (0, gridSize - len(indicies)), value=0)
        #get amount of indicies as a tensor
        indicies_amount = torch.tensor(amount-1).long()
        return grid, indicies_amount

accident_dataset = AccidentDataset()

[1468929000.0, '40.75126', '-73.94227']


In [8]:

#Create new array with 60% of the data
train_size = int(0.6 * len(accident_dataset))
test_size = len(accident_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(accident_dataset, [train_size, test_size])

print(len(train_dataset))
print(len(test_dataset))
print(len(accident_dataset))

#Create dataloader
train_dataloader = DataLoader(train_dataset, batch_size=gridSize*gridSize, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=gridSize*gridSize, shuffle=False)

# define the class for multilinear regression
class Network(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(gridSize**2, 20),
            nn.ReLU(),
            nn.Linear(20, 15),
            nn.ReLU(),
            nn.Linear(15, gridSize*2),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


# define the class for multilinear regression
# building the model object
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
print(f'Using {device} device')

model = Network().to(device)
if os.path.exists("model.pth"):
    model.load_state_dict(torch.load("model.pth"))
    print("Loaded model from model.pth")
else:
    print("No model found, creating new model")

# define the loss function
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# define the training loop
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    print(size)
    model.train()
    print("Training model")
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        #print(X)
        #print(y)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
    print("Finished training model")

def test(dataloader, model, loss_fn):
    print("Testing model")
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            print('y ', y)
            print('predition', pred.argmax(1))
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            #print(correct)
    test_loss /= num_batches
    print(f"correct: {correct}  size: {size}  correct/size: {correct/size}")
    correct /= size
    print(f"Test Error: Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f}")

epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)

torch.save(model.state_dict(), "model.pth")
print("Saved PyTorch Model State to model.pth")

10199
6800
16999
Using cpu device
Loaded model from model.pth
Epoch 1
-------------------------------
10199
Training model
loss: 1.860920  [   25/10199]
loss: 1.668528  [ 2525/10199]
loss: 1.607720  [ 5025/10199]
loss: 1.742442  [ 7525/10199]
loss: 1.582489  [10025/10199]
Finished training model
Testing model
y  tensor([0, 1, 1, 2, 1, 2, 1, 1, 0, 1, 1, 0, 2, 0, 0, 0, 0, 0, 2, 3, 0, 0, 2, 0,
        0])
predition tensor([[ 1.0498,  0.2125, -0.1544, -0.0358, -0.1729, -0.2139, -0.3349, -0.3407,
         -0.4455, -0.3506],
        [ 1.0494,  0.2117, -0.1529, -0.0381, -0.1737, -0.2133, -0.3338, -0.3402,
         -0.4449, -0.3523],
        [ 1.0499,  0.2116, -0.1529, -0.0382, -0.1731, -0.2145, -0.3337, -0.3404,
         -0.4456, -0.3520],
        [ 1.0495,  0.2112, -0.1530, -0.0383, -0.1743, -0.2130, -0.3333, -0.3395,
         -0.4446, -0.3531],
        [ 1.0499,  0.2131, -0.1564, -0.0329, -0.1722, -0.2139, -0.3365, -0.3407,
         -0.4462, -0.3492],
        [ 1.0497,  0.2116, -0.1534, -0.