In [21]:
import os
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import datetime
import sqlite3
import sys
import traceback
import numpy as np
import Data.database_handler as dbHandler
from torchvision import transforms, utils
import datetime as dt
sys.path.append('..')
#%run Map_grid/map.ipynb import CalculateGrid

#Connecting to the SQLite database
dataAmount = 1500000
dbPath = r'Data\datasetNY.db'
gridSize = 5
chunkAmount = 225555
chunkSize = dataAmount / chunkAmount
data = dbHandler.get_n_data_datetime_converted(dbPath, dataAmount)

class AccidentDataset(Dataset):
    def __init__(self, transform=None):
        self.coordinates = data
        self.coordinates = pd.DataFrame(self.coordinates, columns=['datetime', 'latitude', 'longitude'])
        
        #split into 500 chunks using numpy
        self.coordinates = np.array_split(self.coordinates, chunkAmount)

        #process each chunk and merge it back into one dataframe
        self.grids = []
        grid_lower_lat, grid_lower_long = 40.54, -74.15
        grid_upper_lat, grid_upper_long = 40.91, -73.70
        grid_lat_step = (grid_upper_lat - grid_lower_lat) / gridSize
        grid_long_step = (grid_upper_long - grid_lower_long) / gridSize
        for i in range(len(self.coordinates)-1):
            grid = np.zeros((gridSize, gridSize))
            for index, row in self.coordinates[i].iterrows():
                coordinates = row['latitude'], row['longitude']
                for j in range(gridSize):
                    for k in range(gridSize):
                        lat_lower = grid_lower_lat + j * grid_lat_step
                        lat_upper = grid_lower_lat + (j + 1) * grid_lat_step
                        long_lower = grid_lower_long + k * grid_long_step
                        long_upper = grid_lower_long + (k + 1) * grid_long_step
                        if lat_lower <= float(coordinates[0]) < lat_upper and long_lower <= float(coordinates[1]) < long_upper:
                            grid[j][k] += 1
                            break
            self.grids.append(grid/chunkSize)
        self.grids = np.array(self.grids)
        self.transform = transform      

    def __len__(self):
        return len(self.grids)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        grid = self.grids[idx]
        grid = torch.from_numpy(grid).float()

        max_index = np.argmax(grid)
        max_index = np.array(max_index)
        #get indicies of the highest value and nearest neighbours that are within 10% of the highest value
        x_and_y = max_index.item() // gridSize, max_index.item() % gridSize
        max_value = grid[x_and_y[0]][x_and_y[1]].item()
        amount = 0
        indicies = np.empty((0,2), int)
        for i in range(gridSize):
            for j in range(gridSize):
                if grid[i][j].item() >= max_value * 0.9:
                    indicies = np.append(indicies, i * gridSize + j)
                    amount += 1

        indicies = torch.from_numpy(indicies).reshape(-1, 1).flatten().long()
        indicies = torch.nn.functional.pad(indicies, (0, gridSize - len(indicies)), value=0)
        #get amount of indicies as a tensor
        indicies_amount = torch.tensor(amount-1).long()
        return grid.flatten(), torch.tensor(max_index.item()).long()

accident_dataset = AccidentDataset()

[1383554700.0, '40.8262031', '-73.8572259']


In [26]:

#Create new array with 60% of the data
train_size = int(0.6 * len(accident_dataset))
test_size = len(accident_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(accident_dataset, [train_size, test_size])

print(len(train_dataset))
print(len(test_dataset))
print(len(accident_dataset))

#Create dataloader
train_dataloader = DataLoader(train_dataset, 64)
test_dataloader = DataLoader(test_dataset, 64)

# define the class for multilinear regression
class Network(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(0.2)
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(gridSize ** 2, 25),
            nn.ReLU(),
            nn.Linear(25, 25),
            nn.ReLU(),
            nn.Linear(25, gridSize ** 2),
        )

    def forward(self, x):
        #x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


# define the class for multilinear regression
# building the model object
#device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
device = torch.device('cpu')
print(f'Using {device} device')

model = Network().to(device)
if os.path.exists("model.pth"):
    model.load_state_dict(torch.load("model.pth"))
    print("Loaded model from model.pth")
else:
    print("No model found, creating new model")

# define the loss function
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# define the training loop
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    print(size)
    model.train()
    print("Training model")
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        #print('X ', X)
        #print('y ', y)
        #print (X.shape)
        pred = model(X)
        #print('pred ', pred)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
    print("Finished training model")

def test(dataloader, model, loss_fn):
    print("Testing model")
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct, alsocorrect = 0, 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            #print('y ', y)
            #print('predition', pred.argmax(1))

            #check if prediction is correct
            predictions = torch.topk(pred, 5, dim=1).indices
            #is_correct = (pred.argmax(1) == y or pred.argmax(1) == max_value)

            for i in range (len(predictions)):
                if y[i] in predictions[i]:
                    if y[i] == pred.argmax(1)[i]:
                        correct += 1
                    else:
                        alsocorrect += 1

            #correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            #print(correct)
    test_loss /= num_batches
    print(f"Main correct: {correct}  size: {size}  main correct/size: {correct/size}")
    print(f"Also correct: {alsocorrect}  size: {size}  also correct/size: {alsocorrect/size}")
    correct += alsocorrect
    correct /= size
    print(f"Test Error: Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f}")

epochs = 1
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)

torch.save(model.state_dict(), "model.pth")
print("Saved PyTorch Model State to model.pth")

model.eval()
x, y = test_dataset[0][0], test_dataset[0][1]
#x = [1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
#x = torch.tensor(x).float()
#y = 0
with torch.no_grad():
    pred = model(x.to(device))
    print(pred)
    predicted, actual = pred.topk(gridSize), y
    max_value = pred.max(0)[0]
    indekusu = []
    for i in range(len(predicted)):
        if predicted.values[i].item() >= 0.8 * max_value:
            indekusu.append(predicted.indices[i].item())
    print(f'Predicted: "{indekusu}", Actual: "{actual}"')

135332
90222
225554
Using cpu device
Loaded model from model.pth
Epoch 1
-------------------------------
135332
Training model
loss: 1.325452  [   64/135332]
loss: 1.025100  [ 6464/135332]
loss: 1.270385  [12864/135332]
loss: 1.182342  [19264/135332]
loss: 1.137167  [25664/135332]
loss: 1.296766  [32064/135332]
loss: 1.098794  [38464/135332]
loss: 1.272051  [44864/135332]
loss: 1.031183  [51264/135332]
loss: 1.146062  [57664/135332]
loss: 1.523631  [64064/135332]
loss: 1.222232  [70464/135332]
loss: 1.287904  [76864/135332]
loss: 0.849469  [83264/135332]
loss: 1.245970  [89664/135332]
loss: 1.312216  [96064/135332]
loss: 1.142385  [102464/135332]
loss: 1.077500  [108864/135332]
loss: 1.104374  [115264/135332]
loss: 0.974218  [121664/135332]
loss: 1.147626  [128064/135332]
loss: 1.412171  [134464/135332]
Finished training model
Testing model
Main correct: 60185  size: 90222  main correct/size: 0.6670767661989315
Also correct: 15996  size: 90222  also correct/size: 0.1772960031921261
Tes