In [2]:
"""
Notebook that takes training and test data and runs the Naive Bayes Classifier to label test data (determine which road
the car is on)
"""

'\nNotebook that takes training and test data and runs the Naive Bayes Classifier to label test data (determine which road\nthe car is on)\n'

In [3]:
from sklearn.naive_bayes import GaussianNB

import matplotlib.pyplot as plt
import numpy as np
import pandas
import dataReader
import csv_parser
from road_features_new import addDistanceFeatures
from road_features_new import coordDistance
from road_features_new import addRegionFeatures

import folium
from folium.plugins import HeatMap

In [4]:
# Configuration cell: load the speed-alert data and set the road parameters and
# coordinate file names used by the cells below.

# Tab-separated speed-alert export read through dataReader.getData.
SPEED_DATA_PATH = "Bristol 3rd year speed limit data.tsv"
# Only the first TEST_SAMPLE_SIZE rows are kept as the data to classify.
TEST_SAMPLE_SIZE = 100

df = dataReader.getData(path=SPEED_DATA_PATH, speedInKmh = False)[:TEST_SAMPLE_SIZE]

# Road widths; not referenced by the cells visible in this notebook.
# NOTE(review): units are presumably metres -- confirm.
A1_width = 14
A6055_width = 8

# Coordinate files describing each road's centre line (passed on to addDistanceFeatures).
A1_north_file = "A1_northbound_coordinates.csv"
A1_south_file = "A1_southbound_coordinates.csv"
A6055_file = "A6055_coordinates.csv"

In [5]:
# Columns removed from a featurised frame before it is handed to the classifier;
# whatever columns remain are used as the model's features.
_NON_FEATURE_COLUMNS = ["SpeedAlertsId", "AlertDateTime", "AlertSpeed", "AlertSpeedLimit", "DateTime", "WGS84Lat", "WGS84Long", "Satellites", "SignalStrength", "IsEastA6055", "IsEastA1N", "IsEastA1S", "ClosestPointOnA1S", "ClosestPointOnA1N", "ClosestPointOnA6055"]


def _read_manual_labels(path):
    """Parse the manual-label file into parallel lists of alert ids and integer labels."""
    ids = []
    labels = []
    # `with` guarantees the file handle is closed even if a row fails to parse.
    with open(path, "r") as label_file:
        for row in label_file:
            stripped = row.strip()
            if not stripped:
                # tolerate blank lines (e.g. a trailing newline at the end of the file)
                continue
            fields = stripped.split("\t")
            ids.append(int(fields[0]))
            # only the first character of the second field is used as the label
            labels.append(int(fields[1][0]))
    return ids, labels


def NBGaussianClassifier(showaccuracy=False, df_name="Bristol 3rd year speed limit data.tsv", training_labels="ManuallyLabelledSpeedAlertIdsCharlieWithA6055BiasPoints.txt",A1_north_file="A1_northbound_coordinates.csv", A1_south_file="A1_southbound_coordinates.csv", A6055_file="A6055_coordinates.csv", test_df=None):
    """
    Train a Gaussian Naive Bayes classifier on manually labelled speed alerts and
    use it to label a second set of alerts.

    The training rows are the alerts whose ids appear in `training_labels`; they are
    loaded from `df_name`, given distance and region features, and stripped of the
    columns in `_NON_FEATURE_COLUMNS`. The test rows go through the same feature steps.
    NOTE(review): the integer labels presumably stand for A1(M) south / A1(M) north /
    A6055 (0 / 1 / 2) -- confirm against the labelling file.

    param: showaccuracy = when truthy, print how many training rows the fitted model
           labels correctly
    param: df_name = path of the speed-alert data the labelled rows are loaded from
    param: training_labels = path of the tab-separated label file (alert id, then label)
    param: A1_north_file, A1_south_file, A6055_file = road coordinate files passed to
           addDistanceFeatures
    param: test_df = data to classify; when None the notebook-level `df` is used
           NOTE(review): addDistanceFeatures may add columns to this frame in place -- confirm.

    returns: tuple of
        y_pred_real              - predicted labels for the test rows
        ytrue                    - manual labels of the training rows
        y_confidence             - predict_proba output for the training rows
        y_confidence_label_pairs - list of (predicted label, probabilities) per training row
        df_training_dist         - training frame with the added feature columns
        df_testing_dist          - test frame with the added feature columns
    """
    #extract data from manual data collection file
    labelledIds, labels = _read_manual_labels(training_labels)

    #generate training data features (honours df_name instead of a hard-coded path)
    df_training = dataReader.getData(path=df_name, speedAlertIds=labelledIds, speedInKmh = False)

    #align the manual labels with the row order of the loaded training data
    IDlabelmap = dict(zip(labelledIds, labels))
    ytrue = np.array([IDlabelmap[alert_id] for alert_id in df_training['SpeedAlertsId']])
    print("afterIDlabelmap" + str(ytrue))
    print("length: " + str(len(ytrue)))
    print(ytrue)

    df_training_dist = addDistanceFeatures(df_training, A1N_file=A1_north_file, A1S_file = A1_south_file, A6055_file = A6055_file)
    df_training_dist = addRegionFeatures(df_training_dist)

    #format data for classifier
    df_numpy_training_dist = df_training_dist.drop(columns=_NON_FEATURE_COLUMNS).to_numpy()

    print(df_numpy_training_dist)

    #create test data
    df_test = df if test_df is None else test_df
    df_testing_dist = addDistanceFeatures(df_test, A1N_file=A1_north_file, A1S_file = A1_south_file, A6055_file = A6055_file)
    df_testing_dist = addRegionFeatures(df_testing_dist)
    #drop from the featurised frame so this does not rely on the helpers mutating df_test
    df_numpy_testing_dist = df_testing_dist.drop(columns=_NON_FEATURE_COLUMNS).to_numpy()

    #train model once on the training dataset, then predict both sets
    gnb = GaussianNB()
    gnb.fit(df_numpy_training_dist, ytrue)
    y_pred_real = gnb.predict(df_numpy_testing_dist)
    y_pred = gnb.predict(df_numpy_training_dist)

    #show accuracy on the training data (every sample is counted, including the last)
    if showaccuracy:
        count = sum(1 for predicted, actual in zip(y_pred, ytrue) if predicted == actual)
        print("number of correct guesses: " + str(count) + " out of " + str(len(y_pred)))

    #show certainty of choice (for the training rows)
    y_confidence = gnb.predict_proba(df_numpy_training_dist)
    y_confidence_label_pairs = list(zip(y_pred, y_confidence))

    return y_pred_real, ytrue, y_confidence, y_confidence_label_pairs, df_training_dist, df_testing_dist

In [6]:
# Train on the manually labelled alerts and classify the rows held in `df`.
# NOTE: `y_pred` holds the predictions for the test rows, whereas `ytrue`, `y_confidence`
# and `y_confidence_label_pairs` all describe the training rows.
y_pred, ytrue, y_confidence, y_confidence_label_pairs, df_training_dist, df_testing_dist = NBGaussianClassifier(showaccuracy=True, training_labels="ManuallyLabelledSpeedAlertIdsCharlieWithA6055BiasPoints.txt")

# (lat, long) pair for every classified test row, in the same order as y_pred
coords = list(zip(df_testing_dist['WGS84Lat'].tolist(), df_testing_dist['WGS84Long'].tolist()))
coords = np.array(coords)

# persist the class probabilities (these belong to the training rows, see note above)
np.savetxt('testManual', y_confidence)

folium_map = folium.Map(location=[54.239084, -1.497210], zoom_start=11)


#used to print test data, showing correctly and incorrectly labelled data
#(correctly labelled points in green, incorrectly labelled points in red)
#coords_set = list(zip(coords, y_pred, ytrue))

# for c in coords_set:
#     if c[1] == c[2]:
#         folium.CircleMarker(location=c[0],fill=True,radius=2,color='green').add_to(folium_map)
#     elif c[1] != c[2]:
#         folium.CircleMarker(location=c[0],fill=True,radius=2,color='red').add_to(folium_map)


#used to print actual dataset, showing roads allocation (through the colours)
# Marker colour per predicted label. Colours must be valid CSS names, so the third one
# is 'darkgreen' (no space); any other label is left off the map.
label_colours = {0: 'red', 1: 'blue', 2: 'darkgreen'}
coords_set = list(zip(coords, y_pred))
for location, label in coords_set:
    colour = label_colours.get(int(label))
    if colour is not None:
        folium.CircleMarker(location=location,fill=True,radius=2,color=colour).add_to(folium_map)

# number of test rows assigned label 2 (the dark green markers)
green_counter = sum(1 for label in y_pred if label == 2)

print(green_counter)
#folium_map

# #plot confidence histogram
# A1MSouthConfidences = []
# A1MNorthConfidences = []
# A6055Confidences = []
# A1MS = []

# for row in y_confidence_label_pairs:
#     if row[1][0] > row[1][1] and row[1][0] > row[1][2] and row[1][0] < 0.99:
#         A1MSouthConfidences.append(row[1][0])
#     elif row[1][1] > row[1][2] and row[1][1] > row[1][0] and row[1][1] < 0.99:
#         A1MNorthConfidences.append(row[1][1])
#     elif row[1][2] > row[1][0] and row[1][2] > row[1][1] and row[1][2] < 0.99:
#         A6055Confidences.append(row[1][2])
#     #else:
#         #print("error, datapoint fits in no classification")

# print(A1MSouthConfidences)

# plt.figure(figsize=(8,6))
# plt.hist(A1MSouthConfidences, bins=200, alpha=0.5, label="A1MSouthConfidences")
# plt.hist(A1MNorthConfidences, bins=200, alpha=0.5, label="A1MNorthConfidences")
# plt.hist(A6055Confidences, bins=200, alpha=0.5, label="A6055Confidences")

# plt.xlabel("Probability", size=14)
# plt.ylabel("Count", size=14)
# plt.title("Value of best confidence for each datapoint")
# plt.legend(loc='upper right')
# plt.savefig("SpeedingClassificationConfidences10000points25-11-20at1505.png")

afterIDlabelmap[0 0 0 ... 2 2 2]
length: 1320
[0 0 0 ... 2 2 2]


  parameterT = (point[0]-lineseg[0][0]) / (lineseg[1][0] - lineseg[0][0])
  closestPointOnLineSeg = lineseg[0] +  (lineseg[1] - lineseg[0]) * parameterT


[[ 97.555247   150.          16.87424873   2.10585013  61.30866597
    4.        ]
 [ 97.555247   150.          16.87424873   2.10585013  61.30866597
    4.        ]
 [ 97.555247   150.          16.87424873   2.10585013  61.30866597
    4.        ]
 ...
 [ 47.224196   310.          49.87004998  62.03964399   9.4806792
    0.        ]
 [ 67.108068   335.          48.9215679   67.89178498   0.70385232
    1.        ]
 [ 68.35081    335.          37.19266147  57.75430801   2.16482037
    1.        ]]
1319
number of correct guesses: 1284 out of 1320
1
