In [1]:
"""
Notebook that takes training and test data and runs the Naive Bayes Classifier to label test data (determine which road
the car is on)
"""

'\nNotebook that takes training and test data and runs the Naive Bayes Classifier to label test data (determine which road\nthe car is on)\n'

In [2]:
from sklearn.naive_bayes import GaussianNB

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas
import dataReader
import csv_parser
from road_features_no_gpscoords import addDistanceFeatures
from road_features_no_gpscoords import coordDistance
from road_features_v2Regions import addRegionFeaturesV2
from road_features_no_gpscoords import addRegionFeatures
from road_features_v2Regions import addDistanceFeaturesV2

import folium
from folium.plugins import HeatMap

In [3]:
"""
import required data & set coordinate file names & parameters
"""
# Coordinate files for each of the three candidate roads.
A6055_file = "A6055_coordinates.csv"
A1_south_file = "A1_southbound_coordinates.csv"
A1_north_file = "A1_northbound_coordinates.csv"

# Road widths (units are not stated in this notebook - presumably metres; TODO confirm).
A1_width, A6055_width = 14, 8

# Alternative longitude/latitude bounding boxes, kept for reference (currently unused):
#   RAF Leeming:          longlatBox=[-1.573801, -1.530819, 54.276656, 54.311298]
#   near Burneston:       longlatBox=[-1.524019, -1.508999, 54.252225, 54.269857]
#   lime lane roundabout: longlatBox=[-1.491493, -1.482158, 54.218319, 54.230049]

# Full speed-alert data set, read with speedInKmh disabled.
df = dataReader.getData(
    speedInKmh=False,
    path="Bristol 3rd year speed limit data.tsv",
)

In [4]:
"""
generate data set
"""
def data_generation(seed=None):
    """
    Generate a synthetic, labelled training set of speeds and road-distance features.

    Every feature is drawn from a normal distribution whose mean depends on the road the
    sample is generated for. The returned array has one row per sample and five columns:

        0   speed (means 68 / 50; the variable names suggest mph)
        1   distance feature generated from the A1MN_to_* means
        2   distance feature generated from the A1MS_to_* means
        3   distance feature generated from the A6055_to_* means
        4   label: 1 = first half of the A1M rows (generated with the *_to_A1MN means),
                   0 = second half of the A1M rows (generated with the *_to_A1MS means),
                   2 = A6055 rows

    The distance means are signed; their unit and sign convention are not visible in this
    notebook (presumably metres - TODO confirm against the road feature modules).

    param: seed = optional integer seed. When given, a private RandomState is used so the
                  output is reproducible; when None (default) the global numpy random
                  state is used, exactly as before.
    return: numpy array of shape (A1M_points + A6055_points, 5)
    """
    seventy_mph_mu = 68
    seventy_mph_sigma = 9.785

    sixty_mph_mu = 50
    sixty_mph_sigma = 8.6

    distance_sigma = 30

    A6055_to_A1MN_mu = 61.41406461739851
    A6055_to_A1MS_mu = 70.24264523614933
    A6055_to_A6055_mu = 0

    A1MN_to_A1MN_mu = 0
    A1MN_to_A1MS_mu = 21.623846460850338
    A1MN_to_A6055_mu = -44.808830312711464

    A1MS_to_A1MN_mu = -26.457035644259744
    A1MS_to_A1MS_mu = 0
    A1MS_to_A6055_mu = -67.69968916116729

    A1M_points = 34459*10
    A6055_points = 4047*10

    # np.random and RandomState both expose .normal, so one code path serves both cases.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # The A1M rows are split in two contiguous halves. The second half starts exactly at
    # `half` (not half + 1) so that no row is left with all-zero distance features.
    half = A1M_points // 2
    second_half_count = A1M_points - half
    total_points = A1M_points + A6055_points

    data = np.zeros((total_points, 5))

    # speed for every A1M row
    data[0:A1M_points, 0] = rng.normal(seventy_mph_mu, seventy_mph_sigma, A1M_points)

    # second half of the A1M rows: *_to_A1MS means, label 0
    data[half:A1M_points, 1] = rng.normal(A1MN_to_A1MS_mu, distance_sigma, second_half_count)
    data[half:A1M_points, 2] = rng.normal(A1MS_to_A1MS_mu, distance_sigma, second_half_count)
    data[half:A1M_points, 3] = rng.normal(A6055_to_A1MS_mu, distance_sigma, second_half_count)
    data[half:A1M_points, 4] = 0

    # first half of the A1M rows: *_to_A1MN means, label 1
    data[0:half, 1] = rng.normal(A1MN_to_A1MN_mu, distance_sigma, half)
    data[0:half, 2] = rng.normal(A1MS_to_A1MN_mu, distance_sigma, half)
    data[0:half, 3] = rng.normal(A6055_to_A1MN_mu, distance_sigma, half)
    data[0:half, 4] = 1

    # A6055 rows: slower speed distribution, *_to_A6055 means, label 2
    data[A1M_points:total_points, 0] = rng.normal(sixty_mph_mu, sixty_mph_sigma, A6055_points)
    data[A1M_points:total_points, 1] = rng.normal(A1MN_to_A6055_mu, distance_sigma, A6055_points)
    data[A1M_points:total_points, 2] = rng.normal(A1MS_to_A6055_mu, distance_sigma, A6055_points)
    data[A1M_points:total_points, 3] = rng.normal(A6055_to_A6055_mu, distance_sigma, A6055_points)
    data[A1M_points:total_points, 4] = 2

    return data

In [14]:
"""
Splits the pandas dataframe speeding data from df and creates training and testing numpy arrays. Labels the test data as on
the A1(M) south, A1(M) north or A6055
here training data is from data_generation

param: training_set = labelled dataframe built from the output of data_generation
param: df_name = name of the speeding data file for labelling
"""
def NBGaussianClassifier(training_set, showaccuracy=False, quality_testing_labels="ManuallyLabelledSpeedAlertIdsCharlieWithA6055BiasPoints.txt", df_name="Bristol 3rd year speed limit data.tsv", A1_north_file="A1_northbound_coordinates.csv", A1_south_file="A1_southbound_coordinates.csv", A6055_file="A6055_coordinates.csv"):
    """
    Train a Gaussian Naive Bayes model on `training_set` and label the manually labelled
    speed alerts with it.

    param: training_set = dataframe with a "PredictedLabel" column (the class) and one
                          column per feature; every other column is used for training
    param: showaccuracy = when truthy, print how many predictions match the manual labels
    param: quality_testing_labels = path of a tab separated text file; the first field of
                          each line is a speed alert id and the first character of the
                          second field is its manual label. Blank lines are ignored.
    param: df_name = path of the speed alert data file passed to dataReader.getData
    param: A1_north_file, A1_south_file, A6055_file = coordinate files passed on to
                          addDistanceFeaturesV2

    return: tuple of
        y_pred                    - predicted label for every test row
        ytrue                     - manual labels, in label-file order
        y_confidence              - predict_proba output for every test row
        y_confidence_labels       - "SpeedAlertsId" column of the test data as a list
        training_dists_and_speeds - the training features (training_set without the label)
        df_testing_with_features  - test dataframe including the distance features
        df_numpy_testing_dist     - numpy feature matrix the model predicted on
    """
    # Read the manually labelled ids. The file is closed as soon as it has been read.
    qt_labels = []
    qt_labelledIds = []
    with open(quality_testing_labels, "r") as label_file:
        for row in label_file:
            if not row.strip():
                # tolerate blank lines (e.g. a trailing newline at the end of the file)
                continue
            fields = row.strip().split("\t")
            qt_labelledIds.append(int(fields[0]))
            qt_labels.append(int(fields[1][0]))

    # Build the test set from the manually labelled alerts. df_name is honoured here; its
    # default is the path that used to be hard-coded, so existing callers are unaffected.
    # NOTE(review): ytrue is in label-file order while the rows come back in whatever order
    # dataReader.getData returns them - confirm the two orders (and lengths) agree.
    quality_testing_df_full = dataReader.getData(path=df_name, speedAlertIds=qt_labelledIds, speedInKmh=False)
    quality_testing_df_with_features = addDistanceFeaturesV2(quality_testing_df_full, A1N_file=A1_north_file, A1S_file=A1_south_file, A6055_file=A6055_file)
    # Region features (addRegionFeaturesV2) are currently not applied to either data set.
    non_feature_columns = ["Heading", "SpeedAlertsId", "AlertDateTime", "AlertSpeed", "AlertSpeedLimit", "DateTime", "WGS84Lat", "WGS84Long", "Satellites", "SignalStrength", "ClosestPointOnA1S", "ClosestPointOnA1N", "ClosestPointOnA6055"]
    # NOTE(review): the model sees these columns positionally - verify the remaining
    # columns are in the same order as the training feature columns.
    quality_testing_df = quality_testing_df_with_features.drop(columns=non_feature_columns).to_numpy()

    # extract training data and labels
    training_labels = training_set["PredictedLabel"]
    training_dists_and_speeds = training_set.loc[:, training_set.columns != 'PredictedLabel']

    print("Training dist and speeds (model trained on this set)")
    print(training_dists_and_speeds)
    print("Training dist and speeds labels")
    print(training_labels)

    # Train once, then predict; fitting a second time on the same data gave the same model.
    gnb = GaussianNB()
    gnb.fit(training_dists_and_speeds, training_labels)
    y_pred = gnb.predict(quality_testing_df)
    ytrue = qt_labels

    # show accuracy of the predictions against the manual labels
    if showaccuracy:
        length = len(y_pred)
        print(length)
        count = sum(int(y_pred[index] == ytrue[index]) for index in range(length))
        print("number of correct guesses: " + str(count) + " out of " + str(length))

    # show certainty of choice
    y_confidence = gnb.predict_proba(quality_testing_df)
    y_confidence_labels = quality_testing_df_full["SpeedAlertsId"].tolist()

    return y_pred, ytrue, y_confidence, y_confidence_labels, training_dists_and_speeds, quality_testing_df_with_features, quality_testing_df

In [6]:
# Wrap the generated array in a dataframe: name the five generated columns in the order
# data_generation produces them, then move the label column to the front.
df_training_data = pandas.DataFrame(
    data_generation(),
    columns=["Speed", "DistanceA1N", "DistanceA1S", "DistanceA6055", "PredictedLabel"],
)[["PredictedLabel", "Speed", "DistanceA1N", "DistanceA1S", "DistanceA6055"]]
print(df_training_data)

        PredictedLabel      Speed  DistanceA1N  DistanceA1S  DistanceA6055
0                  1.0  59.147692   -41.687312   -15.615365      61.432836
1                  1.0  78.719525     5.881987   -15.719499      66.235575
2                  1.0  56.880788     8.076844   -83.495740      65.736724
3                  1.0  76.860112   -32.430789   -36.705509      61.080352
4                  1.0  84.549705   -44.454475   -37.355693      34.150296
...                ...        ...          ...          ...            ...
385055             2.0  45.953449   -34.938186   -55.158710     -35.882110
385056             2.0  39.837155   -67.940122   -48.291526     -41.896326
385057             2.0  57.528710   -60.087642   -81.393684     -14.507037
385058             2.0  53.227934   -49.346684   -51.160572      11.689950
385059             2.0  52.978516   -24.145232   -59.404772      76.595546

[385060 rows x 5 columns]


In [15]:
# Train on the generated data and classify the manually labelled test points.
(
    y_pred,
    ytrue,
    y_confidence,
    y_confidence_labels,
    df_training_dist,
    df_testing_with_features,
    df_numpy_testing_dist,
) = NBGaussianClassifier(training_set=df_training_data, showaccuracy=True)

# One (lat, long) row per test point; used by the (commented-out) map plotting below,
# where correctly labelled points are drawn in green and incorrectly labelled ones in red.
coords = np.array(
    list(
        zip(
            df_testing_with_features['WGS84Lat'].tolist(),
            df_testing_with_features['WGS84Long'].tolist(),
        )
    )
)

# Persist the predictions, the class probabilities and the matching speed alert ids.
np.savetxt(fname='GeneratedDataNoRegionsConfidenceOnTestingDatay_pred', X=y_pred)
np.savetxt(fname='GeneratedDataNoRegionsConfidenceOnTestingDatay_confidence', X=y_confidence)
np.savetxt(fname='GeneratedDataNoRegionsConfidenceOnTestingDatay_confidence_labels', X=y_confidence_labels)
# np.savetxt('testGeneratedRegionsdf_training_dist', df_training_dist)
# np.savetxt('testGeneratedRegionsdf_numpy_testing_dist', df_numpy_testing_dist)

#folium_map = folium.Map(location=[54.239084, -1.497210], zoom_start=11)###########


#used to print test data, showing correctly and incorrectly labelled data
# coords_set = list(zip(coords, y_pred, ytrue))

# for c in coords_set:
#     if c[1] == c[2]:
#         folium.CircleMarker(location=c[0],fill=True,radius=2,color='green').add_to(folium_map)
#     elif c[1] != c[2]:
#         folium.CircleMarker(location=c[0],fill=True,radius=2,color='red').add_to(folium_map)


#used to print actual dataset, showing roads allocation (through the colours)
# coords_set = list(zip(coords, y_pred))##############
# green_counter = 0
# for c in coords_set:
#     if c[1] == 0:
#         folium.CircleMarker(location=c[0],fill=True,radius=2,color='red').add_to(folium_map)
#     elif c[1] == 1:
#         folium.CircleMarker(location=c[0],fill=True,radius=2,color='blue').add_to(folium_map)
#     elif c[1] == 2:
#         folium.CircleMarker(location=c[0],fill=True,radius=2,color='green').add_to(folium_map)
#         green_counter += 1

# print(green_counter)
#folium_map#################

# #plot confidence histogram
# A1MSouthConfidences = []
# A1MNorthConfidences = []
# A6055Confidences = []
# A1MS = []

# for row in y_confidence_label_pairs:
#     if row[1][0] > row[1][1] and row[1][0] > row[1][2] and row[1][0] < 0.99:
#         A1MSouthConfidences.append(row[1][0])
#     elif row[1][1] > row[1][2] and row[1][1] > row[1][0] and row[1][1] < 0.99:
#         A1MNorthConfidences.append(row[1][1])
#     elif row[1][2] > row[1][0] and row[1][2] > row[1][1] and row[1][2] < 0.99:
#         A1MSouthConfidences.append(row[1][2])
#     #else:
#         #print("error, datapoint fits in no classification")

# #print(A1MSouthConfidences)

# plt.figure(figsize=(8,6))
# plt.hist(A1MSouthConfidences, bins=200, alpha=0.5, label="A1MSouthConfidences", color='red')
# plt.hist(A1MNorthConfidences, bins=200, alpha=0.5, label="A1MNorthConfidences", color='blue')
# plt.hist(A6055Confidences, bins=200, alpha=0.5, label="A6055Confidences", color='darkgreen')

# plt.xlabel("Probability", size=14)
# plt.ylabel("Count", size=14)
# plt.title("Value of best confidence for each datapoint")
# plt.legend(loc='upper right')
# plt.savefig("SpeedingClassificationConfidences10000points25-11-20at1505.png")

  parameterT = (point[0] - lineseg[0][0]) / (lineseg[1][0] - lineseg[0][0])
  closestPointOnLineSeg = lineseg[0] + (lineseg[1] - lineseg[0]) * parameterT


Training dist and speeds (model trained on this set)
            Speed  DistanceA1N  DistanceA1S  DistanceA6055
0       59.147692   -41.687312   -15.615365      61.432836
1       78.719525     5.881987   -15.719499      66.235575
2       56.880788     8.076844   -83.495740      65.736724
3       76.860112   -32.430789   -36.705509      61.080352
4       84.549705   -44.454475   -37.355693      34.150296
...           ...          ...          ...            ...
385055  45.953449   -34.938186   -55.158710     -35.882110
385056  39.837155   -67.940122   -48.291526     -41.896326
385057  57.528710   -60.087642   -81.393684     -14.507037
385058  53.227934   -49.346684   -51.160572      11.689950
385059  52.978516   -24.145232   -59.404772      76.595546

[385060 rows x 4 columns]
Training dist and speeds labels
0         1.0
1         1.0
2         1.0
3         1.0
4         1.0
         ... 
385055    2.0
385056    2.0
385057    2.0
385058    2.0
385059    2.0
Name: PredictedLabel, Leng