In [33]:
import pandas as pd
import numpy as np
import seaborn
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans

## Reading all tracks files as one dataframe   

In [2]:
tracksPath = "C:\\Users\\hp\\Desktop\\AI\\roundD\\00_tracks.csv"
tracks = pd.read_csv(tracksPath)
#Give an initial value to "counter" to be able to set a unique id for each track   
counter = tracks["trackId"].max() + 1
for i in range(24):
    if i == 0:
        continue         
    if i < 10:                                                 
        tracksPath = tracksPath.replace('0'+str(i-1), '0'+str(i))
    if i >= 10: 
        tracksPath = tracksPath.replace(str(i-1), str(i))
    if i == 10: 
        tracksPath = tracksPath.replace('0'+str(i), str(i))
    toJoinFile = pd.read_csv(tracksPath)
    #Give a unique ID for each track who has a previously used  one                    
    for j in range(toJoinFile["trackId"].max() + 1):
        toJoinFile["trackId"].replace({j : counter}, inplace = True)        
        counter = counter + 1
    tracks = pd.concat([tracks, toJoinFile], axis = 0)

## Standard deviation of speed (DV1)

In [3]:
DV1 = tracks.groupby(["trackId"], sort=False)['xVelocity'].std()
DV1 = pd.DataFrame(DV1)
DV1.rename(columns = {'xVelocity':'DV1'}, inplace = True)

### Initiate measures, a dataframe that will contain the volatility measures for each driver where each trackId equals to its index on the dataFrame

In [4]:
trackID = pd.DataFrame(columns = {"trackId"}) 
trackID["trackId"] = tracks["trackId"].unique()
measures = trackID.copy()                                    
measures = pd.concat([measures, DV1], axis = 1)

## Standard deviation of longitudinal deceleration or acceleration (DV2)

In [5]:
lonAcce = tracks.groupby(["trackId"], sort=False)['lonAcceleration'].std()
DV2 = pd.DataFrame(lonAcce)
DV2.rename(columns = {'lonAcceleration':'DV2'}, inplace = True)
measures = pd.concat([measures, DV2], axis = 1)

## Coefficient of variation of speed (DV3)

In [6]:
mean = tracks.groupby(["trackId"], sort=False)['xVelocity'].mean()
DV3 = pd.DataFrame(columns = {"DV3"})
#Check if the mean for the selected driver = 0, set DV3 for him to 0
for i in range(mean.shape[0]):
    if(mean.iloc[i] == 0):
        DV3.loc[i, "DV3"] = 0
    else:
        DV3.loc[i, "DV3"] = (DV1.loc[i, "DV1"] / mean.iloc[i])*100
#
measures = pd.concat([measures, DV3], axis = 1)

## Coefficient of variation of longitudinal acceleration (DV4)

In [7]:
measures = pd.concat([measures, pd.DataFrame(columns = {"DV4"})], axis = 1)
size = tracks.groupby(["trackId"], sort = False).size()
for i in range(size.shape[0]):
    #Get the longitudinal acceleration for each driver
    group = tracks[tracks["trackId"] == i]["lonAcceleration"]
    #Get the acceleration (0 or positive) values
    lonAcce = group[group >= 0]
    mean = lonAcce.mean()
    std = lonAcce.std()
    #Check if the mean for the selected driver = 0, set DV4 for him to 0
    if mean == 0:
        DV4 = 0
    else:
        #Calculate the DV4 for each driver 
        DV4 = std / mean * 100
    #Assign the calculated value for each driver into measures dataFrame according to the trackID
    measures.loc[i, "DV4"] = DV4

## Coefficient of variation of longitudinal deceleration (DV5)

In [8]:
measures = pd.concat([measures, pd.DataFrame(columns = {"DV5"})], axis = 1)
size = tracks.groupby(["trackId"], sort = False).size()
for i in range(size.shape[0]):
    #Get the longitudinal acceleration for each driver
    group = tracks[tracks["trackId"] == i]["lonAcceleration"]
    #Get the deceleration values
    lonDece = group[group < 0]
    mean = lonDece.mean()
    std = lonDece.std()
    DV5 = std / mean * 100
    #Assign the calculated value for each driver into measures dataFrame according to the trackID
    measures.loc[i, "DV5"] = DV5

## Mean absolute deviation of speed (DV6)

In [11]:
DV6 = tracks.groupby(["trackId"], sort=False)['xVelocity'].mad()
DV6 = pd.DataFrame(DV6)   
DV6.rename(columns = {'xVelocity':'DV6'}, inplace = True)
measures = pd.concat([measures, DV6], axis = 1)               

## Mean absolute deviation of longitudinal acceleration (DV7)

In [16]:
measures = pd.concat([measures, pd.DataFrame(columns = {"DV7"})], axis = 1) 
#measures.shape[0] represents number of tracks
for i in range(measures.shape[0]):
    group = tracks[tracks["trackId"] == i]["lonAcceleration"]
    #Get the acceleration (0 or positive) values
    lonAcce = group[group >= 0]
    mad = lonAcce.mad() 
    #Assign the calculated value for each driver into measures dataFrame according to the trackID
    measures.loc[i, "DV7"] = mad

## Quantile coefficient of variation of normalised speed (DV8)

In [17]:
measures = pd.concat([measures, pd.DataFrame(columns = {"DV8"})], axis = 1) 
#measures.shape[0] represents number of tracks
for i in range(measures.shape[0]):
    group = tracks[tracks["trackId"] == i]["xVelocity"]   
    Q1 = group.quantile(q=0.25)
    Q3 = group.quantile(q=0.75)          
    Denominator = Q3 + Q1
    IQR = Q3 - Q1
    #Check if the Denominator for the selected driver = 0, set DV8 for him to 0
    if(Denominator == 0):
        DV8 = 0
    else:
        DV8 = 100 * ( IQR / Denominator )
    #Assign the calculated value for each driver into measures dataFrame according to the trackID
    measures.loc[i, "DV8"] = DV8          

## Quantile coefficient of variation of longitudinal acceleration (DV9)

In [25]:
measures = pd.concat([measures, pd.DataFrame(columns = {"DV9"})], axis = 1)
#measures.shape[0] represents number of tracks
for i in range(measures.shape[0]):
    group = tracks[tracks["trackId"] == i]["lonAcceleration"]
    lonAcce = group[group >= 0]
    Q1 = lonAcce.quantile(q=0.25)
    Q3 = lonAcce.quantile(q=0.75)
    Denominator = Q3 + Q1
    IQR = Q3 - Q1
    #Check if the Denominator for the selected driver = 0, set DV9 for him to 0
    if(Denominator == 0):
        DV9 = 0
    else:
        DV9 = 100 * ( IQR / Denominator )
    #Assign the calculated value for each driver into measures dataFrame according to the trackID
    measures.loc[i, "DV9"] = DV9 

## Quantile coefficient of variation of longitudinal deceleration (DV10)

In [26]:
measures = pd.concat([measures, pd.DataFrame(columns = {"DV10"})], axis = 1)
#measures.shape[0] represents number of tracks
for i in range(measures.shape[0]):
    group = tracks[tracks["trackId"] == i]["lonAcceleration"]
    lonDece = group[group < 0]
    Q1 = lonDece.quantile(q=0.25)
    Q3 = lonDece.quantile(q=0.75)
    Denominator = Q3 + Q1
    IQR = Q3 - Q1
    DV10 = 100 * ( IQR / Denominator )
    #Assign the calculated value for each driver into measures dataFrame according to the trackID
    measures.loc[i, "DV10"] = DV10   

## Percentage of time the mean normalised speed exceeds the mean plus two standard deviations (DV11)

In [27]:
measures = pd.concat([measures, pd.DataFrame(columns = {"DV11"})], axis = 1)
for i in range(measures.shape[0]):
    group = tracks[tracks["trackId"] == i]
    #Size represents N in the document(number of records for each driver)     
    size = tracks[tracks["trackId"] == i].shape[0]
    #greaters represents values that exceeds the mean plus 2standard deviation
    greaters = group[group["xVelocity"]>= (group["xVelocity"].mean() + 2 * DV1["DV1"].iloc[i])] 
    result = 100 * greaters["xVelocity"].sum() / size  
    measures.loc[i, "DV11"] = result   

## Percentage of time the mean of longitudinal acceleration exceeds the mean plus two standard deviations (DV12)

In [29]:
measures = pd.concat([measures, pd.DataFrame(columns = {"DV12"})], axis = 1)
for i in range(measures.shape[0]):
    group = tracks[tracks["trackId"] == i]["lonAcceleration"]
    lonAcce = group[group >= 0]
    #greaters represents values that exceeds the mean plus 2standard deviation
    greaters = lonAcce[lonAcce >= (lonAcce.mean() + 2 * DV2["DV2"].iloc[i])]
    #Check if the driver has no acceleration values set the DV12 to 0
    if(lonAcce.shape[0] == 0):
        result = 0
    else:
        result = 100 * greaters.sum() / lonAcce.shape[0]
    measures.loc[i, "DV12"] = result                      

## Percentage of time the mean longitudinal decelerationexceeds the mean plus two standard deviation (DV13)

In [30]:
measures = pd.concat([measures, pd.DataFrame(columns = {"DV13"})], axis = 1)   
for i in range(measures.shape[0]):
    group = tracks[tracks["trackId"] == i]["lonAcceleration"]
    DLong = group[group < 0]
    greaters = DLong[DLong >= (DLong.mean() + 2 * DV2["DV2"].iloc[i])]  
    #Check if the driver has no deceleration set the DV13 to 0
    if(DLong.shape[0] == 0):
        result = 0
    else:
        result = 100 * greaters.sum() / DLong.shape[0]
    measures.loc[i, "DV13"] = result 

In [31]:
measures.fillna(0, inplace = True)

## Find here the measures for each track according its id.

In [32]:
measures              

Unnamed: 0,trackId,DV1,DV2,DV3,DV4,DV5,DV6,DV7,DV8,DV9,DV10,DV11,DV12,DV13
0,0,0.084400,0.137488,-21.295609,138.783757,-70.151040,0.065205,0.089968,-11.425604,72.475692,-56.643357,0.000000,3.989846,0.0
1,1,0.873065,0.344838,-4460.998760,148.256400,-93.473968,0.726213,0.311566,448.413623,78.445596,-69.155527,0.000000,14.951366,0.0
2,2,0.100230,0.041817,-23.710129,89.884885,-67.068519,0.080541,0.024299,-13.341298,57.645317,-60.791706,-1.016770,0.603692,0.0
3,3,2.177446,0.327409,62.683505,103.518642,-54.175010,2.053817,0.320348,56.966527,69.835057,-45.337505,0.000000,14.838767,0.0
4,4,2.419873,0.285022,60.581990,113.601191,-98.174508,2.272445,0.217501,55.724036,93.431682,-72.777421,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13741,13741,3.202210,0.383552,-45.123800,41.990634,0.000000,2.865652,0.330930,-36.928886,37.517770,0.000000,0.000000,0.000000,0.0
13742,13742,1.249508,0.507402,-21.311174,86.372016,-70.444679,1.135046,0.388824,-19.219279,84.889435,-76.254826,0.000000,0.000000,0.0
13743,13743,0.821811,0.489507,16.652258,49.299220,-38.250156,0.645977,0.280999,10.566362,41.749175,-19.692460,45.241465,0.000000,0.0
13744,13744,1.215077,1.469314,-43.830366,45.690387,-54.594786,1.024770,0.520836,-39.548579,34.988223,-48.528795,0.000000,0.000000,0.0


## Reading all tracksMeta files as one dataframe

In [36]:
tracksMetaPath = "C:\\Users\\hp\\Desktop\\AI\\roundD\\00_tracksMeta.csv"     
tracksMeta = pd.read_csv(tracksMetaPath)
counter = tracksMeta["trackId"].max() + 1
for i in range(24):
    if i == 0:
        continue
    if i < 10:
        tracksMetaPath = tracksMetaPath.replace('0'+str(i-1), '0'+str(i))
    if i >= 10: 
        tracksMetaPath = tracksMetaPath.replace(str(i-1), str(i))
    if i == 10: 
        tracksMetaPath = tracksMetaPath.replace('0'+str(i), str(i))
    toJoinFile = pd.read_csv(tracksMetaPath)
    #Give a unique ID for each track who has a previously used  one                    
    for j in range(toJoinFile["trackId"].max() + 1):
        toJoinFile["trackId"].replace({j : counter}, inplace = True)        
        counter = counter + 1
    tracksMeta = pd.concat([tracksMeta, toJoinFile], axis = 0)         

In [37]:
#data represents the volatility measures for drivers
data = measures.copy()   

### Remove pedestrian, bicycles and motorcycles from tracksMeta to get vehicles id only

In [39]:
tracksMeta = tracksMeta[tracksMeta["class"] != 'pedestrian']
tracksMeta = tracksMeta[tracksMeta["class"] != 'motorcycle']
tracksMeta = tracksMeta[tracksMeta["class"] != 'bicycle']

In [40]:
#Get the volatility measures for vehicles only 
finalData = pd.merge(tracksMeta["trackId"], data)         

In [41]:
#Removing outliers     
finalData = finalData.drop(index = 405)
finalData = finalData.drop(index = 11062)

In [43]:
#Drop trackId column to pass the dataframe into the model
inputData = finalData.drop(columns = "trackId")    

In [44]:
model = KMeans(n_clusters = 3)
model.fit(inputData) 
Centroids = model.cluster_centers_

In [46]:
secondModel = KMeans(n_clusters = 2)
secondModel.fit(inputData)  
CentroidsForSecondModel = secondModel.cluster_centers_ 