In [2]:
import pandas as pd

# Load the transformed trip export and preview its first rows.
csv_path = "transformed_output.csv"
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,licencePlate,start_time,end_time,start_lat,start_lon,end_lat,end_lon,travel_time,vehicleTypeId,zipCode,...,type,postcode,zipCodeFixed,end_zipCode,end_postcode,start_area,end_area,route,day_of_week,hour_of_day
0,ce58907,2025-07-22 14:14:49,2025-07-22 15:08:49,55.696949,12.586576,55.696915,12.58656,54,2,2100,...,car,2100,2100,2500.0,2500.0,Kobenhavn O,Valby,Kobenhavn O → Valby,Tuesday,14
1,ce58907,2025-07-22 21:02:01,2025-07-23 06:12:01,55.660629,12.605627,55.660603,12.605632,550,2,2300,...,car,2300,2300,2500.0,2500.0,Kobenhavn S,Valby,Kobenhavn S → Valby,Tuesday,21
2,de88629,2025-07-22 05:49:29,2025-07-22 11:15:29,55.665478,12.557597,55.665382,12.557528,326,34,1718,...,car,1550,1550,2100.0,2100.0,Kobenhavn V,Kobenhavn O,Kobenhavn V → Kobenhavn O,Tuesday,5
3,dr11638,2025-07-23 07:50:01,2025-07-23 08:36:01,55.68314,12.58534,55.683186,12.585388,46,34,1301,...,car,1050,1050,2300.0,2300.0,Kobenhavn K,Kobenhavn S,Kobenhavn K → Kobenhavn S,Wednesday,7
4,dr11690,2025-07-22 11:34:48,2025-07-22 12:16:48,55.686272,12.538904,55.686321,12.538924,42,34,2000,...,car,2000,2000,2100.0,2100.0,Frederiksberg C,Kobenhavn O,Frederiksberg C → Kobenhavn O,Tuesday,11


In [12]:
# Data type check
#
# Parse the two timestamp columns and force the numeric columns to numeric
# dtypes, then list the resulting dtype of every column.
# pd.to_datetime is called with its default error handling, so an
# unparseable timestamp raises instead of being silently dropped.
df["start_time"] = pd.to_datetime(df["start_time"])
df["end_time"] = pd.to_datetime(df["end_time"])

# The start postcode column is named either 'start_postcode' or 'postcode',
# depending on the export; use whichever is present.
numeric_cols = [
    'start_lat', 'start_lon', 'end_lat', 'end_lon', 'travel_time', 'vehicleTypeId', 'zipCode', 'zipCodeFixed',
    'end_zipCode', 'start_postcode' if 'start_postcode' in df else 'postcode', 'end_postcode'
]

# errors='coerce' replaces unparseable values with NaN. Count how many values
# were present before the conversion but are NaN afterwards, so bad data is
# reported here instead of silently turning into missing values.
coerced_counts = {}
for col in numeric_cols:
    if col in df.columns:
        converted = pd.to_numeric(df[col], errors='coerce')
        newly_missing = int((converted.isna() & df[col].notna()).sum())
        if newly_missing:
            coerced_counts[col] = newly_missing
        df[col] = converted

if coerced_counts:
    print(f"Values coerced to NaN by pd.to_numeric: {coerced_counts}")

df.dtypes

licencePlate             object
start_time       datetime64[ns]
end_time         datetime64[ns]
start_lat               float64
start_lon               float64
end_lat                 float64
end_lon                 float64
travel_time               int64
vehicleTypeId             int64
zipCode                   int64
model                    object
type                     object
postcode                  int64
zipCodeFixed              int64
end_zipCode             float64
end_postcode            float64
start_area               object
end_area                 object
route                    object
day_of_week              object
hour_of_day               int64
dtype: object

In [10]:
# Count the missing entries in each column.
missing_per_column = df.isna().sum()
missing_per_column

licencePlate     0
start_time       0
end_time         0
start_lat        0
start_lon        0
end_lat          0
end_lon          0
travel_time      0
vehicleTypeId    0
zipCode          0
model            0
type             0
postcode         0
zipCodeFixed     0
end_zipCode      0
end_postcode     0
start_area       0
end_area         0
route            0
day_of_week      0
hour_of_day      0
dtype: int64

In [11]:
# Look for trips that appear more than once. Two rows count as the same trip
# when all of the columns below match; keep=False marks every member of a
# duplicate group, not only the repeats.
trip_key_columns = [
    'start_time', 'end_time',
    'start_lat', 'start_lon',
    'end_lat', 'end_lon',
    'travel_time', 'vehicleTypeId',
]
is_repeated = df.duplicated(subset=trip_key_columns, keep=False)
duplicates = df.loc[is_repeated]
duplicates

Unnamed: 0,licencePlate,start_time,end_time,start_lat,start_lon,end_lat,end_lon,travel_time,vehicleTypeId,zipCode,...,type,postcode,zipCodeFixed,end_zipCode,end_postcode,start_area,end_area,route,day_of_week,hour_of_day


In [9]:
# Outlier detection in travel time (negative or 0? Are there any abnormally long travel times?)
#
# A notebook cell only renders its last bare expression, so the first check
# is shown explicitly with display() (provided by the Jupyter/IPython kernel);
# otherwise its result would be computed and thrown away.

# Trips with a zero or negative travel time.
non_positive_travel_time = df[df['travel_time'] <= 0]
display(non_positive_travel_time)

# Trips whose travel time lies above the 99th percentile.
travel_time_p99 = df['travel_time'].quantile(0.99)
above_p99_travel_time = df[df['travel_time'] > travel_time_p99]
above_p99_travel_time

Unnamed: 0,licencePlate,start_time,end_time,start_lat,start_lon,end_lat,end_lon,travel_time,vehicleTypeId,zipCode,...,type,postcode,zipCodeFixed,end_zipCode,end_postcode,start_area,end_area,route,day_of_week,hour_of_day
24,dx15907,2025-07-24 04:18:01,2025-07-25 13:06:01,55.665512,12.551309,55.665535,12.551169,1968,34,1727,...,car,1550,1550,2300.0,2300.0,Kobenhavn V,Kobenhavn S,Kobenhavn V → Kobenhavn S,Thursday,4
314,ea56790,2025-09-09 06:49:08,2025-09-10 06:51:08,55.667271,12.51838,55.667324,12.518453,1442,34,1805,...,car,2000,2000,2000.0,2000.0,Frederiksberg C,Frederiksberg C,Frederiksberg C → Frederiksberg C,Tuesday,6
326,dr29958,2025-09-10 23:45:09,2025-09-12 12:05:08,55.671165,12.552011,55.671215,12.552099,2180,6,1661,...,van,1550,1550,2100.0,2100.0,Kobenhavn V,Kobenhavn O,Kobenhavn V → Kobenhavn O,Wednesday,23
377,dv53344,2025-09-18 01:43:08,2025-09-19 07:39:08,55.641148,12.599516,55.641148,12.599554,1796,74,2300,...,van,2300,2300,2400.0,2400.0,Kobenhavn S,Kobenhavn NV,Kobenhavn S → Kobenhavn NV,Thursday,1
501,dy44085,2025-10-02 06:23:08,2025-10-03 07:05:09,55.672153,12.513474,55.672119,12.513467,1482,34,2000,...,car,2000,2000,2000.0,2000.0,Frederiksberg C,Frederiksberg C,Frederiksberg C → Frederiksberg C,Thursday,6
557,dx15903,2025-10-06 13:31:08,2025-10-07 15:53:09,55.66198,12.506403,55.661964,12.506384,1582,34,2500,...,car,2500,2500,2400.0,2400.0,Valby,Kobenhavn NV,Valby → Kobenhavn NV,Monday,13
567,dy74220,2025-10-07 15:39:09,2025-10-09 06:55:08,55.65844,12.514589,55.658348,12.514471,2356,34,2500,...,car,2500,2500,2100.0,2100.0,Valby,Kobenhavn O,Valby → Kobenhavn O,Tuesday,15
582,dz55103,2025-10-09 16:21:09,2025-10-11 04:35:08,55.662209,12.608646,55.662174,12.608854,2174,34,2300,...,car,2300,2300,2000.0,2000.0,Kobenhavn S,Frederiksberg C,Kobenhavn S → Frederiksberg C,Thursday,16
727,cx86742,2025-10-24 14:27:08,2025-10-25 13:47:08,55.702637,12.498285,55.702686,12.498295,1400,34,2700,...,car,2700,2700,2720.0,2720.0,Bronshoj,Vanlose,Bronshoj → Vanlose,Friday,14
814,ce58927,2025-11-01 11:41:08,2025-11-02 10:57:08,55.676189,12.617399,55.67622,12.617412,1396,2,2300,...,car,2300,2300,2200.0,2200.0,Kobenhavn S,Kobenhavn N,Kobenhavn S → Kobenhavn N,Saturday,11


In [14]:
# Checking the tails of the travel-time distribution with the 1.5 * IQR rule:
# a trip is flagged when its travel time falls below Q1 - 1.5 * IQR or above
# Q3 + 1.5 * IQR.

Q1, Q3 = df['travel_time'].quantile([0.25, 0.75])
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

travel_time = df['travel_time']
outside_fences = travel_time.lt(lower_fence) | travel_time.gt(upper_fence)
travel_time_outliers = df.loc[outside_fences]

travel_time_outliers

Unnamed: 0,licencePlate,start_time,end_time,start_lat,start_lon,end_lat,end_lon,travel_time,vehicleTypeId,zipCode,...,type,postcode,zipCodeFixed,end_zipCode,end_postcode,start_area,end_area,route,day_of_week,hour_of_day
12,ea56801,2025-07-21 13:25:29,2025-07-22 05:25:29,55.673397,12.531873,55.673397,12.531938,960,34,2000,...,car,2000,2000,2400.0,2400.0,Frederiksberg C,Kobenhavn NV,Frederiksberg C → Kobenhavn NV,Monday,13
24,dx15907,2025-07-24 04:18:01,2025-07-25 13:06:01,55.665512,12.551309,55.665535,12.551169,1968,34,1727,...,car,1550,1550,2300.0,2300.0,Kobenhavn V,Kobenhavn S,Kobenhavn V → Kobenhavn S,Thursday,4
47,dy44149,2025-08-12 15:09:08,2025-08-13 04:59:08,55.698780,12.529724,55.698853,12.529696,830,34,2400,...,car,2400,2400,2300.0,2300.0,Kobenhavn NV,Kobenhavn S,Kobenhavn NV → Kobenhavn S,Tuesday,15
54,cb23361,2025-08-14 17:07:08,2025-08-15 07:43:09,55.662807,12.517796,55.662743,12.517914,876,2,2500,...,car,2500,2500,2000.0,2000.0,Valby,Frederiksberg C,Valby → Frederiksberg C,Thursday,17
75,bp20016,2025-08-17 15:39:08,2025-08-18 07:27:09,55.670418,12.584532,55.670483,12.584550,948,2,1411,...,car,1050,1050,2500.0,2500.0,Kobenhavn K,Valby,Kobenhavn K → Valby,Sunday,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098,cx86700,2025-08-07 08:41:08,2025-08-08 07:33:08,55.698940,12.551180,55.698853,12.551180,1372,34,2200,...,car,2200,2200,2100.0,2100.0,Kobenhavn N,Kobenhavn O,Kobenhavn N → Kobenhavn O,Thursday,8
1100,dv53345,2025-08-07 19:21:09,2025-08-08 12:59:08,55.662132,12.512162,55.662117,12.512150,1058,74,2500,...,van,2500,2500,2300.0,2300.0,Valby,Kobenhavn S,Valby → Kobenhavn S,Thursday,19
1101,ea56797,2025-08-06 17:41:08,2025-08-07 09:31:08,55.692104,12.541249,55.692051,12.541138,950,34,2200,...,car,2200,2200,2400.0,2400.0,Kobenhavn N,Kobenhavn NV,Kobenhavn N → Kobenhavn NV,Wednesday,17
1119,ea75181,2025-08-08 18:03:08,2025-08-09 08:49:08,55.664829,12.554106,55.664753,12.554111,886,34,1725,...,car,1550,1550,2300.0,2300.0,Kobenhavn V,Kobenhavn S,Kobenhavn V → Kobenhavn S,Friday,18


In [16]:
# Coordinate outliers
#
# Flag every trip whose start or end point lies outside a fixed
# latitude/longitude bounding box.
# NOTE(review): the box presumably covers the greater Copenhagen area — confirm.

min_lat, max_lat = 55.00, 56.00
min_lon, max_lon = 12.00, 13.00

lat_cols = ['start_lat', 'end_lat']
lon_cols = ['start_lon', 'end_lon']

# A row is out of bounds as soon as any one of its four coordinates is.
lat_outside = (df[lat_cols].lt(min_lat) | df[lat_cols].gt(max_lat)).any(axis=1)
lon_outside = (df[lon_cols].lt(min_lon) | df[lon_cols].gt(max_lon)).any(axis=1)

coord_outliers = df.loc[lat_outside | lon_outside]

coord_outliers

Unnamed: 0,licencePlate,start_time,end_time,start_lat,start_lon,end_lat,end_lon,travel_time,vehicleTypeId,zipCode,...,type,postcode,zipCodeFixed,end_zipCode,end_postcode,start_area,end_area,route,day_of_week,hour_of_day
