# Data cleaning and analysis

In [1]:
import os
import visuals
import constants
import functionals
import pandas as pd
from pprint import pprint


In [2]:
# Read the raw parking sample; the path is relative to the working directory.
data_path = "assignment-sample-data.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
from IPython.display import display
# Widen pandas' text output (display.width option) to 1000 characters.
pd.options.display.width = 1000

In [4]:
# Plain-text preview of the first rows of the freshly loaded data.
print(df.head())

        parking_id   area_type   parking_start_time     parking_end_time  parking_fee currency   parkinguser_id           car_id        lat        lon account_type
0  fake_c28a323810  SurfaceLot  2015-03-06 19:55:41  2015-03-06 20:07:00         8.50      SEK  fake_bf5d9b530e  fake_130ae2aeb1  59.246370  18.077019    corporate
1  fake_76c21cf355  SurfaceLot  2015-03-06 18:08:20  2015-03-06 19:46:00        15.67      SEK  fake_bf5d9b530e  fake_130ae2aeb1  59.231789  18.083995    corporate
2  fake_995ed971a6    OnStreet  2017-07-21 09:55:42  2017-07-21 14:23:50        67.00      SEK  fake_3ba346a0cd  fake_f7a9d564d9  59.350331  18.096649    corporate
3  fake_6b81ea4f35  SurfaceLot  2017-07-24 07:21:12  2017-07-24 07:34:31         4.34      SEK  fake_ea19a50003  fake_fae7e31b34  59.315826  18.098355    corporate
4  fake_424b61e0eb  SurfaceLot  2015-03-09 12:05:46  2015-03-09 13:57:54        50.50      SEK  fake_1cc1970582  fake_0755f3c71f  59.320919  18.047513    corporate


In [5]:
# List the column labels of the loaded frame.
print(df.columns)

Index(['parking_id', 'area_type', 'parking_start_time', 'parking_end_time', 'parking_fee', 'currency', 'parkinguser_id', 'car_id', 'lat', 'lon', 'account_type'], dtype='object')


In [6]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87489 entries, 0 to 87488
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   parking_id          87489 non-null  object 
 1   area_type           87489 non-null  object 
 2   parking_start_time  87489 non-null  object 
 3   parking_end_time    87489 non-null  object 
 4   parking_fee         87489 non-null  float64
 5   currency            87489 non-null  object 
 6   parkinguser_id      87489 non-null  object 
 7   car_id              87489 non-null  object 
 8   lat                 87489 non-null  float64
 9   lon                 87489 non-null  float64
 10  account_type        87489 non-null  object 
dtypes: float64(3), object(8)
memory usage: 7.3+ MB
None


In [7]:
# Summary statistics for the numeric columns (parking_fee, lat, lon).
# NOTE(review): the reported minimum of about -180 for both lat and lon looks
# like invalid coordinates — confirm and handle during cleaning.
print(df.describe())

        parking_fee           lat           lon
count  87489.000000  87489.000000  87489.000000
mean      31.786153     58.786174     17.035432
std       39.338515      2.925073      2.639340
min        0.000000   -180.006219   -180.006783
25%        9.000000     59.292461     17.137638
50%       18.750000     59.332389     18.000587
75%       39.100000     59.360741     18.066440
max      983.000000     67.871132     54.009971


In [8]:
# For each categorical column, show its distinct values followed by their frequencies.
for column_name in ("account_type", "currency", "area_type"):
    categorical = df[column_name]
    print(categorical.unique())
    print(categorical.value_counts())


['corporate' 'private']
account_type
private      57239
corporate    30250
Name: count, dtype: int64
['SEK' 'DKK' 'NOK' 'EUR']
currency
SEK    86918
NOK      465
DKK       87
EUR       19
Name: count, dtype: int64
['SurfaceLot' 'OnStreet' 'Administrative' 'UndergroundGarage'
 'AboveGroundGarage' 'CameraParkArea' 'EVC']
area_type
OnStreet             52747
SurfaceLot           26749
Administrative        6269
UndergroundGarage     1248
AboveGroundGarage      459
EVC                     14
CameraParkArea           3
Name: count, dtype: int64


In [9]:
# Check if "parkinguser_id" has multiple "account_type".
# Vectorised replacement for a df.iterrows() loop, which materialises a Series
# per row and is needlessly slow on a frame of this size. The resulting
# `parkinguser_ids`, `ok` and printed messages are the same as before.

# First account type recorded for every user, in order of first appearance.
first_seen = df.drop_duplicates(subset="parkinguser_id", keep="first")
parkinguser_ids = dict(zip(first_seen["parkinguser_id"], first_seen["account_type"]))

# Rows whose account type disagrees with the first one recorded for that user.
first_account_type = df["parkinguser_id"].map(parkinguser_ids)
conflicts = df.loc[
    first_account_type != df["account_type"],
    ["parkinguser_id", "account_type"],
]
for user_id, account_type in conflicts.itertuples(index=False, name=None):
    print(f"User {user_id} has multiple account types: {parkinguser_ids[user_id]} and {account_type}")

# Repeat rows (i.e. not a user's first row) that agree with the first recorded type.
ok = len(df) - len(parkinguser_ids) - len(conflicts)

## Feature Engineering

In [10]:
def converter(x):
    """Return the parking fee of one row expressed in SEK.

    Args:
        x: A row (or mapping) providing 'currency' and 'parking_fee'.

    Returns:
        The fee unchanged when the currency is "SEK"; otherwise the fee
        multiplied by the factor stored under that currency in
        ``constants.CURRENCY_TO_SEK``.
    """
    currency = x['currency']
    if currency == "SEK":
        return x['parking_fee']
    fee = x['parking_fee']
    return fee * constants.CURRENCY_TO_SEK[currency]
    

In [11]:
# Derive the SEK-equivalent fee row by row, then spot-check the EUR rows.
fee_inputs = df[['currency', 'parking_fee']]
df['fee_equiv_sek'] = fee_inputs.apply(converter, axis=1)
is_eur = df["currency"] == "EUR"
print(df.loc[is_eur].head())


            parking_id   area_type   parking_start_time     parking_end_time  parking_fee currency   parkinguser_id           car_id        lat        lon account_type  fee_equiv_sek
41920  fake_56f5699a58  SurfaceLot  2019-01-05 16:37:22  2019-01-05 17:30:00         2.00      EUR  fake_ade3f3f432  fake_e996210230  46.463216  12.204619      private        22.2200
49897  fake_10483f40af    OnStreet  2019-08-09 07:34:53  2019-08-09 07:35:01         0.00      EUR  fake_ade3f3f432  fake_e996210230  46.529172  12.137985      private         0.0000
50919  fake_d7e00b2a72    OnStreet  2017-07-08 17:54:45  2017-07-08 22:29:59        10.58      EUR  fake_c66fd0c84a  fake_4e7288eccf  52.507050  13.452954      private       117.5438
55830  fake_88e01a7992    OnStreet  2018-12-28 14:23:51  2018-12-28 15:20:28         0.83      EUR  fake_ade3f3f432  fake_e996210230  46.461713  12.201124      private         9.2213
58059  fake_6b57bf21c3    OnStreet  2017-07-26 06:55:40  2017-07-26 12:30:00        1

In [16]:
import datetime
def time_diff_convert(x: pd.Series) -> datetime.timedelta:
    """Return how long a parking session lasted.

    Args:
        x: A row (or mapping) holding 'parking_start_time' and
            'parking_end_time' as strings formatted "YYYY-MM-DD HH:MM:SS".

    Returns:
        ``parking_end_time - parking_start_time`` as a ``datetime.timedelta``
        (negative when the end precedes the start).

    Raises:
        ValueError: If either timestamp does not match the expected format.
    """
    # Both columns share one format, so keep it in a single place.
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    start_date = datetime.datetime.strptime(x['parking_start_time'], timestamp_format)
    end_date = datetime.datetime.strptime(x['parking_end_time'], timestamp_format)
    return end_date - start_date

In [17]:
# Parse both timestamp columns in one vectorised pass and subtract them. This
# produces the same timedelta column as applying time_diff_convert row by row,
# without two Python-level strptime calls per row.
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"
df["parking_duration"] = (
    pd.to_datetime(df["parking_end_time"], format=TIMESTAMP_FORMAT)
    - pd.to_datetime(df["parking_start_time"], format=TIMESTAMP_FORMAT)
)

In [None]:
# Inspect the first rows again now that the derived columns have been added.
df.head()