In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw GPS export: semicolon-separated, first CSV column used as the row index.
# NOTE(review): several labels and values in the printed output look mis-decoded
# (e.g. 'włčczony'), which suggests the file is not really ISO 8859-16
# (possibly cp1250 -- TODO confirm). Changing the encoding also changes the
# string literals that later cells match on, so both must be updated together.
gps_raw = pd.read_csv(
    'gps_raw_data.csv',
    sep=';',
    encoding='ISO 8859-16',
    index_col=[0],
)
gps_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21091 entries, 1 to 21091
Data columns (total 12 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Zdarzenie                           21091 non-null  object 
 1   Data / godzina                      21091 non-null  object 
 2   Pojazd                              21091 non-null  object 
 3   Kierowca                            21091 non-null  object 
 4   Pozycja                             21091 non-null  object 
 5   Odległoæ do nastêpnego punktu POI  21091 non-null  object 
 6   Prêdkoæ (km/h)                     21091 non-null  object 
 7   Przebieg (km)                       1132 non-null   float64
 8   Status zapłonu                      21091 non-null  object 
 9   Status silnika                      21091 non-null  object 
 10  Długoæ geogr.                      21091 non-null  float64
 11  Szerokoæ geogr.                    21091

In [3]:
# Display the raw frame; the rendered output shows only the first and last rows.
gps_raw

Unnamed: 0,Zdarzenie,Data / godzina,Pojazd,Kierowca,Pozycja,Odległoæ do nastêpnego punktu POI,Prêdkoæ (km/h),Przebieg (km),Status zapłonu,Status silnika,Długoæ geogr.,Szerokoæ geogr.
1,Pozycja,25.06.2019 01:43,PL12345,"Nowak, Jan","Południe, 0,6 m, Wolmirstedter Straße, 39326 H...","AK Magdeburg (D-39116) Zachód 4,0 km",0,27533.0,włčczony,wyłčczony,11.490252,52.179932
2,Pozycja delta,25.06.2019 01:43,PL12345,Nieznany kierowca,"Południe, 0,6 m, Wolmirstedter Straße, 39326 H...",-,-,,Nieznany,Nieznany,11.490252,52.179932
3,Pozycja,25.06.2019 01:53,PL12345,Nieznany kierowca,"Południowy Zachód, 0,5 m, Wolmirstedter Straße...","AK Magdeburg (D-39116) Zachód 4,0 km",0,27533.0,włčczony,wyłčczony,11.490252,52.179928
4,Pozycja delta,25.06.2019 01:53,PL12345,Nieznany kierowca,"Południowy Zachód, 0,5 m, Wolmirstedter Straße...",-,-,,Nieznany,Nieznany,11.490252,52.179928
5,Pozycja,25.06.2019 02:03,PL12345,"Nowak, Jan","Południowy Zachód, 0,5 m, Wolmirstedter Straße...","AK Magdeburg (D-39116) Zachód 4,0 km",0,27533.0,włčczony,wyłčczony,11.490252,52.179928
...,...,...,...,...,...,...,...,...,...,...,...,...
21087,Pozycja delta,24.07.2019 20:57,PL12345,Nieznany kierowca,"Południowy Wschód, 1,8 m, Sandkampstraße, 4843...",-,-,,Nieznany,Nieznany,7.455863,52.300652
21088,Pozycja delta,24.07.2019 20:58,PL12345,Nieznany kierowca,"Zachód, 9,4 m, Sandkampstraße 78, 48432 Rheine...",-,-,,Nieznany,Nieznany,7.454702,52.303913
21089,Pozycja delta,24.07.2019 20:58,PL12345,Nieznany kierowca,"Wschód, 20,9 m, Sandkampstraße 127, 48432 Rhei...",-,-,,Nieznany,Nieznany,7.452255,52.306499
21090,Pozycja,24.07.2019 20:59,PL12345,"Nowak, Jan","Północ, 12,3 m, Bonifatiusstraße, 48432 Rheine...","Steinfurt (D-48565) Północny Wschód 19,0 km",15,39831.1,włčczony,włčczony,7.450903,52.306171


In [4]:
# List the column labels exactly as pandas decoded them from the CSV header.
gps_raw.columns

Index(['Zdarzenie', 'Data / godzina', 'Pojazd', 'Kierowca', 'Pozycja',
       'Odległoæ do nastêpnego punktu POI', 'Prêdkoæ (km/h)',
       'Przebieg (km)', 'Status zapłonu', 'Status silnika', 'Długoæ geogr.',
       'Szerokoæ geogr.'],
      dtype='object')

In [5]:
# Drop the columns that are not needed downstream: the event type ('Zdarzenie')
# and the distance-to-next-POI text.
# The POI column is picked by position instead of by label because its header
# contains non-ASCII characters whose spelling depends on the encoding passed
# to read_csv; the column renaming that follows relies on column order as well.
poi_distance_col = gps_raw.columns[5]
# Guard against a changed column order: only the ASCII tail of the header is checked.
assert str(poi_distance_col).endswith('punktu POI'), poi_distance_col
gps_data = gps_raw.drop(columns=['Zdarzenie', poi_distance_col])

In [8]:
# Replace the original headers with short English names.
# The names are assigned by position, so their order must match the
# column order of gps_data.
english_names = [
    'dt', 'vehicle', 'driver', 'position', 'speed', 'mileage',
    'ignition_status', 'engine_status', 'longitude', 'latitude',
]
gps_data.columns = english_names

In [7]:
# True/False mask marking the rows without a known driver
# ('Nieznany kierowca' is Polish for "unknown driver").
mask = gps_data['driver'] == 'Nieznany kierowca'

# Keep only the rows with a known driver.
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignments and in-place changes made in later cells do not
# raise SettingWithCopyWarning.
gps_data = gps_data[~mask].copy()

In [9]:
# Show how often each value occurs in the two status columns.
ignition_counts = gps_data['ignition_status'].value_counts()
engine_counts = gps_data['engine_status'].value_counts()
print(ignition_counts, engine_counts)

włčczony    1131
Nieznany      53
Name: ignition_status, dtype: int64 włčczony     1094
Nieznany       53
wyłčczony      37
Name: engine_status, dtype: int64


In [10]:
# Convert the status columns to booleans: True only for the "switched on" value,
# False for every other value (including 'Nieznany', i.e. unknown).
# NOTE: 'włčczony' is how the value reads after the ISO 8859-16 decode used in
# read_csv; this literal must be updated if the file encoding is ever changed.
STATUS_ON = 'włčczony'

# A vectorized comparison replaces a row-wise DataFrame.apply: same result,
# no Python-level loop, and it also works when the frame has no rows.
for status_col in ('ignition_status', 'engine_status'):
    gps_data[status_col] = gps_data[status_col] == STATUS_ON

In [11]:
# Extract the country code: the last three characters of the position text.
# The .str accessor is vectorized and yields NaN for a missing value instead of
# raising, unlike a per-element lambda.
country_col = gps_data['position'].str[-3:]

In [12]:
# Clean the 'position' text: keep what follows the first 'm,' (the end of the
# leading "<direction>, <distance> m," prefix) and cut the last five characters
# (presumably ', ' plus the three-letter country code -- TODO confirm on raw data).
# n=1 splits only at the first 'm,'. Without the limit, an address that itself
# contains 'm,' (for example a street name ending in "...damm,") would be
# truncated at that point.
# .str[1] returns NaN instead of raising when a value contains no 'm,' at all.
# The text after 'm,' keeps any leading whitespace it had in the raw value.
position = gps_data['position'].str.split('m,', n=1).str[1]
position_col = position.str[:-5]

In [13]:
# Remove the original 'position' column; the cleaned values are added back afterwards.
gps_data = gps_data.drop(columns='position')

In [14]:
# Put the cleaned 'position' at column index 3 and 'country' right after it.
# The Series are aligned to gps_data on the row index.
first_slot = 3
gps_data.insert(loc=first_slot, column='position', value=position_col)
gps_data.insert(loc=first_slot + 1, column='country', value=country_col)

In [15]:
# Parse the 'DD.MM.YYYY HH:MM' timestamps (the day comes first).
gps_data['dt'] = pd.to_datetime(gps_data['dt'], dayfirst=True)

# Convert speed to float. Both '-' (no reading) and '0' are turned into NaN,
# so a speed of zero is stored as missing rather than as 0.0.
# Series.mask fills the masked entries with NaN itself, so no NumPy NaN alias
# is needed (np.NaN no longer exists in NumPy 2.0).
no_speed = gps_data['speed'].isin(['-', '0'])
gps_data['speed'] = gps_data['speed'].mask(no_speed).astype('float')

# Store the text columns with repeated values as categoricals.
# An explicit per-column mapping sets the dtype unambiguously.
category_cols = ['position', 'country', 'vehicle', 'driver']
gps_data = gps_data.astype({col: 'category' for col in category_cols})

In [16]:
# Display the cleaned frame to check the renamed columns and converted values.
gps_data

Unnamed: 0,dt,vehicle,driver,position,country,speed,mileage,ignition_status,engine_status,longitude,latitude
1,2019-06-25 01:43:00,PL12345,"Nowak, Jan","Wolmirstedter Straße, 39326 Hohe Börde",DEU,,27533.0,True,False,11.490252,52.179932
5,2019-06-25 02:03:00,PL12345,"Nowak, Jan","Wolmirstedter Straße, 39326 Hohe Börde",DEU,,27533.0,True,False,11.490252,52.179928
7,2019-06-25 02:13:00,PL12345,"Nowak, Jan","Wolmirstedter Straße, 39326 Hohe Börde",DEU,,27533.0,True,False,11.490252,52.179928
9,2019-06-25 02:23:00,PL12345,"Nowak, Jan","Wolmirstedter Straße, 39326 Hohe Börde",DEU,,27533.0,True,False,11.490252,52.179932
11,2019-06-25 11:02:00,PL12345,"Nowak, Jan","Wolmirstedter Straße, 39326 Hohe Börde",DEU,,27533.0,True,True,11.490245,52.179932
...,...,...,...,...,...,...,...,...,...,...,...
21031,2019-07-24 20:29:00,PL12345,"Nowak, Jan","A30, 49492 Westerkappeln",DEU,84.0,39797.8,True,True,7.879366,52.263775
21052,2019-07-24 20:39:00,PL12345,"Nowak, Jan","A30, 49479 Ibbenbüren",DEU,84.0,39812.0,True,True,7.680992,52.264351
21073,2019-07-24 20:49:00,PL12345,"Nowak, Jan","A30, 48432 Rheine",DEU,70.0,39826.2,True,True,7.491128,52.303776
21090,2019-07-24 20:59:00,PL12345,"Nowak, Jan","Bonifatiusstraße, 48432 Rheine",DEU,15.0,39831.1,True,True,7.450903,52.306171


In [17]:
# Print the column dtypes and non-null counts of the cleaned frame.
gps_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1184 entries, 1 to 21091
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   dt               1184 non-null   datetime64[ns]
 1   vehicle          1184 non-null   category      
 2   driver           1184 non-null   category      
 3   position         1184 non-null   category      
 4   country          1184 non-null   category      
 5   speed            1002 non-null   float64       
 6   mileage          1131 non-null   float64       
 7   ignition_status  1184 non-null   bool          
 8   engine_status    1184 non-null   bool          
 9   longitude        1184 non-null   float64       
 10  latitude         1184 non-null   float64       
dtypes: bool(2), category(4), datetime64[ns](1), float64(4)
memory usage: 85.6 KB


In [18]:
# Renumber the rows from 0 (the old row labels are discarded), save the cleaned
# frame as a pickle file, and print its final schema.
gps_data = gps_data.reset_index(drop=True)
gps_data.to_pickle('gps_data.pickle')
gps_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1184 entries, 0 to 1183
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   dt               1184 non-null   datetime64[ns]
 1   vehicle          1184 non-null   category      
 2   driver           1184 non-null   category      
 3   position         1184 non-null   category      
 4   country          1184 non-null   category      
 5   speed            1002 non-null   float64       
 6   mileage          1131 non-null   float64       
 7   ignition_status  1184 non-null   bool          
 8   engine_status    1184 non-null   bool          
 9   longitude        1184 non-null   float64       
 10  latitude         1184 non-null   float64       
dtypes: bool(2), category(4), datetime64[ns](1), float64(4)
memory usage: 76.4 KB
