# 03_transformations

Transform and aggregate the data for the use cases.

## Ideas:
* PCA on the highly correlated static data to reduce dimensionality
* create new features for geolocations

* aggregating on geolocations + timeframes
* aggregating on ships?

* merging static and position data


## import packages

In [1]:
import os
import sys
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import yaml
from sqlalchemy import create_engine

from functions.data_cleaning import SimpleOutlierMask
from functions.data_loads import get_columns, load_data
from functions.utils import get_folder

# Render floats with two decimals in DataFrame output.
pd.set_option('display.float_format', '{:.2f}'.format)

# Both configuration files are resolved relative to the parent of the folder
# returned by get_folder().
curr_folder = get_folder()
parent_fold = curr_folder.parent
with open(parent_fold / "connections.yml", encoding='utf-8') as file:
    db_config = yaml.safe_load(file)

with open(parent_fold / "configurations" / "data_loads.yaml", encoding='utf-8') as file:
    column_conf = yaml.safe_load(file)

# Percent-encode the credentials before embedding them in the URL: characters
# such as '@', ':' or '/' in a raw user name or password would otherwise be
# parsed as URL delimiters. str() guards against YAML loading a purely numeric
# password as an int.
# NOTE(review): assumes connections.yml stores the raw (not already
# percent-encoded) credentials -- confirm.
db_user = quote_plus(str(db_config["user"]))
db_password = quote_plus(str(db_config["password"]))
engine = create_engine(
    f'postgresql+psycopg2://{db_user}:{db_password}'
    f'@{db_config["host"]}:{db_config["port"]}/{db_config["db"]}'
)

# NOTE(review): not referenced by any other cell visible in this notebook;
# presumably a DD.MM.YYYY date range -- confirm before relying on it.
dates = ["01.05.2022","05.05.2022"]

## load the data from the database and clean the data

In [2]:
# Source table and the location where the fitted outlier mask is persisted.
position_table = 'tbl_positionnew'
outlier_model_path = parent_fold / "modelle" / "simple_outlier_mask.pckl"

# Resolve the column list from the 'position_unrel_cols' config entry, then
# load those columns from the position table.
# NOTE(review): 'position_unrel_cols' presumably lists columns to leave out --
# confirm against functions.data_loads.get_columns.
position_columns = get_columns(engine, position_table, column_conf['position_unrel_cols'])
data = load_data(engine, position_table, position_columns)

SOM = SimpleOutlierMask()
# Comment out the fit (and enable the load below instead) when the mask
# should not be retrained from scratch.
SOM.fit(data['speedoverground'])
#SOM.load(outlier_model_path)
data['sog'] = SOM.transform(data['speedoverground'])
SOM.save(outlier_model_path)

# The raw speed column is superseded by the transformed 'sog' column.
data.drop(columns=['speedoverground'], inplace=True)
data.head()

Unnamed: 0,aisclass,courseoverground,latitude,longitude,messagetype,mmsi,msgtime,navigationalstatus,rateofturn,trueheading,sog
0,A,279.0,60.15,5.39,1,257072700,2022-04-07 17:00:00,0,0.0,226.0,0.0
1,A,0.0,58.15,8.03,1,258012180,2022-04-07 17:00:00,15,0.0,0.0,0.2
2,A,249.0,65.99,12.26,3,258489000,2022-04-07 17:00:00,5,0.0,206.0,0.0
3,A,179.5,68.36,16.01,3,257837000,2022-04-07 16:50:00,0,-5.0,174.0,13.9
4,A,206.5,63.22,4.32,1,311000548,2022-04-07 17:00:00,0,11.0,205.0,14.9


## perform the transformations
The grouping keys are:
latitude, longitude, time

In [4]:
# Bucket every position into a grid cell by rounding both coordinates to one
# decimal and joining them into a "lat,lon" string key.
# Adding 0.0 turns a rounded negative zero (e.g. -0.04 -> -0.0) into +0.0, so
# the same cell is never split into the two distinct keys "-0.0" and "0.0".
lat_cell = data['latitude'].round(decimals=1) + 0.0
lon_cell = data['longitude'].round(decimals=1) + 0.0
data['lat_lon'] = lat_cell.astype(str) + ',' + lon_cell.astype(str)

## transform data features:
* count number of unique ships(mmsi) in area
* avg speed in area
* avg rateofturn
* avg trueheading?
* avg courseoverground


In [5]:
# Aggregate the positions per grid cell ('lat_lon') and timestamp ('msgtime').
# Output columns are named '<source column>_<statistic>'.
stat_features = ['sog', 'trueheading', 'rateofturn', 'courseoverground']
stat_names = ['mean', 'min', 'max']

# String aliases are used for every statistic (including 'nunique') so pandas
# can use its built-in groupby implementations instead of calling a Python
# function once per group.
named_aggs = {
    'mmsi_nunique': ('mmsi', 'nunique'),      # distinct ships per cell/time
    'aisclass_count': ('aisclass', 'count'),  # non-null messages per cell/time
}
named_aggs.update(
    {f'{feature}_{stat}': (feature, stat)
     for feature in stat_features
     for stat in stat_names}
)

# Named aggregation yields flat column names directly, so no MultiIndex
# flattening is needed; reset_index turns the group keys back into columns.
data_grouped = data.groupby(['lat_lon', 'msgtime']).agg(**named_aggs).reset_index()
print("new DF has Shape: ", data_grouped.shape, " Old Df has Shape:", data.shape)
data_grouped.head()

new DF has Shape:  (6206947, 16)


Unnamed: 0,lat_lon,msgtime,mmsi_nunique,aisclass_count,sog_mean,sog_min,sog_max,trueheading_mean,trueheading_min,trueheading_max,rateofturn_mean,rateofturn_min,rateofturn_max,courseoverground_mean,courseoverground_min,courseoverground_max
0,"56.2,3.1",2022-06-06 05:10:00,1,1,8.8,8.8,8.8,9.0,9.0,9.0,0.0,0.0,0.0,4.3,4.3,4.3
1,"56.2,3.1",2022-06-06 05:20:00,1,1,8.7,8.7,8.7,8.0,8.0,8.0,0.0,0.0,0.0,4.7,4.7,4.7
2,"56.2,3.2",2022-04-17 06:50:00,1,1,12.6,12.6,12.6,303.5,303.5,303.5,0.0,0.0,0.0,308.1,308.1,308.1
3,"56.2,3.3",2022-04-17 06:40:00,1,1,12.5,12.5,12.5,305.0,305.0,305.0,0.0,0.0,0.0,309.5,309.5,309.5
4,"56.2,3.3",2022-04-17 06:50:00,1,1,12.5,12.5,12.5,304.0,304.0,304.0,0.0,0.0,0.0,308.3,308.3,308.3


In [9]:
# Report the table dimensions after and before the aggregation.
shape_report = ("new DF has Shape: ", data_grouped.shape, " Old Df has Shape:", data.shape)
print(*shape_report)

new DF has Shape:  (6206947, 16)  Old Df has Shape: (19372876, 12)


In [11]:
# Persist the aggregated frame to the database, replacing the table if it
# already exists; the DataFrame index is not written.
data_grouped.to_sql(
    name="tbl_positionagg",
    con=engine,
    if_exists='replace',
    index=False,
)

947