# Finish open rail analysis

In [2]:
import json
import os
import pandas as pd
import re

In [80]:
# Useful function
def underscore_lower_match(match):
    return '_'+match.group(1).lower()

def cURL(url, *options):
    '''Function will call curl on the system. 
    
    Parameters:
    url: <string> URL to be called by cURL
    *options: <string> optional arguments passed to cURL-call
    
    Returns: 
    os.system call (cURL command)
    '''    
    opt_string = str() #init string and concatenate all optinal arguments, seperator: whitespace
    for i, val in enumerate(options):
        opt_string += f'{val} '
    # Call cURL command on system using concatenated string and url-string
    return os.system(f'curl ' + opt_string + f'{url}')

def mod_file_extension(file, extension='.gz'):
    '''Function will replace the file's extension.
    
    Parameters:
    file: <string> file name together with potential directory path
    extension: <string> new file name extension (incl. dot) to be used. Default: ".gz"
    
    Returns:
    os.system call (mv file)
    '''
    split_string = file.split(sep='.') # splitting file name at ".", which exclusively
    output_file = split_string[0]+extension
    return os.system(f'mv {file} {output_file}')

def decompress_gz(file, extension='.json'):
    '''Function decompresses archive and returns file path.
    
    Parameters:
    file: <string> File to decompress
    extension: <string> File extension (incl. dot) for decompressed file. Default: '.json'
    
    Returns:
    <string> path and name of output file (composed of passed file and extension)
    '''
    split_string = file.split('.')
    output_file = split_string[0]+extension
    os.system(f'gzip -d -c {file} > {output_file}')
    return output_file

def tweak_wagons(data_raw):
    return (pd.json_normalize(data_raw, record_path=['journeySections', 'wagons'], # DataFrame with as many rows as "wagons"
        meta=['trainNumber', # Add train number to DataFrame
              'operatorShortCode', # Add short code of train operator to DataFrame
              'trainCategory', # Add train category (such that "IC") to DataFrame
              'trainType', # Add train type to DataFrame
              'departureDate',
              ['journeySection', 'totalLength'], # Add total length of train to DataFrame
              ['journeySection', 'maximumSpeed'], # Add total speed column
              ['journeySection', 'beginTimeTableRow', 'stationShortCode'], # Add station code of begin station
              ['journeySection', 'endTimeTableRow', 'stationShortCode']], # Add station code of end station
        sep='_' # use "_" to concatenate column names, when made up from several contributions (depth > 1)
    ).rename(axis='columns', mapper=lambda s: s.split('_')[-1]) # keep only clumn description of deepest level
    # split all column names at capital letters and concatenate lower cased parts with "_"        
    .rename(axis='columns', mapper=lambda s: re.sub(r'([A-Z])', underscore_lower_match, s).lower())
    .rename(axis='columns', mapper={'length': 'length_cm'})
    .assign(departure_date=lambda s: s.departure_date.astype('datetime64'), # Departure Date as Datetime type
        # Assign data type given in documentation on rata.traffic.fi or appropriate one 
        train_number=lambda s: s.train_number.astype('int64'),
        operator_short_code=lambda s: s.operator_short_code.astype('category'),
        train_category=lambda s: s.train_category.astype('category'),
        train_type=lambda s: s.train_type.astype('category'),
        total_length=lambda s: s.total_length.astype('int32'),
        maximum_speed=lambda s: s.maximum_speed.astype('int32'),
        # For following bool columns, missing value means "False"
        playground=lambda s: s.playground.fillna(value=False),
        video=lambda s: s.video.fillna(value=False),
        disabled=lambda s: s.disabled.fillna(value=False), 
        catering=lambda s: s.catering.fillna(value=False), 
        pet=lambda s: s.pet.fillna(value=False), 
        luggage=lambda s: s.luggage.fillna(value=False),
        # Fill missing values of wagon type with "unknow" > 40% missing values, > 20 diff. wagons
        wagon_type=lambda s: s.wagon_type.fillna(value='unknown')
            .astype('category'))
    .astype({'location': 'int32',
            'sales_number': 'int32',
            'length_mm': 'int32'})
)

## Loading and cleaning some data about train compositions

In [87]:
# Load all train compositions of a specific day

# Define URL for cURL GET 
url_compositions = 'https://rata.digitraffic.fi/api/v1/compositions/2015-12-12'
# Define directory and file name to store received data
data_directory = '/home/felbus/ml_for_physicists/open_rail/Fin_Rail_analysis/'
file_name = 'test_day.gz'

# Send cURL GET and decompressed received data.
cURL(url_compositions, '-H \'Accept-Encoding: gzip\'', '-o', data_directory+file_name)
file_decompressed = decompress_gz(data_directory+file_name)

# Turn data into DataFrame, that lists every wagon one time.
with open(file_decompressed) as data:
    df_wagons = tweak_wagons(json.load(data))

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 25448    0 25448    0     0  73441      0 --:--:-- --:--:-- --:--:-- 73549


## Exploratory data analysis

In [88]:
# Any missing values left
print(f'Any missing data in DataFrame? {df_wagons.isna().any().any()}')

Any missing data in DataFrame? False


In [94]:
df_wagons.train_category.value_counts()

Long-distance    1193
Commuter          642
Name: train_category, dtype: int64

In [91]:
df_wagons

Unnamed: 0,wagon_type,location,sales_number,length_mm,playground,video,disabled,catering,pet,luggage,train_number,operator_short_code,train_category,train_type,departure_date,total_length,maximum_speed,station_short_code,station_short_code.1
0,Ed,1,5,2640,False,False,False,False,False,False,1,vr,Long-distance,IC,2015-12-12,152,200,HKI,JNS
1,Edfs,2,4,2640,True,True,True,False,False,False,1,vr,Long-distance,IC,2015-12-12,152,200,HKI,JNS
2,ERd,3,3,2640,False,False,False,True,False,False,1,vr,Long-distance,IC,2015-12-12,152,200,HKI,JNS
3,Edb,4,2,2640,False,False,False,False,True,False,1,vr,Long-distance,IC,2015-12-12,152,200,HKI,JNS
4,Edo,5,1,2740,False,True,False,False,True,False,1,vr,Long-distance,IC,2015-12-12,152,200,HKI,JNS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1830,unknown,2,0,5300,False,False,False,False,False,False,65013,vr,Commuter,HV,2015-12-12,106,120,HKI,ILR
1831,M,1,0,5440,False,False,False,False,False,False,65014,vr,Commuter,HV,2015-12-12,54,120,ILR,HKI
1832,M,1,0,5440,False,False,False,False,False,False,65015,vr,Commuter,HV,2015-12-12,54,120,HKI,ILR
1833,M,1,0,5440,False,False,False,False,False,False,65016,vr,Commuter,HV,2015-12-12,54,120,ILR,HKI
