# Finish open rail analysis

In [1]:
import json
import os
import pandas as pd
import re
import seaborn as sns
import pyarrow.feather as feather

In [32]:
# Useful function
def underscore_lower_match(match):
    return '_'+match.group(1).lower()

def cURL(url, *options):
    '''Function will call curl on the system. 
    
    Parameters:
    url: <string> URL to be called by cURL
    *options: <string> optional arguments passed to cURL-call
    
    Returns: 
    os.system call (cURL command)
    '''    
    opt_string = str() #init string and concatenate all optinal arguments, seperator: whitespace
    for i, val in enumerate(options):
        opt_string += f'{val} '
    # Call cURL command on system using concatenated string and url-string
    return os.system(f'curl ' + opt_string + f'{url}')

def mod_file_extension(file, extension='.gz'):
    '''Function will replace the file's extension.
    
    Parameters:
    file: <string> file name together with potential directory path
    extension: <string> new file name extension (incl. dot) to be used. Default: ".gz"
    
    Returns:
    os.system call (mv file)
    '''
    split_string = file.split(sep='.') # splitting file name at ".", which exclusively
    output_file = split_string[0]+extension
    return os.system(f'mv {file} {output_file}')

def decompress_gz(file, extension='.json'):
    '''Function decompresses archive and returns file path.
    
    Parameters:
    file: <string> File to decompress
    extension: <string> File extension (incl. dot) for decompressed file. Default: '.json'
    
    Returns:
    <string> path and name of output file (composed of passed file and extension)
    '''
    split_string = file.split('.')
    output_file = split_string[0]+extension
    os.system(f'gzip -d -c {file} > {output_file}')
    return output_file

def delete_file(file):
    '''
    '''
    return os.system(f'rm {file}')

def tweak_wagons(data_raw):
    '''Function partially flattens input file (json format) and customizes columns, see comments
    
    Parameters:
    data_raw <json object> Data to process
    
    Returns: 
    <pandas DataFrame> DataFrame with one column per wagon, columns as specified below.
    '''
    df = (pd.json_normalize(data_raw, record_path=['journeySections', 'wagons'], # DataFrame with as many rows as "wagons"
        meta=['trainNumber', # Add train number to DataFrame
              'operatorShortCode', # Add short code of train operator to DataFrame
              'trainCategory', # Add train category (such that "IC") to DataFrame
              'trainType', # Add train type to DataFrame
              'departureDate', # Add departure date to DataFrame
              ['journeySection', 'totalLength'], # Add total length of train to DataFrame
              ['journeySection', 'maximumSpeed'], # Add total speed column
              ['journeySection', 'beginTimeTableRow', 'stationShortCode'], # Add station code of begin station
              ['journeySection', 'endTimeTableRow', 'stationShortCode']], # Add station code of end station
        sep='_') # use "_" to concatenate column names, when made up from several contributions (depth > 1)
         )
    if not ('luggage' in df.columns):
        df = df.assign(luggage=pd.Series(index=df_wagons.index, dtype='bool', data=False))
    if not ('playground' in df.columns):
        df = df.assign(playground=pd.Series(index=df_wagons.index, dtype='bool', data=False))
    if not ('video' in df.columns):
        df = df.assign(video=pd.Series(index=df_wagons.index, dtype='bool', data=False))
    if not ('disabled' in df.columns):
        df = df.assign(disabled=pd.Series(index=df_wagons.index, dtype='bool', data=False))
    if not ('catering' in df.columns):
        df = df.assign(catering=pd.Series(index=df_wagons.index, dtype='bool', data=False))
    if not ('pet' in df.columns):
        df = df.assign(pet=pd.Series(index=df_wagons.index, dtype='bool', data=False))
    return (df
        .rename(axis='columns', mapper=lambda s: s.split('_')[-1]) # keep only clumn description of deepest level
        # split all column names at capital letters and concatenate lower cased parts with "_"        
        .rename(axis='columns', mapper=lambda s: re.sub(r'([A-Z])', underscore_lower_match, s).lower())
        .rename(axis='columns', mapper={'length': 'length_cm'})
        .assign(departure_date=lambda s: s.departure_date.astype('datetime64'), # Departure Date as Datetime type
            # Assign data type given in documentation on rata.traffic.fi or appropriate one 
            train_number=lambda s: s.train_number.astype('int64'),
            operator_short_code=lambda s: s.operator_short_code.astype('category'),
            train_category=lambda s: s.train_category.astype('category'),
            train_type=lambda s: s.train_type.astype('category'),
            total_length=lambda s: s.total_length.astype('int32'),
            maximum_speed=lambda s: s.maximum_speed.astype('int32'),
            # For following bool columns, missing value means "False"
            playground=lambda s: s.playground.fillna(value=False),
            video=lambda s: s.video.fillna(value=False),
            disabled=lambda s: s.disabled.fillna(value=False), 
            catering=lambda s: s.catering.fillna(value=False),
            pet=lambda s: s.pet.fillna(value=False),
            luggage=lambda s: s.luggage.fillna(value=False),
            # Fill missing values of wagon type with "unknow" > 40% missing values, > 20 diff. wagons
            wagon_type=lambda s: s.wagon_type.fillna(value='unknown')
                .astype('category'))
        .astype({'location': 'int32',
                 'sales_number': 'int32',
                 'length_cm': 'int32'})
)

def wagon_list(json_file):
    '''Function takes one file, opens it and returns pandas DataFrame as specified in "tweak_wagons"
    
    Parameters:
    json_file: <string> File path together with file name. File to process
    
    Returns:
    <pandas DataFrame> DataFrame as specified in function "tweak_wagons" 
    '''
    with open(json_file) as data:
        return tweak_wagons(json.load(data))
    
def collect_compositions_of_day(date='2015-12-11', 
                                working_dir='/home/felbus/ml_for_physicists/temp/'):
    '''Function collects composition information from API of "rata.digitraffic.fi" of specific day
    
    Parameters:
    date <string> Date to collect data of "yyyy-mm-dd"
    working_dir <string> Directory to temporarly store downloaded data in
    '''
    # compose url and file name information for call of cURL 
    url_compositions = 'https://rata.digitraffic.fi/api/v1/compositions/'
    url = url_compositions + date
    file_name = 'temp.gz'
    data_dir = working_dir + file_name
    # Call cURL
    cURL(url, '-H \'Accept-Encoding: gzip\'', '-o', data_dir)
    # Downloaded file is an archive, thus decompress it
    file_decompressed = decompress_gz(data_dir)
    # Turn data into DataFrame, that lists every wagon one time
    df = wagon_list(file_decompressed)
    # Delete temporary files
    delete_file(data_dir)
    delete_file(file_decompressed)
    # Return DataFrame holding every wagon
    return df

def dates_between(date_begin, date_end):
    '''Generator function that returns strings of dates.
    
    Parameters: 
    date_begin <string> First date in generator "yyyy-mm-dd"
    date_end <string> Date to end generator, caution this date is exclusive! "yyyy-mm-dd"
    
    Returns:
    Generator providing dates as strings "yyyy-mm-dd"
    '''
    date = pd.to_datetime(date_begin)
    date_stop = pd.to_datetime(date_end)
    while date < date_stop:
        yield str(date.year)+'-'+f'{date.month:02d}'+'-'+f'{date.day:02d}'
        date += pd.Timedelta(days=1)

def wagon_list_to_total_length(wagons, groupby='train_category'):
    '''Function transforms DataFrame by grouping by "departure date" and aggregate. Returned DataFrame will
    hold cummulative length of wagons in meters.
    
    Parameters:
    wagons <pandas DataFrame> DataFrame that holds at least columns "departure_date", "length_cm" and
    aggregate
    aggregate <string> Column of "wagons" to groupby
    
    Returns:
    <pandas DataFrame> index = dates, columns = groupby of groupby-parameter
    '''
    return (wagons.assign(length_cm=lambda s: s.length_cm/100)
            .groupby(['departure_date', groupby]).length_cm.sum().unstack())

def collect_total_length(date_begin, date_end):
    '''Function collects information about total length of train compositions for every date specified.
    
    Parameters:
    date_begin <string> First date to collect data for "yyyy-mm-dd"
    date_end <string> Last date (exclusive) to collect data for "yyyy-mm-dd"
    
    Returns:
    <pandas DataFrame> Index: dates, Columns: train_categories, Values: total length [m]
    '''
    df = pd.DataFrame()
    for dates in dates_between(date_begin, date_end):
        df_wagons = collect_compositions_of_day(date=dates)
        df = pd.concat([df, wagon_list_to_total_length(df_wagons)])
    df = (df.rename(axis='columns', mapper=lambda s: s.lower())
            .rename(axis='columns', mapper=lambda s:s.replace('-', '_')))
    return df

def collect_wagon_list_period(date_begin, date_end):
    '''Function collects information about train compositions for every date specified.
    
    Parameters:
    date_begin <string> First date to collect data for "yyyy-mm-dd"
    date_end <string> Last date (exclusive) to collect data for "yyyy-mm-dd"
    
    Returns:
    <pandas DataFrame> Index: dates, Columns: train_categories, Values: total length [m]
    '''
    df = pd.DataFrame()
    for dates in dates_between(date_begin, date_end):
        df_wagons = collect_compositions_of_day(date=dates)
        df = pd.concat([df, df_wagons], ignore_index=True)
    return df

def save_to_feather(df, file):
    '''Function saves DataFrame to feather file, resetting index first. Avoids problem with DateTime-Index.
    
    Parameters:
    df <pandas DataFrame> DataFrame to save
    file <string> File to save to
    
    Return:
    None
    '''
    (df
         .reset_index()
         .to_feather(file)
    )
    return None
    
def load_from_feather(file):
    '''Function laods pandas DataFrame from feather file. Expects column "departure_date" and will set it
    as index.
    
    Parameters:
    file <string> file (path+name) to load from
    
    Returns:
    <pandas DataFrame> with column "departure_date" set as index
    '''
    df = pd.read_feather(file).set_index('departure_date')
    return df

In [47]:
# Load all train compositions of a specific day

# Define URL for cURL GET 
url_compositions = 'https://rata.digitraffic.fi/api/v1/compositions/'
date = '2015-12-25'
# Define directory and file name to store received data
data_directory = '/home/felbus/ml_for_physicists/temp/'
file_name = 'test.gz'

# Send cURL GET and decompressed received data.
cURL(url_compositions+date, '-H \'Accept-Encoding: gzip\'', '-o', data_directory+file_name)
file_decompressed = decompress_gz(data_directory+file_name)

# Turn data into DataFrame, that lists every wagon one time.
with open(file_decompressed) as data:
    df_wagons1 = tweak_wagons(json.load(data))

df_wagons1.head(2)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 16571    0 16571    0     0  57380      0 --:--:-- --:--:-- --:--:-- 57538


Unnamed: 0,wagon_type,location,sales_number,length_cm,pet,playground,video,disabled,catering,luggage,train_number,operator_short_code,train_category,train_type,departure_date,total_length,maximum_speed,station_short_code,station_short_code.1
0,Edb,1,5,2640,True,False,False,False,False,False,5,vr,Long-distance,IC,2015-12-25,152,200,HKI,JNS
1,Edfs,2,4,2640,False,True,True,True,False,False,5,vr,Long-distance,IC,2015-12-25,152,200,HKI,JNS


In [46]:
# Define URL for cURL GET 
url_compositions = 'https://rata.digitraffic.fi/api/v1/compositions/'
date = '2015-12-26'
# Define directory and file name to store received data
data_directory = '/home/felbus/ml_for_physicists/temp/'
file_name = 'test.gz'

# Send cURL GET and decompressed received data.
cURL(url_compositions+date, '-H \'Accept-Encoding: gzip\'', '-o', data_directory+file_name)
file_decompressed = decompress_gz(data_directory+file_name)

# Turn data into DataFrame, that lists every wagon one time.
with open(file_decompressed) as data:
    df_wagons2 = tweak_wagons(json.load(data))

df_wagons2.head(2)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 22179    0 22179    0     0  74099      0 --:--:-- --:--:-- --:--:-- 73930


Unnamed: 0,wagon_type,location,sales_number,length_cm,playground,video,disabled,catering,pet,luggage,train_number,operator_short_code,train_category,train_type,departure_date,total_length,maximum_speed,station_short_code,station_short_code.1
0,Ed,1,6,2640,False,False,False,False,False,False,1,vr,Long-distance,IC,2015-12-26,179,200,HKI,JNS
1,Ed,2,5,2640,False,False,False,False,False,False,1,vr,Long-distance,IC,2015-12-26,179,200,HKI,JNS


In [44]:
pd.concat([df_wagons1, df_wagons2])

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [36]:
df_wagon_list = collect_wagon_list_period('2015-12-25', '2015-12-27')

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 16571    0 16571    0     0  56747      0 --:--:-- --:--:-- --:--:-- 56945
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 22179    0 22179    0     0  76784      0 --:--:-- --:--:-- --:--:-- 77010


InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [122]:
save_to_feather(df_wagons, '/home/felbus/ml_for_physicists/data/test.fea')
#load_from_feather('/home/felbus/ml_for_physicists/data/test.fea')

In [35]:
df_wagon_list

Unnamed: 0,wagon_type,location,sales_number,length_cm,playground,video,disabled,catering,pet,luggage,train_number,operator_short_code,train_category,train_type,departure_date,total_length,maximum_speed,station_short_code,station_short_code.1
0,Ed,1,5,2640,False,False,False,False,False,False,1,vr,Long-distance,IC,2015-12-15,152,200,HKI,JNS
1,Edfs,2,4,2640,True,True,True,False,False,False,1,vr,Long-distance,IC,2015-12-15,152,200,HKI,JNS
2,ERd,3,3,2640,False,False,False,True,False,False,1,vr,Long-distance,IC,2015-12-15,152,200,HKI,JNS
3,Edb,4,2,2640,False,False,False,False,True,False,1,vr,Long-distance,IC,2015-12-15,152,200,HKI,JNS
4,Edo,5,1,2740,False,True,False,False,True,False,1,vr,Long-distance,IC,2015-12-15,152,200,HKI,JNS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237,M,1,0,5440,False,False,False,False,False,False,10688,vr,Commuter,HV,2015-12-16,54,160,KV,RI
5238,M,1,0,5440,False,False,False,False,False,False,10693,vr,Commuter,HV,2015-12-16,54,120,RI,HL
5239,M,1,0,5440,False,False,False,False,False,False,10694,vr,Commuter,HV,2015-12-16,54,120,HL,RI
5240,unknown,1,0,7520,False,False,False,False,False,False,11661,vr,Commuter,HV,2015-12-16,75,160,ILR,KE


In [88]:
col = df_debug2.columns
df_debug.assign(**{s: df_debug[s] if s in col else pd.Series() for s in col})

KeyError: 'luggage'

In [87]:
df_debug2.columns

Index(['wagon_type', 'location', 'sales_number', 'length_cm', 'playground',
       'video', 'disabled', 'catering', 'pet', 'luggage', 'train_number',
       'operator_short_code', 'train_category', 'train_type', 'departure_date',
       'total_length', 'maximum_speed', 'station_short_code',
       'station_short_code'],
      dtype='object')

## Exploratory data analysis

In [22]:
# Any missing values left
print(f'Any missing data in DataFrame? {df_wagons.isna().any().any()}')

Any missing data in DataFrame? False


In [92]:
wagon_list_to_total_length(df_wagons)

train_category,Commuter,Long-distance
departure_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-04-12,94772.0,43830.54


In [27]:
df_wagons.head(10)

Unnamed: 0,wagon_type,location,sales_number,length_cm,playground,video,disabled,catering,pet,luggage,train_number,operator_short_code,train_category,train_type,departure_date,total_length,maximum_speed,station_short_code,station_short_code.1
0,Ed,1,6,2640,False,False,False,False,False,False,1,vr,Long-distance,IC,2019-04-12,179,200,HKI,JNS
1,Ed,2,5,2640,False,False,False,False,False,False,1,vr,Long-distance,IC,2019-04-12,179,200,HKI,JNS
2,Eds,3,4,2640,True,True,True,False,False,False,1,vr,Long-distance,IC,2019-04-12,179,200,HKI,JNS
3,ERd,4,3,2640,False,False,False,True,False,False,1,vr,Long-distance,IC,2019-04-12,179,200,HKI,JNS
4,Edb,5,2,2640,False,False,False,False,True,False,1,vr,Long-distance,IC,2019-04-12,179,200,HKI,JNS
5,Edo,6,1,2740,False,True,False,False,True,False,1,vr,Long-distance,IC,2019-04-12,179,200,HKI,JNS
6,Sm3,1,1,2814,False,False,False,False,False,False,2,vr,Long-distance,S,2019-04-12,160,220,JNS,HKI
7,CMH,2,2,2590,False,False,True,False,False,False,2,vr,Long-distance,S,2019-04-12,160,220,JNS,HKI
8,TTC,3,3,2590,False,False,False,True,False,False,2,vr,Long-distance,S,2019-04-12,160,220,JNS,HKI
9,TT,4,4,2590,False,False,False,False,False,False,2,vr,Long-distance,S,2019-04-12,160,220,JNS,HKI


In [45]:
df_wagons.groupby(['departure_date', 'train_category']).length_cm.sum().unstack()

train_category,Commuter,Long-distance
departure_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-04-12,9477200,4383054


In [40]:
df_wagons.groupby(['departure_date', 'train_category']).length_cm.sum().unstack().reset_index().to_feather('test.fea')

In [56]:
date = pd.to_datetime('2015-12-12')
print(date, '\n')
date_next = date + pd.Timedelta(days=1)
print(date_next)


2015-12-12 00:00:00 

2015-12-13 00:00:00


In [59]:
def infinite_sequence():
    date = pd.to_datetime('2015-12-11')
    num = 0
    while num < 10:
        yield date
        num += 1
        date 

for i in infinite_sequence():
    print(i)

2015-12-11 00:00:00
2015-12-11 00:00:00
2015-12-11 00:00:00
2015-12-11 00:00:00
2015-12-11 00:00:00
2015-12-11 00:00:00
2015-12-11 00:00:00
2015-12-11 00:00:00
2015-12-11 00:00:00
2015-12-11 00:00:00
