In [52]:
import gzip
import json
import os
import re
import shlex
import shutil

import pandas as pd

In [106]:
# Useful function
def underscore_lower_match(match):
    '''Replacement callback for re.sub that turns a captured letter into "_<lowercase letter>".

    Parameters:
    match: <re.Match> match object whose first capture group holds the text to convert

    Returns:
    <string> underscore followed by the lower-cased content of capture group 1
    '''
    captured = match.group(1)
    lowered = captured.lower()
    return '_' + lowered

In [2]:
# Endpoint that is passed to cURL to download the train compositions.
url_compositions = 'https://rata.digitraffic.fi/api/v1/compositions'
# Directory the download is written to.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data_directory = '/home/felbus/ml_for_physicists/open_rail/Fin_Rail_analysis/'
# File name (inside data_directory) used for the downloaded archive.
file_name = 'test.gz'

def cURL(url, *options):
    '''Run the system's curl binary for the given URL through the shell.

    Parameters:
    url: <string> URL to be called by cURL. It is shell-quoted before it is
        placed on the command line, so characters such as "?", "&" or spaces
        reach curl unchanged instead of being interpreted by the shell.
    *options: <string> optional arguments passed to the cURL call. Each one is
        inserted verbatim as a shell fragment (separated by single spaces), so
        callers can pass pre-quoted pieces such as "-H 'Accept-Encoding: gzip'"
        and stay responsible for quoting them.

    Returns:
    <int> value returned by os.system for the composed curl command
    '''
    # Options stay raw shell text on purpose (existing callers pass their own
    # quoting); only the URL is quoted here. URLs made up of shell-safe
    # characters are left untouched by shlex.quote.
    command = ' '.join(['curl', *(f'{opt}' for opt in options), shlex.quote(f'{url}')])
    return os.system(command)

def mod_file_extension(file, extension='.gz'):
    '''Rename a file so that its extension is replaced by a new one.

    Parameters:
    file: <string> file name together with potential directory path
    extension: <string> new file name extension (incl. dot) to be used. Default: ".gz"

    Returns:
    <int> value returned by os.system for the "mv" command
    '''
    # os.path.splitext only strips the suffix of the last path component, so
    # dots in directory names (e.g. "./data/x.json") or earlier in the file
    # name no longer truncate the path.
    root, _ = os.path.splitext(file)
    output_file = root + extension
    # Quote both paths so spaces or shell metacharacters cannot split or alter
    # the command.
    return os.system(f'mv {shlex.quote(file)} {shlex.quote(output_file)}')

def decompress_gz(file, extension='.json'):
    '''Decompress a gzip archive next to the original and return the new file's path.

    Parameters:
    file: <string> File to decompress
    extension: <string> File extension (incl. dot) for decompressed file. Default: '.json'

    Returns:
    <string> path and name of output file (passed file with its extension
    replaced by `extension`)

    Raises:
    OSError (incl. gzip.BadGzipFile) if the input cannot be read as gzip data
    or the output cannot be written.
    '''
    # Replace only the suffix of the last path component; dots elsewhere in
    # the path must not cut the output name short.
    root, _ = os.path.splitext(file)
    output_file = root + extension
    # Stream through the standard library instead of a shell pipeline: no
    # quoting problems, and a failed decompression raises instead of silently
    # leaving an empty output file behind.
    with gzip.open(file, 'rb') as compressed, open(output_file, 'wb') as target:
        shutil.copyfileobj(compressed, target)
    return output_file


In [176]:
# Download the compositions endpoint, asking the server for a gzip-encoded
# response, and let curl write it to data_directory+file_name.
cURL(url_compositions, '-H \'Accept-Encoding: gzip\'', '-o', data_directory+file_name)
# Unpack the download; file_decompressed holds the path returned by decompress_gz.
file_decompressed = decompress_gz(data_directory+file_name)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:--  0:00:06 --:--:--     0

In [175]:
# Parse the decompressed JSON file and flatten it with tweak_wagons.
# NOTE(review): tweak_wagons is defined in a cell further down in this
# notebook; on a fresh kernel that cell has to be executed first.
with open(file_decompressed) as data:
    data_new = tweak_wagons(json.load(data))
    
data_new

Unnamed: 0,wagon_type,location,sales_number,length_mm,pet,disabled,train_number,operator_short_code,train_category,train_type,departure_date,total_length,station_short_code,station_short_code.1
0,Dm12,1,1,2520,True,True,762,vr,Long-distance,HDM,2023-08-28,25,NRM,JNS


In [173]:
# Append the freshly loaded rows below the rows already collected in df.
# NOTE(review): df is rebuilt from itself, so every re-run of this cell appends
# data_new once more; the row labels of both frames are kept as they are.
df = pd.concat([df, data_new], axis=0)
df

Unnamed: 0,wagon_type,location,sales_number,length_mm,pet,disabled,train_number,operator_short_code,train_category,train_type,departure_date,total_length,station_short_code,station_short_code.1
0,Dm12,1,1,2520,True,True,426,vr,Long-distance,HDM,2023-08-28,25,KEU,HPK
1,Dm12,1,1,2520,True,True,426,vr,Long-distance,HDM,2023-08-28,25,HPK,TPE
0,Dm12,1,1,2520,True,True,762,vr,Long-distance,HDM,2023-08-28,25,NRM,JNS
0,Dm12,1,1,2520,True,True,762,vr,Long-distance,HDM,2023-08-28,25,NRM,JNS
0,Dm12,1,1,2520,True,True,762,vr,Long-distance,HDM,2023-08-28,25,NRM,JNS


In [170]:
def tweak_wagons(data_raw):
    '''Flatten composition records into a typed DataFrame with one row per wagon.

    Parameters:
    data_raw: parsed JSON handed to pd.json_normalize. Every record must provide
        'trainNumber', 'operatorShortCode', 'trainCategory', 'trainType',
        'departureDate' and 'journeySections'; every journey section must
        provide 'wagons', 'totalLength' and the 'stationShortCode' of its
        'beginTimeTableRow' and 'endTimeTableRow'.

    Returns:
    <pandas.DataFrame> wagon fields followed by the train/section metadata,
    with snake_case column names; 'length' is renamed to 'length_mm'.
    Categorical text columns are 'category', counters are 'int32' and
    'departure_date' is 'datetime64[ns]'.
    '''
    wagons = pd.json_normalize(
        data_raw,
        record_path=['journeySections', 'wagons'],  # DataFrame with as many rows as "wagons"
        meta=[
            'trainNumber',        # train number
            'operatorShortCode',  # short code of the train operator
            'trainCategory',      # train category
            'trainType',          # train type (such as "IC")
            'departureDate',      # departure date
            ['journeySection', 'totalLength'],                            # total length
            ['journeySection', 'beginTimeTableRow', 'stationShortCode'],  # station code of begin station
            ['journeySection', 'endTimeTableRow', 'stationShortCode'],    # station code of end station
        ],
        sep='_',  # use "_" to join column names built from several levels (depth > 1)
    )
    return (
        wagons
        # Keep only the column description of the deepest level.
        # NOTE(review): both station meta paths end in 'stationShortCode', so
        # the begin and end station columns end up sharing the name
        # 'station_short_code' (begin first, end second).
        .rename(axis='columns', mapper=lambda s: s.split('_')[-1])
        # Split all column names at capital letters and join the lower-cased
        # parts with "_" (camelCase -> snake_case).
        .rename(axis='columns', mapper=lambda s: re.sub(r'([A-Z])', underscore_lower_match, s).lower())
        .rename(axis='columns', mapper={'length': 'length_mm'})
        .assign(
            # An explicit unit is required: the unit-less 'datetime64' dtype is
            # rejected by astype in pandas 2.x.
            departure_date=lambda s: s.departure_date.astype('datetime64[ns]'),
            wagon_type=lambda s: s.wagon_type.astype('category'),
            train_number=lambda s: s.train_number.astype('int32'),
            operator_short_code=lambda s: s.operator_short_code.astype('category'),
            train_category=lambda s: s.train_category.astype('category'),
            train_type=lambda s: s.train_type.astype('category'),
            total_length=lambda s: s.total_length.astype('int32'),
        )
        .astype({'location': 'int32',
                 'sales_number': 'int32',
                 'length_mm': 'int32'})
    )

In [153]:
# Build the initial wagon table from data_raw.
# NOTE(review): data_raw is not assigned in any cell visible in this part of
# the notebook - confirm where it is loaded before a fresh run.
df = tweak_wagons(data_raw)

In [152]:
# Print column names, non-null counts and dtypes of the flattened table.
tweak_wagons(data_raw).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   wagon_type           2 non-null      category      
 1   location             2 non-null      int32         
 2   sales_number         2 non-null      int32         
 3   length_mm            2 non-null      int32         
 4   pet                  2 non-null      bool          
 5   disabled             2 non-null      bool          
 6   train_number         2 non-null      int32         
 7   operator_short_code  2 non-null      category      
 8   train_category       2 non-null      category      
 9   train_type           2 non-null      category      
 10  departure_date       2 non-null      datetime64[ns]
 11  total_length         2 non-null      int32         
 12  station_short_code   2 non-null      object        
 13  station_short_code   2 non-null      ob

In [61]:
# Display data_raw for inspection (the whole object is dumped).
data_raw

[{'trainNumber': 426,
  'departureDate': '2023-08-28',
  'operatorUICCode': 10,
  'operatorShortCode': 'vr',
  'trainCategory': 'Long-distance',
  'trainType': 'HDM',
  'version': 286351593156,
  'journeySections': [{'beginTimeTableRow': {'stationShortCode': 'KEU',
     'stationUICCode': 235,
     'countryCode': 'FI',
     'type': 'DEPARTURE',
     'scheduledTime': '2023-08-28T10:57:00.000Z'},
    'endTimeTableRow': {'stationShortCode': 'HPK',
     'stationUICCode': 200,
     'countryCode': 'FI',
     'type': 'ARRIVAL',
     'scheduledTime': '2023-08-28T11:11:00.000Z'},
    'locomotives': [{'location': 1,
      'locomotiveType': 'Dm12',
      'powerType': 'Diesel'}],
    'wagons': [{'wagonType': 'Dm12',
      'location': 1,
      'salesNumber': 1,
      'length': 2520,
      'pet': True,
      'disabled': True}],
    'totalLength': 25,
    'maximumSpeed': 120,
    'attapId': 329699415,
    'saapAttapId': 329699465},
   {'beginTimeTableRow': {'stationShortCode': 'HPK',
     'stationUICC

In [13]:
# Display data_raw for inspection (the whole object is dumped).
data_raw

[{'trainNumber': 70,
  'departureDate': '2023-08-25',
  'operatorUICCode': 10,
  'operatorShortCode': 'vr',
  'trainCategory': 'Long-distance',
  'trainType': 'IC',
  'version': 286333719793,
  'journeySections': [{'beginTimeTableRow': {'stationShortCode': 'OL',
     'stationUICCode': 370,
     'countryCode': 'FI',
     'type': 'DEPARTURE',
     'scheduledTime': '2023-08-25T09:24:00.000Z'},
    'endTimeTableRow': {'stationShortCode': 'KV',
     'stationUICCode': 480,
     'countryCode': 'FI',
     'type': 'ARRIVAL',
     'scheduledTime': '2023-08-25T16:52:00.000Z'},
    'locomotives': [{'location': 1,
      'locomotiveType': 'Sr2',
      'powerType': 'Electric'}],
    'wagons': [{'wagonType': 'Ed',
      'location': 2,
      'salesNumber': 6,
      'length': 2640},
     {'wagonType': 'Ed', 'location': 3, 'salesNumber': 5, 'length': 2640},
     {'wagonType': 'Eds',
      'location': 4,
      'salesNumber': 4,
      'length': 2640,
      'playground': True,
      'disabled': True},
     

In [157]:
# Show the Python type of data_raw.
type(data_raw)

list

In [54]:
# Check whether the 'journeySections' entry of json_dict is a dict.
# The class itself is passed: type(dict) evaluates to `type`, which would
# instead test whether the value is a class.
isinstance(json_dict['journeySections'], dict)

{0: [{'beginTimeTableRow': {'stationShortCode': 'PRI',
    'stationUICCode': 220,
    'countryCode': 'FI',
    'type': 'DEPARTURE',
    'scheduledTime': '2023-08-25T07:03:00.000Z'},
   'endTimeTableRow': {'stationShortCode': 'TPE',
    'stationUICCode': 160,
    'countryCode': 'FI',
    'type': 'ARRIVAL',
    'scheduledTime': '2023-08-25T08:36:00.000Z'},
   'locomotives': [{'location': 4,
     'locomotiveType': 'Sr2',
     'powerType': 'Electric'}],
   'wagons': [{'wagonType': 'Edo',
     'location': 1,
     'salesNumber': 1,
     'length': 2740,
     'pet': True},
    {'wagonType': 'CEd', 'location': 2, 'salesNumber': 2, 'length': 2640},
    {'wagonType': 'Edfs',
     'location': 3,
     'salesNumber': 4,
     'length': 2640,
     'playground': True,
     'disabled': True}],
   'totalLength': 99,
   'maximumSpeed': 140,
   'attapId': 365205565,
   'saapAttapId': 365207015}]}

In [48]:
# Display json_dict for inspection.
# NOTE(review): json_dict is not assigned in any cell visible in this part of
# the notebook.
json_dict

{'trainNumber': {0: 464},
 'departureDate': {0: '2023-08-25'},
 'operatorUICCode': {0: 10},
 'operatorShortCode': {0: 'vr'},
 'trainCategory': {0: 'Long-distance'},
 'trainType': {0: 'IC'},
 'version': {0: 286330743404},
 'journeySections': {0: [{'beginTimeTableRow': {'stationShortCode': 'PRI',
     'stationUICCode': 220,
     'countryCode': 'FI',
     'type': 'DEPARTURE',
     'scheduledTime': '2023-08-25T07:03:00.000Z'},
    'endTimeTableRow': {'stationShortCode': 'TPE',
     'stationUICCode': 160,
     'countryCode': 'FI',
     'type': 'ARRIVAL',
     'scheduledTime': '2023-08-25T08:36:00.000Z'},
    'locomotives': [{'location': 4,
      'locomotiveType': 'Sr2',
      'powerType': 'Electric'}],
    'wagons': [{'wagonType': 'Edo',
      'location': 1,
      'salesNumber': 1,
      'length': 2740,
      'pet': True},
     {'wagonType': 'CEd', 'location': 2, 'salesNumber': 2, 'length': 2640},
     {'wagonType': 'Edfs',
      'location': 3,
      'salesNumber': 4,
      'length': 2640,


In [42]:
# Check whether the 'journeySections' entry of json_dict is a dict.
# The class itself is passed: type(dict) evaluates to `type`, which would
# instead test whether the value is a class.
isinstance(json_dict['journeySections'], dict)

True