# A Jupyter notebook to read tracking files and (ultimately) create a data package representation of these 

In [1]:
# import needed libraries
import sys
import io
import os
import csv
import pandas as pd
import tabulator 
from tabulator import Stream

In [2]:
path = 'C://Users/Paola//Documents//UGent//PostDoc//DataPackage' # path to the folder where the csv files are
os.chdir(path)
# this is the tracking file we are reading
# f = 'trajectories.csv' example out of Sami's toolbox
f = 'tracks_fMLP.csv' #example out of Essen's commercial toolbox

In [3]:
# a quick visualization of the file
with Stream(f, headers=1) as stream:
    print(stream.headers) # will print headers from 1 row
    for row in stream:
        print(row)

['Line', 'Track N', 'Time Sample N', 'X', 'Y']
['1', '1', '1', '144', '308']
['2', '1', '2', '136', '296']
['3', '1', '3', '124', '296']
['4', '1', '4', '124', '296']
['5', '1', '5', '116', '292']
['6', '1', '6', '112', '292']
['7', '1', '7', '120', '296']
['8', '1', '8', '116', '296']
['9', '2', '1', '192', '416']
['10', '2', '2', '200', '404']
['11', '2', '3', '200', '404']
['12', '2', '4', '204', '404']
['13', '2', '5', '212', '400']
['14', '2', '6', '204', '388']
['15', '2', '7', '200', '388']
['16', '2', '8', '212', '376']
['17', '2', '9', '212', '368']
['18', '2', '10', '208', '360']
['19', '2', '11', '208', '360']
['20', '2', '12', '204', '352']
['21', '2', '13', '204', '348']
['22', '2', '14', '196', '340']
['23', '2', '15', '196', '340']
['24', '2', '16', '188', '332']
['25', '2', '17', '184', '324']
['26', '2', '18', '176', '312']
['27', '2', '19', '172', '312']
['28', '2', '20', '168', '308']
['29', '2', '21', '164', '300']
['30', '2', '22', '156', '296']
['31', '2', '23', '

In [4]:
import pandas as pd

In [5]:
# getting the file into a pandas dataframe
df = pd.read_csv(f)

In [6]:
df.head()

Unnamed: 0,Line,Track N,Time Sample N,X,Y
0,1,1,1,144,308
1,2,1,2,136,296
2,3,1,3,124,296
3,4,1,4,124,296
4,5,1,5,116,292


In [7]:
# the user will need to map the joint_identifier!
#joint_identifier = 'track_no'
joint_identifier = 'Track N'

In [8]:
# do the grouping by the joint identifier
grouped = df.groupby(joint_identifier)

In [9]:
# a local check
for name, group in grouped:
    print(name)
    print(group)

1
   Line  Track N  Time Sample N    X    Y
0     1        1              1  144  308
1     2        1              2  136  296
2     3        1              3  124  296
3     4        1              4  124  296
4     5        1              5  116  292
5     6        1              6  112  292
6     7        1              7  120  296
7     8        1              8  116  296
2
    Line  Track N  Time Sample N    X    Y
8      9        2              1  192  416
9     10        2              2  200  404
10    11        2              3  200  404
11    12        2              4  204  404
12    13        2              5  212  400
13    14        2              6  204  388
14    15        2              7  200  388
15    16        2              8  212  376
16    17        2              9  212  368
17    18        2             10  208  360
18    19        2             11  208  360
19    20        2             12  204  352
20    21        2             13  204  348
21    22        

In [10]:
# the number of events for each joint_identifier (track identifier)
grouped.size()

Track N
1        8
2       24
3       24
4       33
5       11
6       37
7       19
8       31
9      149
10     163
11      23
12      55
13     135
14      27
15      37
16     188
17      52
18     340
19     451
20      87
21      47
22      96
23     174
24      56
25      38
26      87
27      20
28      46
29      53
30       9
      ... 
229     33
230     61
231      9
232     15
233      8
234      9
235     54
236     42
237     45
238     44
239     11
240     12
241     21
242     35
243     29
244     23
245     27
246     30
247     28
248     26
249     25
250     24
251     22
252     21
253     15
254     13
255     11
256     11
257     10
258      8
dtype: int64

In [11]:
# put everything back into a dataframe
dfs = []
for name, group in grouped:
    df = group.reset_index()
    dfs.append(df)

In [12]:
objects_df = pd.concat(dfs)
events_df = pd.DataFrame(grouped.size())
events_df = events_df.reset_index()

In [13]:
events_df.columns = [joint_identifier, 'events_size']

In [14]:
events_df

Unnamed: 0,Track N,events_size
0,1,8
1,2,24
2,3,24
3,4,33
4,5,11
5,6,37
6,7,19
7,8,31
8,9,149
9,10,163


In [15]:
# write the dataframes to csv
objects_df.to_csv('objects.csv', index=False)
events_df.to_csv('events.csv', index=False)

In [16]:
import datapackage as dp
import jsontableschema as jt

In [17]:
# import the infer from the jsontableschema
from jsontableschema import infer

In [18]:
filepath = 'objects.csv'

In [None]:
with io.open(filepath) as stream:
    headers = stream.readline().rstrip('\n').split(',')
    values = csv.reader(stream)
    # infer the schema
    schema = infer(headers, values)

    dp.descriptor['resources'] = [
        {
            'name': 'data',
            'path': filepath,
            'schema': schema
        }
    ]

In [None]:
# write the datapackage.json to file
with open('datapackage.json', 'w') as f:
    f.write(dp.to_json())