In [95]:
import os
import pandas as pd
import datetime
# Widen the notebook's DataFrame rendering limits so wide/long frames are not truncated.
for _option, _value in {'display.max_columns': 100, 'display.max_rows': 500}.items():
    pd.set_option(_option, _value)

In [96]:
# Load every raw file in "data" whose third character is "_" and stack them
# into a single frame.
# - The length guard keeps short file names from raising IndexError on x[2].
# - Sorting makes the concatenation order deterministic (os.listdir returns
#   entries in arbitrary order), which matters for the later keep="last" dedup.
# - All frames are read first and concatenated once; calling pd.concat inside
#   the loop re-copies the accumulated data on every iteration.
soubory = sorted(x for x in os.listdir("data") if len(x) > 2 and x[2] == "_")
if not soubory:
    # Fail here with a clear message instead of a KeyError in a later cell.
    raise FileNotFoundError('No input files with "_" as third character found in "data"')
df = pd.concat([pd.read_parquet(os.path.join("data", f)) for f in soubory])

In [97]:
# Bucket key for deduplication: the first 15 characters of the scrape timestamp's
# string form, e.g. "2024-11-07 18:3". Despite the column name, this cuts off
# after the tens digit of the minute, so each bucket spans 10 minutes.
df['oscrapovano_minuty'] = df['oscrapovano'].map(lambda ts: str(ts)[:15])

In [98]:
# Inspect the truncated-timestamp bucket column.
df['oscrapovano_minuty']

0        2024-11-07 18:3
1        2024-11-07 18:3
2        2024-11-07 18:3
3        2024-11-07 18:3
4        2024-11-07 18:3
              ...       
12797    2024-11-10 00:0
12798    2024-11-10 00:0
12799    2024-11-10 14:1
12800    2024-11-10 14:1
12801    2024-11-10 14:1
Name: oscrapovano_minuty, Length: 528765, dtype: object

In [99]:
# Keep one row per (origin, destination, departure, scrape-time bucket);
# when several rows share the key, the last one in the current row order wins.
# NOTE(review): 'prodejce' is not part of the key, so rows from different sellers
# with the same route/departure/bucket collapse into one — confirm this is intended.
_je_duplicitni = df.duplicated(
    subset=['odkud', 'kam', 'odjezd', 'oscrapovano_minuty'],
    keep="last",
)
df = df[~_je_duplicitni]

In [100]:
# Sanity check: number of rows per seller and per calendar day of scraping.
(
    df
    .groupby(['prodejce', pd.Grouper(key='oscrapovano', freq='D')])
    .size()
)

prodejce  oscrapovano
ARR       2024-11-07      2497
          2024-11-08      7238
          2024-11-09      3655
          2024-11-10      3242
LE        2024-11-07       332
          2024-11-08      1615
          2024-11-09      1873
          2024-11-10      1755
RJ        2024-11-06      3877
          2024-11-07      1927
          2024-11-08     12228
          2024-11-09     14652
          2024-11-10     12683
ČD        2024-10-31      1558
          2024-11-01     15717
          2024-11-02     21853
          2024-11-03     17590
          2024-11-04     23195
          2024-11-05     22982
          2024-11-06     23426
          2024-11-07     22759
          2024-11-08     23954
          2024-11-09     25076
          2024-11-10     22781
dtype: int64

In [101]:
# Order rows chronologically by scrape time and replace the index
# (duplicated across the concatenated input files) with a fresh 0..n-1 range.
df = (
    df
    .sort_values(by="oscrapovano")
    .reset_index(drop=True)
)

In [102]:
# Convert the price column to a numeric dtype; unparseable values raise
# (errors='raise' is the pandas default, spelled out here for clarity).
df['cena'] = pd.to_numeric(df['cena'], errors='raise')

In [103]:
# Discard rows missing any of the identifying fields
# (origin, destination, departure time, scrape time).
df = df.dropna(
    how='any',
    subset=['odkud', 'kam', 'odjezd', 'oscrapovano'],
)

In [104]:
# Drop bus connections; rows with a missing 'prostredek' pass the != test and are kept.
# .copy() makes the result an independent frame, so the column assignments in the
# following cells do not trigger SettingWithCopyWarning on a filtered slice.
df = df[df['prostredek'] != 'autobus'].copy()

In [105]:
# Map the departure's day-of-week number (0-6) to a two-letter Czech abbreviation.
days = dict(enumerate(['po', 'út', 'st', 'čt', 'pá', 'so', 'ne']))
df['den'] = df['odjezd'].dt.dayofweek.map(days)

In [106]:
# Derive two numeric views of the 'predstih' timedelta column:
#   predstih_d — its whole-day component,
#   predstih_h — its total length in hours (fractional).
df = df.assign(
    predstih_d=df['predstih'].dt.days,
    predstih_h=df['predstih'].dt.total_seconds() / 3600,
)

In [107]:
# Keep only rows whose lead time is greater than -3 hours; rows where
# 'predstih_h' is missing fail the comparison and are dropped as well.
# .copy() detaches the result from the filtered slice so the in-place dtype
# conversions in later cells do not raise SettingWithCopyWarning.
df = df[df['predstih_h'] > -3].copy()

In [108]:
# Columns to be stored as pandas 'category' dtype.
kategoricka_data = [
    'odkud',
    'kam',
    'prodejce',
    'den',
]

In [109]:
# Convert each listed column to 'category' dtype, printing its deep memory
# footprint (in bytes) before and after the conversion.
for sloupec in kategoricka_data:
    puvodni = df[sloupec]
    print("Before:", puvodni.memory_usage(deep=True))
    df[sloupec] = puvodni.astype('category')
    print("After: ", df[sloupec].memory_usage(deep=True))

Before: 19614208
After:  2411240
Before: 19526421
After:  2410297
Before: 19965698
After:  2403328
Before: 17655969
After:  2403648


In [110]:
# 'cena' is already converted with pd.to_numeric in an earlier cell, so repeating
# the conversion here was a no-op. Verify the invariant instead of redoing the work.
assert pd.api.types.is_numeric_dtype(df['cena']), "'cena' is expected to be numeric at this point"

In [111]:
# Row and column counts after all cleaning steps.
df.shape

(266992, 26)

In [112]:
# Column selection and order used for the exported files.
poradi = [
    'oscrapovano',
    'prodejce',
    'odkud',
    'kam',
    'odjezd',
    'predstih',
    'predstih_d',
    'predstih_h',
    'cena',
    'prostredek',
    'volnych_mist',
    'obsazenost',
    'jizdni_doba',
    'vzdalenost',
    'zpozdeni',
    'cena_poznamka',
    'den',
    'prestupy',
    'vlaky',
    'mistenka_zdarma',
    'nahradni_bus',
    'volna_mista_economy',
    'volna_mista_economy_plus',
    'volna_mista_economy_business',
    'volna_mista_premium',
]

In [113]:
# Write the cleaned data set, restricted to and ordered by `poradi`, to Parquet.
df.loc[:, poradi].to_parquet(path=os.path.join("data", "jizdenky.parquet"))

In [114]:
# Select rows scraped within the 7 days (168 hours) preceding the newest scrape.
# The boundary is exclusive: a row exactly 7 days older than the newest is left out.
nejnovejsi = df['oscrapovano'].max()
df_tyden = df[df['oscrapovano'] > (nejnovejsi - datetime.timedelta(days=7))]

In [115]:
# Write the last-week subset, restricted to and ordered by `poradi`, to CSV.
# NOTE(review): the DataFrame index is written as the first (unnamed) column
# because index=False is not passed — confirm consumers expect that.
df_tyden.loc[:, poradi].to_csv(path_or_buf=os.path.join("data", "jizdenky_tyden.csv"))