In [1]:
import os
import pandas as pd
import datetime
# Raise pandas' display limits so wide and long frames are shown in full
# in the notebook output instead of being elided.
for option_name, limit in (('display.max_columns', 100),
                           ('display.max_rows', 500)):
    pd.set_option(option_name, limit)

In [2]:
# Load every scraped chunk from ./data into a single frame.
#
# A file counts as a chunk when the third character of its name is "_".
# Slicing (name[2:3]) is used rather than indexing so that very short file
# names are skipped instead of raising IndexError.
#
# os.listdir() returns names in arbitrary order; sorting makes the row order
# (and therefore any later order-dependent step such as
# drop_duplicates(keep="last")) reproducible between runs.
chunk_names = sorted(
    name for name in os.listdir("data") if name[2:3] == "_"
)

# Read all chunks first and concatenate once: calling pd.concat inside the
# loop re-copies all rows accumulated so far on every iteration.
frames = [pd.read_parquet(os.path.join("data", name)) for name in chunk_names]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# chunk matches. Original row indices are kept (no ignore_index).
df = pd.concat(frames) if frames else pd.DataFrame()

In [3]:
def _scrape_bucket(stamp):
    """Return the first 14 characters of the value's string form.

    For the timestamps shown in this notebook that is 'YYYY-MM-DD HH:',
    i.e. an hour-level key (despite the column being named '..._minuty').
    """
    return str(stamp)[:14]


# Coarse scrape-time key, used later to de-duplicate repeated scrapes.
df['oscrapovano_minuty'] = df['oscrapovano'].map(_scrape_bucket)

In [4]:
# Eyeball the freshly built scrape-time key (last expression is displayed).
df.loc[:, 'oscrapovano_minuty']

0        2024-11-07 18:
1        2024-11-07 18:
2        2024-11-07 18:
3        2024-11-07 18:
4        2024-11-07 18:
              ...      
15225    2024-11-20 10:
15226    2024-11-20 10:
15227    2024-11-20 00:
15228    2024-11-20 00:
15229    2024-11-20 00:
Name: oscrapovano_minuty, Length: 1363357, dtype: object

In [5]:
# Keep a single row per (origin, destination, departure, scrape-time bucket).
# keep="last" retains whichever duplicate comes last in the current row
# order, so the outcome depends on how the frame was assembled above.
# NOTE(review): 'prodejce' is not part of the key — confirm that offers from
# different sellers can never share the same route, departure and bucket.
klic_duplicit = ['odkud', 'kam', 'odjezd', 'oscrapovano_minuty']
df = df.drop_duplicates(subset=klic_duplicit, keep="last")

In [6]:
# Number of rows left after de-duplication.
df.shape[0]

661211

In [7]:
# Sanity check of scraper coverage: row count per seller per scrape day.
denni_okno = pd.Grouper(key='oscrapovano', freq='D')
df.groupby(['prodejce', denni_okno]).size()

prodejce  oscrapovano
ARR       2024-11-07      2297
          2024-11-08      7128
          2024-11-09      3320
          2024-11-10      3157
          2024-11-11      3367
          2024-11-12      3323
          2024-11-13      3632
          2024-11-14      3072
          2024-11-15      3552
          2024-11-16      3498
          2024-11-17      3254
          2024-11-18      3310
          2024-11-19      3384
          2024-11-20      3393
LE        2024-11-07       332
          2024-11-08      1573
          2024-11-09      1873
          2024-11-10      1755
          2024-11-11      1697
          2024-11-12      1889
          2024-11-13      1886
          2024-11-14      1981
          2024-11-15      2063
          2024-11-16      2002
          2024-11-17      2004
          2024-11-18      1988
          2024-11-19      2050
          2024-11-20      2042
RJ        2024-11-06      3676
          2024-11-07      1853
          2024-11-08     11164
          2024-11

In [8]:
# Order rows chronologically by scrape time and renumber the index 0..n-1
# (ignore_index=True replaces the separate reset_index(drop=True) call).
df = df.sort_values("oscrapovano", ignore_index=True)

In [9]:
# Discard rows missing any of the identifying fields.
# dropna's default how='any' drops a row when at least one of them is null.
povinne_sloupce = ['odkud', 'kam', 'odjezd', 'oscrapovano']
df = df.dropna(subset=povinne_sloupce)

In [10]:
# Drop bus connections. Rows whose 'prostredek' is missing compare as
# "not equal" and are therefore kept.
neni_autobus = df['prostredek'].ne('autobus')
df = df.loc[neni_autobus]

In [11]:
# Czech weekday abbreviations keyed by pandas' dayofweek number (0..6).
days = dict(enumerate(['po', 'út', 'st', 'čt', 'pá', 'so', 'ne']))

# Label every departure with its weekday abbreviation.
cislo_dne = df['odjezd'].dt.dayofweek
df['den'] = cislo_dne.map(days)

In [12]:
# Express the 'predstih' timedelta in two numeric units: its whole-day
# component and its total length in (fractional) hours.
predstih = df['predstih']
df['predstih_d'] = predstih.dt.days
df['predstih_h'] = predstih.dt.total_seconds() / 3600

In [13]:
# Keep only rows whose lead time is greater than -3 hours; rows with a
# missing 'predstih_h' fail the comparison and are dropped as well.
# NOTE(review): presumably this removes offers scraped long after departure
# — confirm the sign convention of 'predstih'.
dostatecny_predstih = df['predstih_h'].gt(-3)
df = df.loc[dostatecny_predstih]

In [14]:
# Text columns that are converted to the pandas 'category' dtype below.
kategoricka_data = [
    'odkud',
    'kam',
    'prodejce',
    'den',
]

In [15]:
# Convert each listed column to 'category' and print its deep memory
# footprint (bytes) before and after, to show the saving.
for column in kategoricka_data:
    size_before = df[column].memory_usage(deep=True)
    print("Before:", size_before)

    df[column] = df[column].astype('category')

    size_after = df[column].memory_usage(deep=True)
    print("After: ", size_after)

Before: 42215968
After:  5251224
Before: 42212588
After:  5249720
Before: 44052596
After:  5242387
Before: 38610880
After:  5242707


In [16]:
# Make sure the price column has a numeric dtype; pd.to_numeric raises on
# values it cannot parse (default errors='raise').
df['cena'] = df['cena'].pipe(pd.to_numeric)

In [17]:
# (rows, columns) of the cleaned frame.
(len(df.index), len(df.columns))

(582443, 26)

In [18]:
# Size of the set of distinct (departure, origin, destination) triples.
klic_spoje = ['odjezd', 'odkud', 'kam']
df.loc[:, klic_spoje].drop_duplicates().shape

(56715, 3)

In [19]:
# Columns, in order, that are written to the exported files.
poradi = [
    'oscrapovano',
    'prodejce',
    'odkud',
    'kam',
    'odjezd',
    'predstih',
    'predstih_d',
    'predstih_h',
    'cena',
    'prostredek',
    'volnych_mist',
    'obsazenost',
    'jizdni_doba',
    'vzdalenost',
    'zpozdeni',
    'cena_poznamka',
    'den',
    'prestupy',
    'vlaky',
    'mistenka_zdarma',
    'nahradni_bus',
    'volna_mista_economy',
    'volna_mista_economy_plus',
    'volna_mista_economy_business',
    'volna_mista_premium',
]

In [20]:
# Persist the cleaned data set, restricted to the export columns.
cesta_parquet = os.path.join("data", "jizdenky.parquet")
df[poradi].to_parquet(cesta_parquet)

In [21]:
# Rows scraped within the 7 days (168 hours) before the most recent scrape.
# The lower bound is exclusive.
nejnovejsi = df['oscrapovano'].max()
zacatek_tydne = nejnovejsi - datetime.timedelta(days=7)
df_tyden = df[df['oscrapovano'] > zacatek_tydne]

In [22]:
# Write the last-week slice as CSV. The frame's index is written as the
# first column (to_csv default index=True).
cesta_csv = os.path.join("data", "jizdenky_tyden.csv")
df_tyden[poradi].to_csv(cesta_csv)