In [229]:
import re
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt

# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and newer releases no longer accept the old names, so use whichever name
# the installed version actually provides.
plt.style.use(
    'seaborn-whitegrid'
    if 'seaborn-whitegrid' in plt.style.available
    else 'seaborn-v0_8-whitegrid'
)

### Загрузка данных

In [259]:
# Directory holding the per-train '<num>-<class>-place.csv' / '-price.csv'
# files, resolved to an absolute path relative to the notebook's working dir.
data_dir = Path('../data/2021_05_27/').resolve()
data_dir

PosixPath('/Users/yuralytkin/Development/work/itmo/rzd-fares-analysis/data/2021_05_27')

In [263]:
def load_train_data(
        train_num: str,
        train_class: str,
        maxdays: Optional[int] = None,
) -> Optional[pd.DataFrame]:
    """Load the places and prices tables for one train/class pair.

    Reads ``<train_num>-<train_class>-place.csv`` and
    ``<train_num>-<train_class>-price.csv`` from the module-level
    ``data_dir``, transposes them so that rows are dates and columns are
    integer ``days`` values, and merges both into one long-format frame.

    Args:
        train_num: Train number exactly as it appears in the file name.
        train_class: Class label exactly as it appears in the file name.
        maxdays: If given, columns with ``days > maxdays`` are discarded.

    Returns:
        A frame with columns ``date``, ``days``, ``places``, ``price``,
        ``num`` and ``class``; ``None`` if either file does not exist or if
        no rows are left after dropping dates with too many missing values.
    """
    places_fpath = data_dir / f'{train_num}-{train_class}-place.csv'
    prices_fpath = data_dir / f'{train_num}-{train_class}-price.csv'

    if not (places_fpath.exists() and prices_fpath.exists()):
        return None

    def read_table(fpath: Path) -> pd.DataFrame:
        # Files are ';'-separated cp1251 text; after transposing, the index
        # holds dates and the columns hold integer `days` values.
        table = pd.read_csv(fpath, encoding='cp1251', sep=';', index_col=0).T
        table.index = pd.to_datetime(table.index)
        table.index.name = 'date'
        table.columns = table.columns.map(int)
        table.columns.name = 'days'

        if maxdays is not None:
            table = table.drop(columns=table.columns[table.columns > maxdays])

        return table

    places = read_table(places_fpath)
    prices = read_table(prices_fpath)

    # Scale places by the largest value observed anywhere in the table.
    places = places / places.max().max()
    # A price of 0 is treated as "no price" rather than a real fare.
    prices = prices.replace(0, np.nan)

    # Dates where more than 20% of the `places` values are missing.
    missing = places[places.isna().mean(axis=1) > 0.2].index

    places = places.drop(index=missing)
    # The two tables are outer-merged below, so they are not guaranteed to
    # share every date; ignore flagged dates that `prices` does not contain
    # instead of raising KeyError.
    prices = prices.drop(index=missing, errors='ignore')

    if places.empty or prices.empty:
        return None

    places = places.reset_index().melt(id_vars='date', var_name='days', value_name='places')
    prices = prices.reset_index().melt(id_vars='date', var_name='days', value_name='price')

    train_data = places.merge(prices, how='outer', on=['date', 'days'])

    train_data['num'] = train_num
    train_data['class'] = train_class

    return train_data

In [264]:
# Matches '<num>-<class>-price.csv' and '<num>-<class>-place.csv'. The dot is
# escaped so that it only matches a literal '.' before the extension.
fname_re = re.compile(r'([^-]+)-([^-]+)-(price|place)\.csv')

trains = set()

for fpath in data_dir.iterdir():
    match = fname_re.fullmatch(fpath.name)

    if match is None:
        continue

    train_num, train_class, _ = match.groups()
    trains.add((train_num, train_class))

# Sorted so that loading order (and therefore row order) is deterministic.
trains = sorted(trains)

# Collect the per-train frames and concatenate once: DataFrame.append was
# removed in pandas 2.0, and appending inside a loop re-copies all previously
# loaded rows on every iteration.
frames = []

for train_num, train_class in tqdm(trains):
    train_data = load_train_data(train_num, train_class, maxdays=15)

    if train_data is not None:
        frames.append(train_data)

if frames:
    data = pd.concat(frames, ignore_index=True)
else:
    # Keep the expected columns so the selection below works with no data.
    data = pd.DataFrame(columns=['date', 'days', 'places', 'price', 'num', 'class'])

# Only the (num, class) pairs for which data was actually loaded.
trains = data[['num', 'class']].drop_duplicates().values.tolist()

data.head(2)

  0%|          | 0/376 [00:00<?, ?it/s]

Unnamed: 0,date,days,places,price,num,class
0,2021-04-06,1,0.623188,3161.0,001€,Šã¯¥
1,2021-04-07,1,0.695652,3275.0,001€,Šã¯¥


In [265]:
# Mis-encoded characters seen in train numbers -> the Cyrillic letters
# they stand for.
num_charmap = {
    '€': 'А',
    '¨': 'Н',
}


def decode(s: str) -> str:
    """Return ``s`` with each character found in ``num_charmap`` replaced.

    Characters without an entry in ``num_charmap`` are kept unchanged.
    """
    return ''.join(num_charmap.get(char, char) for char in s)


# Replace the mis-encoded letters in every train number.
data['num'] = data['num'].apply(decode)

data.head(2)

Unnamed: 0,date,days,places,price,num,class
0,2021-04-06,1,0.623188,3161.0,001А,Šã¯¥
1,2021-04-07,1,0.695652,3275.0,001А,Šã¯¥


In [266]:
# Distinct class labels in sorted order. The order matters: the next cell
# pairs these labels with readable names by position.
classes = sorted(data['class'].unique())
classes

['B1', 'B2', 'C1', 'C2', 'Šã¯¥', '‘Ø¤ļēØ©', '‘‚']

In [267]:
# Readable class names, listed in the same order as the sorted raw labels
# in `classes` (they are paired by position below).
fixed_classes = ['B1', 'B2', 'C1', 'C2', 'Купе', 'Сидячий', 'СВ']
assert len(classes) == len(fixed_classes)

if set(classes) == set(fixed_classes):
    # `classes` was recomputed after the labels had already been fixed.
    # The readable names sort differently from the raw ones ('СВ' comes
    # before 'Сидячий'), so pairing by position would swap them; use an
    # identity mapping so that re-running this cell is harmless.
    class_map = {name: name for name in fixed_classes}
else:
    class_map = dict(zip(classes, fixed_classes))


# NOTE: this intentionally reuses the name `decode` and therefore shadows the
# earlier train-number decoder from this point on.
def decode(s: str) -> str:
    """Return the readable class name for ``s``; unknown labels pass through."""
    return class_map.get(s, s)


data['class'] = data['class'].apply(decode)

data.head(2)

Unnamed: 0,date,days,places,price,num,class
0,2021-04-06,1,0.623188,3161.0,001А,Купе
1,2021-04-07,1,0.695652,3275.0,001А,Купе


In [268]:
# Sanity check: list the class labels present after decoding.
data['class'].unique()

array(['Купе', 'СВ', 'Сидячий', 'B1', 'B2', 'C1', 'C2'], dtype=object)

In [269]:
# Rebuild the list of [num, class] pairs so it uses the decoded labels.
trains = data[['num', 'class']].drop_duplicates().values.tolist()

In [270]:
# Number of distinct (num, class) pairs present in `data`.
len(trains)

144

### Удаляем данные за праздники (29 апреля — 12 мая)

In [271]:
# Bounds of the holiday window; rows dated on either bound are removed too.
start = datetime(2021, 4, 29)
end = datetime(2021, 5, 12)

# True for rows dated strictly before `start` or strictly after `end`.
mask = data['date'].lt(start) | data['date'].gt(end)
data = data.loc[mask].copy()

### Ищем «уикенды»

In [272]:
# Numeric weekday and its English name for every row.
data['weekday'] = data['date'].dt.weekday
data['day_name'] = data['date'].dt.day_name()

# Lookup table: weekday number -> day name, ordered by weekday number.
weekdays = (
    data[['weekday', 'day_name']]
    .drop_duplicates()
    .sort_values(by='weekday')
    .set_index('weekday')['day_name']
    .to_dict()
)

weekdays

{0: 'Monday',
 1: 'Tuesday',
 2: 'Wednesday',
 3: 'Thursday',
 4: 'Friday',
 5: 'Saturday',
 6: 'Sunday'}

In [273]:
ncols = 6
# Ceiling division: enough rows to give every (num, class) pair its own
# panel; at least one row so plt.subplots is never asked for an empty grid.
nrows = max(1, -(-len(trains) // ncols))

fig, axes = plt.subplots(ncols=ncols, nrows=nrows)
fig.set_size_inches(3 * ncols, 3 * nrows)
fig.subplots_adjust(wspace=0.4, hspace=0.5)
axes = axes.flatten()

# One panel per (num, class) pair: mean `places` against `days`, with one
# line per weekday.
for (train_num, train_class), ax in zip(trains, axes):
    train_data = data[(data['num'] == train_num) & (data['class'] == train_class)]

    grouped = train_data.groupby(['weekday', 'days'])['places'].mean().reset_index()\
        .pivot(index='weekday', columns='days', values='places')

    for weekday, row in grouped.iterrows():
        ax.plot(row, c=f'C{weekday}', label=weekdays[weekday])

    ax.set_title(f'{train_num}, {train_class}')

# Put a legend to the right of every third row (starting with the second).
# Anchor it to the last panel of that row that actually holds a plot: in a
# partially filled last row the right-most panel is empty and has no handles.
for i in range(1, nrows, 3):
    legend_idx = min((i + 1) * ncols, len(trains)) - 1
    axes[legend_idx].legend(loc='upper left', bbox_to_anchor=(1, 1))

# Hide the unused panels at the end of the grid.
for i in range(len(trains), len(axes)):
    axes[i].axis('off')

# Make sure the output directory exists so that savefig does not fail.
fig_path = Path('../data/figures/places_vs_weekday.pdf')
fig_path.parent.mkdir(parents=True, exist_ok=True)

fig.savefig(fig_path, transparent=True, bbox_inches='tight')
# Close this specific figure to release its memory; it is only needed on disk.
plt.close(fig)

_Добавить сюда категоризацию «уикендов» для каждого поезда._

In [274]:
# Placeholder: every row gets weekend = 0 until the per-train weekend
# categorization mentioned in the note above is implemented.
data['weekend'] = 0

data.head(2)

Unnamed: 0,date,days,places,price,num,class,weekday,day_name,weekend
0,2021-04-06,1,0.623188,3161.0,001А,Купе,1,Tuesday,0
1,2021-04-07,1,0.695652,3275.0,001А,Купе,2,Wednesday,0


In [275]:
# Save the prepared long-format table; the row index is not written.
data.to_csv('../data/data.csv', index=False)