In [297]:
import re
import warnings
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.dates import DateFormatter
from tqdm.notebook import tqdm

# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and
# later releases dropped the old names. Prefer the new name when this installation
# provides it; otherwise fall back to the legacy one (which raises the usual
# matplotlib error if it is missing too).
_whitegrid_styles = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
plt.style.use(next(
    (style for style in _whitegrid_styles if style in plt.style.available),
    'seaborn-whitegrid',
))

## Базовая реструктуризация данных

In [329]:
# Directory holding the per-train CSV snapshot; resolved to an absolute path
# relative to the notebook's current working directory.
data_dir = Path('../data/2021_06_01/')
data_dir = data_dir.resolve()
data_dir

PosixPath('/home/yuralytkin/Development/rzd-fares-analysis/data/2021_06_01')

In [383]:
def load_train_data(
        train_num: str,
        train_class: str,
        base_dir: Optional[Path] = None,
) -> Optional[pd.DataFrame]:
    """Load one train/class pair as a long-format table of places and prices.

    Reads ``{train_num}-{train_class}-place.csv`` and
    ``{train_num}-{train_class}-price.csv`` and combines them into one frame
    with a row per ``(date, days)`` pair.

    Args:
        train_num: Train number as it appears in the file name.
        train_class: Carriage class as it appears in the file name.
        base_dir: Directory containing the CSV files. Defaults to the
            notebook-level ``data_dir`` so existing two-argument calls keep
            working.

    Returns:
        A frame with columns ``date``, ``days``, ``places``, ``places_frac``,
        ``price``, ``num`` and ``class``; or ``None`` when either file is
        missing or the largest ``places`` value in the file is 0.
    """
    root = data_dir if base_dir is None else Path(base_dir)
    places_fpath = root / f'{train_num}-{train_class}-place.csv'
    prices_fpath = root / f'{train_num}-{train_class}-price.csv'

    if not (places_fpath.exists() and prices_fpath.exists()):
        return None

    places = _read_wide_train_csv(places_fpath)
    prices = _read_wide_train_csv(prices_fpath)

    # Largest value over all dates and all `days` columns; used to normalise.
    max_places = places.max().max()

    if max_places == 0:
        return None

    places = places.reset_index().melt(id_vars='date', var_name='days', value_name='places')
    places['places_frac'] = places['places'] / max_places

    # A price of 0 is treated as "no price" rather than as a real fare.
    prices = prices.replace(0, np.nan)
    prices = prices.reset_index().melt(id_vars='date', var_name='days', value_name='price')

    train_data = places.merge(prices, how='outer', on=['date', 'days'])

    # Keep a row when at least one of places/price is known.
    train_data = train_data.dropna(subset=['places', 'price'], how='all')

    train_data['num'] = train_num
    train_data['class'] = train_class

    return train_data


def _read_wide_train_csv(fpath: Path) -> pd.DataFrame:
    """Read one `;`-separated cp1251 CSV and return it as a date x days table."""
    table = pd.read_csv(fpath, encoding='cp1251', sep=';', index_col=0).T
    table.index = pd.to_datetime(table.index)
    table.index.name = 'date'
    table.columns = table.columns.map(int)
    table.columns.name = 'days'
    return table

In [384]:
# File names look like '<train_num>-<train_class>-price.csv' / '...-place.csv'.
# The dot is escaped so that only a literal '.csv' suffix matches.
fname_re = re.compile(r'([^-]+)-([^-]+)-(price|place)\.csv')

# Collect every (train_num, train_class) pair that has at least one file.
trains = set()

for fpath in data_dir.iterdir():
    match = fname_re.fullmatch(fpath.name)

    if match is None:
        continue

    train_num, train_class, _ = match.groups()
    trains.add((train_num, train_class))

trains = sorted(trains)

# Gather the per-train frames and concatenate once: DataFrame.append re-copied the
# whole accumulated frame on every iteration and no longer exists in pandas >= 2.0.
train_frames = []

for train_num, train_class in tqdm(trains):
    train_data = load_train_data(train_num, train_class)

    if train_data is not None:
        train_frames.append(train_data)

# pd.concat raises on an empty list, so keep the old "empty frame" result in that case.
data = pd.concat(train_frames, ignore_index=True) if train_frames else pd.DataFrame()

data.head(2)

  0%|          | 0/380 [00:00<?, ?it/s]

Unnamed: 0,date,days,places,places_frac,price,num,class
0,2021-04-03,1,62.0,0.442857,3275.0,001Ð,ÐÑÐ¿Ðµ
1,2021-04-04,1,33.0,0.235714,3913.0,001Ð,ÐÑÐ¿Ðµ


In [385]:
# Distinct (num, class) pairs that actually made it into `data`, sorted by num then class.
trains = (
    data[['num', 'class']]
    .drop_duplicates()
    .sort_values(by=['num', 'class'])
    .values
    .tolist()
)
len(trains)

144

### Исправление некорректных кодировок (если нужно)

## Препроцессинг

### Удаляем данные за праздники (29 апреля — 12 мая)

In [386]:
# Holiday window to drop; both boundary dates are themselves excluded from the data.
start = datetime(2021, 4, 29)
end = datetime(2021, 5, 12)

# Keep only rows strictly before the window start or strictly after its end.
mask = data['date'].lt(start) | data['date'].gt(end)
data = data.loc[mask].copy()

data.head(2)

Unnamed: 0,date,days,places,places_frac,price,num,class
0,2021-04-03,1,62.0,0.442857,3275.0,001Ð,ÐÑÐ¿Ðµ
1,2021-04-04,1,33.0,0.235714,3913.0,001Ð,ÐÑÐ¿Ðµ


### Заполненность строк для разных поездов

In [387]:
# For every (train, class): pivot its rows into a date x days table and take, per date,
# the share of `days` cells that have a non-missing `places` value.
filled_frac = pd.DataFrame({
    (train_num, train_class): (
        data.loc[(data['num'] == train_num) & (data['class'] == train_class)]
        .pivot(index='date', columns='days', values='places')
        .notna()
        .mean(axis=1)
    )
    for train_num, train_class in trains
})

# Re-index onto a continuous daily axis; any date a train has no rows for is shown as 0.
one_day = timedelta(days=1)
idx = np.arange(filled_frac.index.min(), filled_frac.index.max() + one_day, one_day)
filled_frac = filled_frac.reindex(idx).fillna(0)

heatmap_fig, ax = plt.subplots(figsize=(20, 8))
sns.heatmap(
    filled_frac,
    ax=ax,
    vmin=0,
    vmax=1,
    cmap='coolwarm',
    xticklabels=True,
    yticklabels=True,
    cbar_kws={'aspect': 40},
)

# Trim the y tick labels to their first 10 characters.
yticklabels = ax.get_yticklabels()

for text in yticklabels:
    text.set_text(text.get_text()[:10])

ax.set_yticklabels(yticklabels)
ax.set_xlabel('train')
ax.tick_params(axis='both', which='major', labelsize=6)

ax.set_title('Fraction of filled values for different trains')

plt.savefig('../data/figures/filled_frac.pdf', transparent=True, bbox_inches='tight')
plt.close()

  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=ft2font.LOAD_NO_HINTING)
  font.set_text(s, 0.0, flags=ft2font.LOAD_NO_HINTING)
  font.set_text(s, 0.0, flags=ft2font.LOAD_NO_HINTING)
  font.set_text(s, 0.0, flags=ft2font.LOAD_NO_HINTING)
  font.set_text(s, 0.0, flags=ft2font.LOAD_NO_HINTING)
  font.set_text(s, 0.0, flags=ft2font.LOAD_NO_HINTING)
  font.set_text(s, 0.0, flags=ft2font.LOAD_NO_HINTING)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)


### Удаляем поезда, которые ездят не каждый день, либо имеют слишком много пропусков

In [388]:
train_nums_to_remove = ['761А', '762А', '763А', '764А', '765А', '766А']

# The filter below is an exact string match, so a number that does not occur in
# data['num'] verbatim is silently ignored. Surface that instead of dropping nothing.
# NOTE(review): the saved outputs of this notebook show the same train count before
# and after this cell, so these literals may not match how the numbers are stored
# (e.g. a differently encoded suffix letter) — confirm against data['num'].unique().
unmatched_train_nums = sorted(set(train_nums_to_remove) - set(data['num'].unique()))

if unmatched_train_nums:
    warnings.warn(
        f'Train numbers not present in data, nothing removed for: {unmatched_train_nums}'
    )

data = data[~data['num'].isin(train_nums_to_remove)].copy()

In [390]:
# Refresh the list of (num, class) pairs after the removal step above it in the notebook.
trains = (
    data[['num', 'class']]
    .drop_duplicates()
    .sort_values(by=['num', 'class'])
    .values
    .tolist()
)
len(trains)

144

### Ищем «уикенды»

In [391]:
# Derive the numeric weekday and its English name from the date column.
data['weekday'] = data['date'].dt.weekday
data['day_name'] = data['date'].dt.day_name()

# Lookup {weekday number -> day name}, built from the pairs present in the data.
weekdays = (
    data[['weekday', 'day_name']]
    .drop_duplicates()
    .sort_values(by='weekday')
    .set_index('weekday')['day_name']
    .to_dict()
)

weekdays

{0: 'Monday',
 1: 'Tuesday',
 2: 'Wednesday',
 3: 'Thursday',
 4: 'Friday',
 5: 'Saturday',
 6: 'Sunday'}

In [392]:
# Grid with one small panel per (train, class): 6 columns, as many rows as needed.
ncols = 6
nrows = -(-len(trains) // ncols)  # ceiling division

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(3 * ncols, 3 * nrows))
fig.subplots_adjust(wspace=0.4, hspace=0.5)
axes = axes.ravel()

for ax, (train_num, train_class) in zip(axes, trains):
    train_data = data[(data['num'] == train_num) & (data['class'] == train_class)]

    # Mean `places` per weekday and `days` value, reshaped to weekday x days.
    grouped = (
        train_data
        .groupby(['weekday', 'days'])['places']
        .mean()
        .reset_index()
        .pivot(index='weekday', columns='days', values='places')
    )

    # One line per weekday, coloured by the weekday number.
    for weekday, row in grouped.iterrows():
        ax.plot(row, c=f'C{weekday}', label=weekdays[weekday])

    ax.set_title(f'{train_num}, {train_class}')
    ax.invert_xaxis()

# Attach a legend to the right-most panel of rows 1, 4, 7, ...
for i in range(1, nrows, 3):
    axes[(i + 1) * ncols - 1].legend(loc='upper left', bbox_to_anchor=(1, 1))

# Hide panels that have no train assigned.
for unused_ax in axes[len(trains):]:
    unused_ax.axis('off')

plt.savefig('../data/figures/places_vs_weekday.pdf', transparent=True, bbox_inches='tight')
plt.close()

  font.set_text(s, 0.0, flags=ft2font.LOAD_NO_HINTING)
  font.set_text(s, 0.0, flags=ft2font.LOAD_NO_HINTING)
  font.set_text(s, 0.0, flags=ft2font.LOAD_NO_HINTING)
  font.set_text(s, 0.0, flags=ft2font.LOAD_NO_HINTING)
  font.set_text(s, 0.0, flags=ft2font.LOAD_NO_HINTING)
  font.set_text(s, 0.0, flags=ft2font.LOAD_NO_HINTING)
  font.set_text(s, 0.0, flags=ft2font.LOAD_NO_HINTING)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)


_**TODO:** добавить сюда категоризацию «уикендов» для каждого поезда; пока столбец `weekend` ниже заполняется нулями как заглушка._

In [393]:
# Placeholder column: every row is marked 0 until a real weekend categorisation exists.
data = data.assign(weekend=0)

data.head(2)

Unnamed: 0,date,days,places,places_frac,price,num,class,weekday,day_name,weekend
0,2021-04-03,1,62.0,0.442857,3275.0,001Ð,ÐÑÐ¿Ðµ,5,Saturday,0
1,2021-04-04,1,33.0,0.235714,3913.0,001Ð,ÐÑÐ¿Ðµ,6,Sunday,0


In [394]:
# Persist the preprocessed table as UTF-8 CSV without the row index.
data.to_csv(
    '../data/data.csv',
    encoding='utf-8',
    index=False,
)

In [399]:
# Inspect a single row by its index label.
data.loc[144644, :]

date           2021-05-16 00:00:00
days                             4
places                        32.0
places_frac                    1.0
price                      19185.0
num                          774Ð
class                           B1
weekday                          6
day_name                    Sunday
weekend                          0
Name: 144644, dtype: object

In [404]:
# Largest observed `places` value per (num, class), listed class-first; first 40 rows.
(
    data
    .groupby(['num', 'class'])['places']
    .max()
    .reset_index()
    .sort_values(by=['class', 'num'])
    .head(40)
)

Unnamed: 0,num,class,places
32,751Ð,B1,17.0
36,752Ð,B1,17.0
40,753Ð,B1,17.0
44,754Ð,B1,34.0
48,755Ð,B1,34.0
52,756Ð,B1,17.0
56,757Ð,B1,30.0
60,759Ð,B1,17.0
64,760Ð,B1,17.0
68,761Ð,B1,17.0


In [398]:
# Rows whose train number starts with '774' and whose class starts with 'B1',
# ordered from the most to the fewest places.
is_774_b1 = data['num'].str.startswith('774') & data['class'].str.startswith('B1')
data.loc[is_774_b1, ['places', 'places_frac']].sort_values(by='places', ascending=False)

Unnamed: 0,places,places_frac
144644,32.0,1.0000
145037,30.0,0.9375
144871,30.0,0.9375
144982,30.0,0.9375
144816,30.0,0.9375
...,...,...
144501,0.0,0.0000
144500,0.0,0.0000
144443,0.0,0.0000
144463,0.0,0.0000
