In [1]:
import os
import warnings
import sys
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from datetime import datetime

warnings.filterwarnings("ignore")

In [2]:
# Load the flight schedule file and reduce it to one row per
# (flight date, departure timestamp, flight number).
dates = ['FLTDAT', 'DAIS']
dtypes = {'FLTNUM': 'int16', 'TARIF_RUB': 'float16'}
bb = (
    pd.read_csv('F:\\SVO-AER_2018-01-01--2019-12-31BB.csv',
                sep=';', dtype=dtypes, parse_dates=dates)
    # rows with a missing value in ANY column are discarded before the
    # column selection below, so gaps in unrelated columns also drop rows
    .dropna(how='any', axis=0)
    .drop_duplicates()
)
# departure timestamp = flight date + time of day stored in FLTTIME
bb = bb.assign(DEPARTURE=bb['FLTDAT'] + pd.to_timedelta(bb['FLTTIME']))
bb = bb[['FLTDAT', 'DEPARTURE', 'FLTNUM']].drop_duplicates()
bb.head()

Unnamed: 0,FLTDAT,DEPARTURE,FLTNUM
0,2018-01-21,2018-01-21 13:35:00,1124
1,2018-01-10,2018-01-10 09:10:00,1122
2,2018-01-11,2018-01-11 09:10:00,1122
3,2018-01-26,2018-01-26 17:35:00,1140
4,2018-01-06,2018-01-06 09:10:00,1122


In [3]:
%%time
# Load the booking-history file; every counter column is stored as int16.
dates = ['SDAT_S', 'DD']
dtypes = dict.fromkeys(
    ['FLT_NUM', 'FCLCLD', 'PASS_BK', 'SA', 'AU', 'PASS_DEP', 'NS', 'DTD'],
    'int16',
)
hh = pd.read_csv('F:\\hh_for_profiles.csv', dtype=dtypes, parse_dates=dates)

Wall time: 45.8 s


In [40]:
def get_flight(fltnum: int, date: datetime, flttime: pd.Timedelta) -> (str, pd.DataFrame):
    """Fetch the data of a single flight for demand restoration.

    Reads the module-level frames ``bb`` (schedule) and ``hh`` (booking
    history). The schedule row matching the flight number and the departure
    timestamp ``date + flttime`` is joined to the history rows that share the
    flight number and flight date.

    Parameters
    ----------
    fltnum : flight number, compared against ``bb['FLTNUM']``.
    date : flight date (midnight), compared after adding ``flttime``.
    flttime : time of day of the departure.

    Returns
    -------
    (label, frame) where ``label`` is ``"<fltnum>, <departure>"`` and
    ``frame`` is the merged history without NaN rows, without the join keys
    ``FLTDAT``/``FLT_NUM``/``DD`` and without duplicate rows.

    Side effects: prints the flight identification and the row count.
    """
    departure = date + flttime
    same_number = bb['FLTNUM'] == fltnum
    same_departure = bb['DEPARTURE'] == departure
    schedule_rows = bb[same_number & same_departure]
    print("Рейс:", fltnum, departure)
    history = (
        schedule_rows
        .merge(hh, left_on=['FLTNUM', 'FLTDAT'], right_on=['FLT_NUM', 'DD'])
        .dropna(how='any', axis=0)
        # the join keys are redundant once the flight is fixed
        .drop(columns=['FLTDAT', 'FLT_NUM', 'DD'])
        .drop_duplicates()
    )
    print(f"Данных: {history.shape[0]:_}")
    label = f"{fltnum}, {departure}"
    return label, history

def get_regression(data: pd.DataFrame, code: str, min_open_share: float = .2) -> tuple:
    """Fit a linear regression of bookings on days-to-departure for one class.

    Only rows with ``FCLCLD == 0`` (the uncensored rows, per the original
    comments) take part in the least-squares fit ``PASS_BK ~ B1 * DTD + B0``.

    Parameters
    ----------
    data : flight history with at least the columns ``SEG_CLASS_CODE``,
        ``DTD``, ``FCLCLD`` and ``PASS_BK``.
    code : booking class to select (value of ``SEG_CLASS_CODE``).
    min_open_share : minimum share of rows with ``FCLCLD == 0`` required to
        attempt the fit. Defaults to 0.2, the value previously hard-coded.

    Returns
    -------
    tuple ``(B1, B0, Ts)``: slope, intercept and the largest ``DTD`` of the
    class. ``(0, 0, Ts)`` is returned when the share of uncensored rows is
    below ``min_open_share`` or when at most one uncensored row remains
    (no line can be fitted).

    Raises
    ------
    ValueError
        If ``data`` contains no rows for ``code``.
    """
    # keep a single booking class
    class_rows = data[data['SEG_CLASS_CODE'] == code]
    if class_rows.empty:
        # previously surfaced as an opaque "max() arg is an empty sequence"
        raise ValueError(f"no rows for booking class {code!r}")
    # largest days-to-departure value of the class
    Ts = class_rows['DTD'].max()
    # uncensored rows only
    open_rows = class_rows[class_rows['FCLCLD'] == 0]
    # too small a share of uncensored rows: no regression
    if len(open_rows) / len(class_rows) < min_open_share:
        return 0, 0, Ts
    # inputs for least squares
    z = open_rows['PASS_BK'].values
    t = open_rows['DTD'].values.reshape(-1, 1)
    # with 0 or 1 points a regression line does not exist
    if z.size <= 1:
        return 0, 0, Ts
    reg = LinearRegression().fit(t, z)
    B1, B0 = reg.coef_[0], reg.intercept_
    return B1, B0, Ts

def get_Ts_streak(B1: np.float64, B0: np.float64, Ts: np.int16):
    """Return T' (T-prime): the zero crossing of the regression line, or Ts.

    The zero crossing ``-B0 / B1`` is used only when the slope is positive
    and the crossing lies inside ``[0, Ts]``; in every other case the
    function falls back to ``Ts`` (the maximum DTD).

    NOTE(review): the fit regresses bookings on days-to-departure, where a
    negative slope would look like the usual case, yet a non-positive slope
    always yields ``Ts`` here - confirm the intended sign convention.
    """
    if not B1 <= 0:
        crossing = -B0 / B1
        outside = crossing < 0 or crossing > Ts
        if not outside:
            return crossing
    return Ts  # max(DTD)

def get_restored_demand(Ts_streak: float, class_code: str,
                        flight_data: pd.DataFrame = None):
    """Compute the restored (unconstrained) demand of one booking class.

    Parameters
    ----------
    Ts_streak : T' as returned by ``get_Ts_streak`` (float or integer scalar).
    class_code : booking class to process (value of ``SEG_CLASS_CODE``).
    flight_data : flight history with the columns ``SEG_CLASS_CODE``,
        ``DTD``, ``PASS_BK``, ``PASS_DEP`` and ``FCLCLD``. When omitted, the
        module-level ``flight`` frame is used, as before.

    Returns
    -------
    DataFrame with the columns ``DTD``, ``PASS_BK``, ``PASS_DEP``,
    ``FCLCLD`` and ``restored``:

    * If the class has no row with ``FCLCLD == 0``, or the row closest to
      departure still has ``FCLCLD == 0``, only the rows with ``DTD`` in
      ``{-1, 0}`` are returned and ``restored`` equals ``PASS_BK``.
    * Otherwise the rows after the last ``FCLCLD == 0`` row (in descending
      ``DTD`` order) are returned with
      ``restored = Zc * (T' - DTD)^2 / (T' - Tc)^2``, where ``Tc`` is the
      largest ``DTD`` of those rows and ``Zc`` the bookings at ``Tc``.
      The column axis is named after ``class_code``.
    """
    source = flight if flight_data is None else flight_data
    wanted = ['DTD', 'PASS_BK', 'PASS_DEP', 'FCLCLD']
    flight_class = (
        source.loc[source['SEG_CLASS_CODE'] == class_code, wanted]
        .sort_values(by=['DTD'], ascending=False)
        .reset_index(drop=True)
    )

    def _as_booked() -> pd.DataFrame:
        # nothing to restore: report the bookings of days -1 and 0 unchanged
        near_departure = flight_class[flight_class['DTD'].isin([-1, 0])]
        return near_departure.assign(restored=near_departure['PASS_BK'])

    # positions (descending DTD order) of the rows with FCLCLD == 0
    open_positions = np.where(flight_class['FCLCLD'] == 0)[0]
    if len(open_positions) == 0:
        return _as_booked()
    # everything after the last FCLCLD == 0 row is the final closed stretch
    restored_demand = flight_class.iloc[open_positions[-1] + 1:].reset_index(drop=True)
    if len(restored_demand) == 0:
        return _as_booked()

    # days before departure at which the final closure starts
    tc_raw = restored_demand['DTD'].max()
    # bookings of the class at that moment; a scalar is required - if several
    # rows share Tc the first one is used
    Zc = float(restored_demand.loc[restored_demand['DTD'] == tc_raw, 'PASS_BK'].iloc[0])
    Tc = float(tc_raw)
    ts = float(Ts_streak)
    # The columns are int16, so squaring day differences in that dtype wraps
    # around once the result exceeds 32767; do the arithmetic in float64.
    dtd = restored_demand['DTD'].astype('float64')
    # NOTE(review): ts == Tc gives a zero denominator (inf/NaN results), as in
    # the original formula - confirm the desired handling of that case.
    restored_demand['restored'] = Zc * (ts - dtd) ** 2 / (ts - Tc) ** 2
    return restored_demand.rename_axis(class_code, axis="columns")

In [7]:
%%time
# Flight 1122 departing on 2018-01-06 at 09:10.
flight_date = datetime(2018, 1, 6)
flight_time = pd.Timedelta(hours=9, minutes=10)
flight_name, flight = get_flight(1122, flight_date, flight_time)

Рейс: 1122 2018-01-06 09:10:00
Данных: 4_158
Wall time: 1.97 s


In [39]:
# For every booking class of the selected flight: fit the regression,
# derive T', restore the demand and collect the figures into one table.
classes, pass_dep, pass_bk = [], [], []
for class_code in flight['SEG_CLASS_CODE'].unique():
    B1, B0, Ts = get_regression(flight, class_code)
    Ts_streak = get_Ts_streak(B1, B0, Ts)
    flight_class = get_restored_demand(Ts_streak, class_code)
    classes.append(class_code)
    # PASS_DEP is read from the DTD == -1 row
    is_day_minus_one = flight_class['DTD'] == -1
    pass_dep.append(flight_class.loc[is_day_minus_one, 'PASS_DEP'].values[0])
    # restored bookings are read from the DTD == 0 row
    is_day_zero = flight_class['DTD'] == 0
    pass_bk.append(flight_class.loc[is_day_zero, 'restored'].values[0])
restored_flight = pd.DataFrame(
    {'pass_dep': pass_dep, 'pass_bk_restored': pass_bk},
    index=classes,
).rename_axis(flight_name, axis="columns")
restored_flight

"1122, 2018-01-06 09:10:00",pass_dep,pass_bk_restored
J,0,0.0
C,0,0.0
D,3,3.0
I,0,0.0
Z,0,0.0
O,6,6.055944
Y,0,0.0
B,0,0.0
M,0,0.0
U,0,0.0
