In [169]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error




#загрузка данных
airports = pd.read_csv('airports.csv')
flights = pd.read_csv('flights.csv')

#добавляю колонку с датой, выбираю период(данные за октябрь теряются, другой формат наименования аэропортов)
flights['DATE_DAY'] = pd.to_datetime(flights[['YEAR', 'MONTH', 'DAY']])
flights = flights.loc[flights['DATE_DAY'].between('2015-01-01', '2015-09-30')]


Columns (7,8) have mixed types. Specify dtype option on import or set low_memory=False.



In [170]:
#функция для генерации колонок
def make_features(data, max_lag, rolling_mean_size):
    data['year'] = data.DATE_DAY.dt.year
    data['month'] = data.DATE_DAY.dt.month
    data['day'] = data.DATE_DAY.dt.day
    data['dayofweek'] = data.DATE_DAY.dt.dayofweek
    for lag in range(1, max_lag + 1):
        data['lag_{}'.format(lag)] = data['ARRIVAL_DELAY'].shift(lag)

#функция для рассчета вероятности
def get_probability(y_test):
    y_test = y_test.reset_index()
    y_test['probability'] = np.where(y_test['ARRIVAL_DELAY']>0, 0, 1)
    return round(y_test['probability'].mean()*100, 2)

In [175]:

#выбор случайного наименования аэропорта
start_airport = np.random.choice(flights['ORIGIN_AIRPORT'].unique()) 

#формируем фрейм со случайно выбранным аэропортом вылета
df = flights.query("ORIGIN_AIRPORT == '{}'".format(start_airport))

#будующий фрейм данных, содержит наименование аэропорта прилета, метрику, вероятность, число перелетов в тестовом аборе данных
total = pd.DataFrame(columns=['name', 'RMSE', 'ARRIVAL_DELAY', 'probability', 'c_air'])

#перебор аэропортов прилета
for d_airport in df['DESTINATION_AIRPORT'].unique():
    d_df = df.query("DESTINATION_AIRPORT == '{}'".format(d_airport))
    
    
    #удаляю выбросы - 1,5 межквантильных размаха из колонки с задержкой прилета
    Q1 = d_df['ARRIVAL_DELAY'].quantile(0.25)
    Q3 = d_df['ARRIVAL_DELAY'].quantile(0.75)
    IQR = Q3 - Q1
    a = d_df[(d_df['ARRIVAL_DELAY'] < Q1-1.5*IQR ) | (d_df['ARRIVAL_DELAY'] > Q3+1.5*IQR)]['ARRIVAL_DELAY'].reset_index()
    
    #список с исключениями
    a = list(a.ARRIVAL_DELAY)
    #исключаю выбросы
    d_df = d_df.query("ARRIVAL_DELAY != @a")



    try:    

        #генерируем показатели
        d_df = d_df.groupby('DATE_DAY')['ARRIVAL_DELAY'].sum().reset_index()
        make_features(d_df,21,7)
        d_df.dropna(inplace=True)
        d_df.set_index('DATE_DAY', inplace=True)


        #формирую выборки - обучающую и тестовую
        X_train,X_test, y_train, y_test = train_test_split(d_df.drop('ARRIVAL_DELAY',axis=1),
                                                        d_df.ARRIVAL_DELAY,
                                                        shuffle = False,# временной ряд, отключаем перемешивание
                                                        test_size = 0.25)

        #обучаю и делаю предсказание
        lr = LinearRegression()
        lr.fit(X_train, y_train)
        pred = lr.predict(X_test)

        #формирую словарь, далее будем накапливать данные в df total
        t = {'name':'{}'.format(d_airport), 'RMSE':round(np.sqrt(mean_squared_error(y_test, pred)), 2), 
        'ARRIVAL_DELAY':round(y_test.mean(), 2), 'probability':get_probability(y_test), 'c_air':len(y_test)}
        total = total.append(t, ignore_index=True)

    except Exception as e:
        continue



#добавляю и аэропорт вылета(нужен для дальнейшего постоения графика, данные в словаре формируют большой желтый point на карте - аэропорт вылета)
t = {'name':'{}'.format(start_airport), 'RMSE':0, 'ARRIVAL_DELAY':0,'probability':total.probability.max()*2}
total = total.append(t, ignore_index=True)
#сортируем данные по метрике и вероятности положительного исхода прилета
total = total.sort_values(by=['RMSE','probability'],ascending=[True, True])
total = total.head(4)


map_air = airports.merge(total, how='right', left_on='IATA_CODE', right_on='name')

print(total.set_index('name')[1:])


fig = px.scatter_mapbox(total,
                        lon = map_air.LONGITUDE,
                        lat = map_air.LATITUDE,
                        zoom = 3,
                        color = map_air.probability,
                        size = map_air.probability,
                        width = 800,
                        height = 600,
)
fig.update_layout(mapbox_style='open-street-map')
fig.show()



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated a

       RMSE  ARRIVAL_DELAY  probability c_air
name                                         
GSO    8.83         -20.00       100.00     1
ABQ   13.45          -9.88        81.25    16
MDT   14.19          -0.77        61.54    13



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [172]:
total

Unnamed: 0,name,RMSE,ARRIVAL_DELAY,probability,c_air
6,HDN,0.0,0.0,200.0,
0,DEN,9.62,-4.38,75.0,8.0
3,IAH,10.79,-15.09,90.62,32.0
5,SEA,13.87,-15.0,100.0,4.0
