In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_score
from sklearn.metrics import precision_recall_curve

In [2]:
typy = {'MONTH': np.int8, 'DAY_OF_MONTH': np.int8, 'DAY_OF_WEEK': np.int8, 'FL_DATE': 'category', 'OP_UNIQUE_CARRIER': 'category',
        'TAIL_NUM': 'category', 'OP_CARRIER_FL_NUM': 'category',  'ORIGIN': 'category', 'DEST': 'category', 'CRS_DEP_TIME':np.int16, 'DEP_DELAY':np.float16,
        'CRS_ARR_TIME': np.int16, 'ARR_DELAY': np.float16, 'DISTANCE': np.int16}
kolumnyczas = ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK','FL_DATE','OP_UNIQUE_CARRIER', 'TAIL_NUM', 'OP_CARRIER_FL_NUM',
               'ORIGIN','DEST','CRS_DEP_TIME', 'DEP_DELAY','CRS_ARR_TIME', 'ARR_DELAY', 'DISTANCE']
days_to_rename = {1: 'Poniedzialek',
                  2: 'Wtorek',
                  3: 'Sroda',
                  4: 'Czwartek',
                  5: 'Piatek',
                  6: 'Sobota',
                  7: 'Niedziela'}
months_to_rename = {1: 'Styczen',
                    2: 'Luty',
                    3: 'Marzec',
                    4: 'Kwiecien',
                    5: 'Maj',
                    6: 'Czerwiec',
                    7: 'Lipiec',
                    8: 'Sierpien',
                    9: 'Wrzesien',
                    10: 'Pazdziernik',
                    11: 'Listopad',
                    12: 'Grudzien'}

In [3]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [5]:

df = pd.read_csv('new_base.csv', usecols=kolumnyczas, dtype=typy)
print('Memory usage: {:.1f}'.format(df.memory_usage().sum() / 1024**2))

Memory usage: 128.3


In [6]:
df.head(2)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_DELAY,CRS_ARR_TIME,ARR_DELAY,DISTANCE
0,1,1,7,2017-01-01,AA,N787AA,1,JFK,LAX,800,31.0,1142,27.0,2475
1,1,2,1,2017-01-02,AA,N788AA,1,JFK,LAX,800,-3.0,1142,12.0,2475


In [7]:
df_sample = df.sample(n=10000)

In [9]:
df_sample.to_csv('sample_data.csv')

In [7]:
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'], format="%Y/%m/%d")
df['DAY_OF_WEEK'] = df['DAY_OF_WEEK'].map(days_to_rename)
df['MONTH'] = df['MONTH'].map(months_to_rename)
df['DAY_OF_WEEK'] = df['DAY_OF_WEEK'].astype('category') 
df['MONTH'] = df['MONTH'].astype('category') 
df['DELAY_CAT'] = df['DEP_DELAY'].apply(lambda x: 'brak opoznienia' if x < 10 else '< 60' if x >= 11 and x < 60  else '<60, 120>' if x >= 60 and x < 120  else '> 120')
df['DELAY_CAT'] =  df['DELAY_CAT'].astype('category')
df['DELAYED'] = (df['DEP_DELAY'] > 10).astype(np.int8)
df['CRS_DEP_TIME_CAT'] = pd.cut(x=df['CRS_DEP_TIME'], bins=24, labels=np.arange(24)).astype(np.int8)
df['CRS_ARR_TIME_CAT'] = pd.cut(x=df['CRS_ARR_TIME'], bins=24, labels=np.arange(24)).astype(np.int8)

In [8]:
def count_flights(dep_or_arr, origin_or_dest, new_name):
    group = df.groupby(['FL_DATE', dep_or_arr, origin_or_dest]).size().to_frame().reset_index()
    group.rename(columns={0: new_name}, inplace=True)
    return group

In [9]:
## powinno byc CRS_DEP_TIME, nie CAT //
def previous_arr():
    df_sorted = df.sort_values(by=['FL_DATE', 'CRS_DEP_TIME_CAT'])
    pause = df_sorted.groupby(['TAIL_NUM', 'FL_DATE'])['CRS_ARR_TIME_CAT'].shift(1).to_frame()
    pause.rename(columns={'CRS_ARR_TIME_CAT': 'PREVIOUS_ARR'}, inplace=True)
    return pause

In [10]:
def count_delayed():
    df_sorted = df.sort_values(by=['FL_DATE', 'CRS_DEP_TIME'])
    rolling = df_sorted.groupby(['OP_CARRIER_FL_NUM', 'CRS_DEP_TIME'])['DELAYED'].rolling(3).sum().to_frame()
    rolling.index = rolling.index.droplevel([0,1])
    rolling.rename(columns={'DELAYED': 'ROLLING_DELAYED'}, inplace=True)
    return rolling

In [11]:
def delay_time_cat():
    df_sorted = df.sort_values(by=['FL_DATE', 'CRS_DEP_TIME_CAT'])
    gr = df_sorted.groupby(['FL_DATE','ORIGIN', 'CRS_DEP_TIME_CAT'])['DELAYED'].rolling(1).sum().to_frame().reset_index()
    return gr

In [12]:
num_of_dep = count_flights('CRS_DEP_TIME_CAT', 'ORIGIN', 'NUM_OF_DEP')
df = df.merge(num_of_dep, left_on=['FL_DATE', 'CRS_DEP_TIME_CAT', 'ORIGIN'], right_on=['FL_DATE', 'CRS_DEP_TIME_CAT', 'ORIGIN'])
df['NUM_OF_DEP'] = df['NUM_OF_DEP'].astype(np.int8)

In [13]:
num_of_arr = count_flights('CRS_ARR_TIME_CAT', 'DEST', 'NUM_OF_ARR')
df = df.merge(num_of_arr, left_on=['FL_DATE', 'CRS_ARR_TIME_CAT', 'DEST'], right_on=['FL_DATE', 'CRS_ARR_TIME_CAT', 'DEST'])
df['NUM_OF_ARR'] = df['NUM_OF_ARR'].astype(np.int8)

In [14]:
pause_time = previous_arr()
df = df.merge(pause_time, left_index=True, right_index=True, how='left')

In [15]:
df['PAUSE_TIME'] = df['CRS_DEP_TIME_CAT'] - df['PREVIOUS_ARR']
df['PAUSE_CAT'] = df['PAUSE_TIME'].fillna(50)
df['PAUSE_CAT'] = df['PAUSE_TIME'].apply(lambda x: '-1' if x < 0 else '1' if x >= 0 and x < 2 else '2' if x >= 2 and x < 3 else '3' if x >= 3 and x < 4 else '4-6' if x >= 4 and x < 6 else '6+' if x >= 6 and x < 25 else  'dzień')
df['PAUSE_CAT'] = df['PAUSE_CAT'].astype('category')
df['CRS_DEP_TIME_CAT'] = df['CRS_DEP_TIME_CAT'].astype('category')
df['CRS_ARR_TIME_CAT'] = df['CRS_ARR_TIME_CAT'].astype('category')
#df.drop('PREVIOUS_ARR', inplace=True, axis=1)
#df.drop('PAUSE_TIME', inplace=True, axis=1)

In [16]:
roll = count_delayed()
df = df.merge(roll, left_index=True, right_index=True)
df['ROLLING_DELAYED'].fillna('-1', inplace=True)
df['ROLLING_DELAYED'] = df['ROLLING_DELAYED'].astype('category') 

In [17]:
df.to_csv('data_vis.csv')

In [17]:
columns_df = df.columns
df = df[df['PAUSE_CAT'] != '-1'].values
df = pd.DataFrame(df, columns=columns_df)
df = df[df['ROLLING_DELAYED'] != '-1'].values
df = pd.DataFrame(df, columns=columns_df)
df.shape

(4990099, 24)

In [18]:
df['MONTH'] = df['MONTH'].astype('category')
df['DAY_OF_WEEK'] = df['DAY_OF_WEEK'].astype('category')
df['OP_UNIQUE_CARRIER'] = df['OP_UNIQUE_CARRIER'].astype('category')
df['TAIL_NUM'] = df['TAIL_NUM'].astype('category')
df['OP_CARRIER_FL_NUM'] = df['OP_CARRIER_FL_NUM'].astype('category')
df['ORIGIN'] = df['ORIGIN'].astype('category')
df['DEST'] = df['DEST'].astype('category')
df['DELAY_CAT'] = df['DELAY_CAT'].astype('category')
df['DELAYED'] = df['DELAYED'].astype(np.int8)
df['CRS_DEP_TIME_CAT'] = df['CRS_DEP_TIME_CAT'].astype('category')
df['CRS_ARR_TIME_CAT'] = df['CRS_ARR_TIME_CAT'].astype('category')
df['PAUSE_CAT'] = df['PAUSE_CAT'].astype('category')
df['ROLLING_DELAYED'] = df['ROLLING_DELAYED'].astype('category')
df['DAY_OF_MONTH'] = df['DAY_OF_MONTH'].astype(np.int8)
df['CRS_DEP_TIME'] = df['CRS_DEP_TIME'].astype(np.int16)
df['DEP_DELAY'] = df['DEP_DELAY'].astype(np.float16)
df['CRS_ARR_TIME'] = df['CRS_ARR_TIME'].astype(np.int16)
df['ARR_DELAY'] = df['ARR_DELAY'].astype(np.float16)
df['DISTANCE'] = df['DISTANCE'].astype(np.int16)
df['NUM_OF_DEP'] = df['NUM_OF_DEP'].astype(np.int8)
df['NUM_OF_ARR'] = df['NUM_OF_ARR'].astype(np.int8)
df['PREVIOUS_ARR'] = df['PREVIOUS_ARR'].astype(np.float64)
df['PAUSE_TIME'] =df['PAUSE_TIME'].astype(np.float64)
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'], format="%Y/%m/%d")

In [20]:
df.to_csv('data_forecast.csv')

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4990099 entries, 0 to 4990098
Data columns (total 24 columns):
MONTH                category
DAY_OF_MONTH         int8
DAY_OF_WEEK          category
FL_DATE              datetime64[ns]
OP_UNIQUE_CARRIER    category
TAIL_NUM             category
OP_CARRIER_FL_NUM    category
ORIGIN               category
DEST                 category
CRS_DEP_TIME         int16
DEP_DELAY            float16
CRS_ARR_TIME         int16
ARR_DELAY            float16
DISTANCE             int16
DELAY_CAT            category
DELAYED              int8
CRS_DEP_TIME_CAT     category
CRS_ARR_TIME_CAT     category
NUM_OF_DEP           int8
NUM_OF_ARR           int8
PREVIOUS_ARR         float64
PAUSE_TIME           float64
PAUSE_CAT            category
ROLLING_DELAYED      category
dtypes: category(12), datetime64[ns](1), float16(2), float64(2), int16(3), int8(4)
memory usage: 257.6 MB
