In [None]:
"""
Build a supervised learning model to predict airline departure delays.
"""

__author__ = 'dimple sharma'

# Importing modules required for implementation
import warnings
warnings.filterwarnings('ignore')


import sys
import random
import numpy as np

from sklearn import linear_model, cross_validation, metrics, svm
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.ensemble import RandomForestClassifier
#from sklearn.preprocessing import StandardScaler
from datetime import date

import pandas as pd
import matplotlib.pyplot as plt
#import csv
#%matplotlib inline  #Allows you to execute matplotlib graphs inline in iPython Notebook

# Absolute locations of the yearly airline CSV files
# (relative form: 'data/airline/2007.csv').
_AIRLINE_DATA_DIR = '/Users/chaitanyasharma/Documents/dimple/Python_Scripts/data/airline'
FILENAME_TRAIN = _AIRLINE_DATA_DIR + '/2007.csv'
FILENAME_TEST = _AIRLINE_DATA_DIR + '/2008.csv'

# Raw columns retained by preprocess(); the order is the column order of the
# frame it returns (before the derived columns are appended).
REQUIRED_COLS = [
    'DepDelay',
    'Year',
    'Month',
    'DayofMonth',
    'DayOfWeek',
    'CRSDepTime',
    'Distance',
    'UniqueCarrier',
    'Dest',
]

def read_from_csv(filename):
    """Load a comma-separated airline data file into a pandas DataFrame.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.

    Any error raised by ``pandas.read_csv`` (missing file, parse error, ...)
    propagates to the caller.
    """
    airline = pd.read_csv(filename, sep=',')
    # Single-argument print(...) calls produce the same text on Python 2 and
    # Python 3, unlike the print statement, which only parses on Python 2.
    print('Successfully read airline data.')
    print('Data type %s' % (type(airline),))
    return airline

def data_get_info(data):
    """Print a quick structural summary of a pandas DataFrame.

    Prints the shape, the ``info()`` report, the first rows and the column
    names. If ``data`` does not behave like a DataFrame, a hint is printed
    instead of raising.

    Args:
        data: The pandas DataFrame to describe.
    """
    try:
        print('Number of rows and columns: %s' % (data.shape,))
        # DataFrame.info() writes its report itself and returns None, so it
        # is called directly; printing its result emitted a stray "None".
        data.info()
        print(data.head())
        print(data.columns.values)
    except (AttributeError, TypeError):
        # A non-DataFrame argument fails with AttributeError on the first
        # attribute access, so that is handled together with TypeError.
        print('Please use data frame object from pandas.')

#class Util(object):
    #holidays = []
    #def __init__(self):
# Holiday dates for 2007 and 2008, one calendar year per group.
# NOTE(review): date(2007, 6, 7) and date(2008, 5, 22) do not obviously
# correspond to a US federal holiday - confirm they are intended.
holidays = [
    # 2007
    date(2007, 1, 1),
    date(2007, 1, 15),
    date(2007, 2, 19),
    date(2007, 5, 28),
    date(2007, 6, 7),
    date(2007, 7, 4),
    date(2007, 9, 3),
    date(2007, 10, 8),
    date(2007, 11, 11),
    date(2007, 11, 22),
    date(2007, 12, 25),
    # 2008
    date(2008, 1, 1),
    date(2008, 1, 21),
    date(2008, 2, 18),
    date(2008, 5, 22),
    date(2008, 5, 26),
    date(2008, 7, 4),
    date(2008, 9, 1),
    date(2008, 10, 13),
    date(2008, 11, 11),
    date(2008, 11, 27),
    date(2008, 12, 25),
]

def days_from_nearest_holiday(self, row, holiday_dates=None):
    """Return how many days separate a row's date from the closest holiday.

    Args:
        self: Unused. Kept so existing two-positional-argument calls keep
            working; pass ``None`` when calling the function directly.
        row: Object exposing ``Year``, ``Month`` and ``DayofMonth``
            attributes (for example one row of the flight DataFrame).
        holiday_dates: Optional iterable of ``datetime.date`` objects to
            measure against. Defaults to the module-level ``holidays`` list.

    Returns:
        The smallest absolute difference, in days, between the row's date
        and any of the holiday dates (0 when the row falls on a holiday).

    Raises:
        ValueError: If the set of holiday dates is empty, or the row's
            values do not form a valid calendar date.
    """
    if holiday_dates is None:
        holiday_dates = holidays
    # int() lets float-valued or numpy-typed date parts through; date()
    # itself rejects floats.
    d = date(int(row.Year), int(row.Month), int(row.DayofMonth))
    return min(abs((d - h).days) for h in holiday_dates)

def preprocess(flight_data):
    """Reduce raw flight records to the ORD rows and columns used for modelling.

    Steps:
    1. Filter out flights that were cancelled or did not originate from ORD.
    2. Keep only the columns in REQUIRED_COLS and drop rows with missing values.
    3. Add the derived columns ``hour`` (first two characters of the
       zero-padded ``CRSDepTime``, as a string) and ``days_from_holiday``,
       then lower-case every column name.

    Args:
        flight_data: pandas DataFrame holding ``Cancelled``, ``Origin`` and
            all REQUIRED_COLS columns.

    Returns:
        The resulting DataFrame. If a TypeError or ValueError occurs on the
        way, a message is printed and the frame as built up to that point
        (possibly empty) is returned.
    """
    flight_data_ord = pd.DataFrame()
    try:
        is_flown_from_ord = (flight_data.Cancelled == 0) & (flight_data.Origin == 'ORD')
        flight_data_ord = flight_data[is_flown_from_ord][REQUIRED_COLS].dropna()
        # Gets hour
        flight_data_ord['hour'] = flight_data_ord['CRSDepTime'].apply(
            lambda x: str(x).zfill(4)[:2])
        # Get number of days from nearest holiday. The module-level function
        # is called directly (its first parameter is unused); the Util class
        # referenced previously is not defined anywhere in this file.
        # Building a list also works when no rows are left after filtering.
        flight_data_ord['days_from_holiday'] = [
            days_from_nearest_holiday(None, row)
            for _, row in flight_data_ord.iterrows()
        ]
        flight_data_ord.columns = [column.lower() for column in flight_data_ord.columns]
    except (TypeError, ValueError):
        print('Invalid type! Takes data frame type object.')
    return flight_data_ord

def _plot_delay_rates(data):
    """Print delay totals for ORD departures and plot delay rate by month and hour."""
    df = data[data['Origin'] == 'ORD'].dropna(subset=['DepDelay'])
    # A flight is classified as delayed at 15 minutes or more - the same
    # threshold main() uses for the model labels.
    df['DepDelayed'] = df['DepDelay'] >= 15
    print('Total flights: %s' % df.shape[0])
    print('Total delays: %s' % df['DepDelayed'].sum())

    # Flight delays each month
    df[['DepDelayed', 'Month']].groupby(['Month']).mean().plot(kind='bar')

    # Average share of delayed flights by scheduled departure hour. The hour
    # is the leading two digits of CRSDepTime once padded to four digits,
    # i.e. the integer quotient by 100.
    df['Hour'] = df['CRSDepTime'].astype(int) // 100
    df[['DepDelayed', 'Hour']].groupby(['Hour']).mean().plot(kind='bar')

    # Display graphs
    plt.show()


def _features_and_labels(flights, feature_cols):
    """Return (float feature frame, boolean 'delayed >= 15' labels) for preprocessed flights."""
    labels = flights['depdelay'] >= 15
    # Cast to float so string-valued columns such as 'hour' become numeric.
    features = flights[feature_cols].astype(float)
    return features, labels


def _report_metrics(truth, predicted):
    """Print the confusion matrix and precision/recall/F1/accuracy for the predictions."""
    cm = confusion_matrix(truth, predicted)
    print("Confusion matrix")
    print(pd.DataFrame(cm))
    # NOTE(review): with average='micro' on a two-class problem precision,
    # recall and F1 all coincide with accuracy; average='binary' would
    # report scores for the delayed class - confirm which one is wanted.
    report_lr = precision_recall_fscore_support(list(truth), list(predicted), average='micro')
    print("\nprecision = %0.2f, recall = %0.2f, F1 = %0.2f, accuracy = %0.2f\n" %
          (report_lr[0], report_lr[1], report_lr[2],
           accuracy_score(list(truth), list(predicted))))


def main():
    """Train a logistic-regression delay classifier on FILENAME_TRAIN and score it on FILENAME_TEST.

    Reads the training file, prints a summary and shows two exploratory bar
    charts, preprocesses both files, fits the model on the training features
    and prints evaluation metrics for the test predictions.
    """
    data = read_from_csv(FILENAME_TRAIN)
    data_get_info(data)
    _plot_delay_rates(data)

    flight_data = preprocess(data)
    del data       # Remove unused dataframe

    # Create training set and test set
    ANALYSIS_COLS = ['month', 'dayofmonth', 'dayofweek', 'hour', 'distance', 'days_from_holiday']
    train_x, train_y = _features_and_labels(flight_data, ANALYSIS_COLS)

    data_test = read_from_csv(FILENAME_TEST)
    flight_test = preprocess(data_test)
    del data_test  # Remove unused dataframe
    test_x, test_y = _features_and_labels(flight_test, ANALYSIS_COLS)

    print(train_x.columns)
    print('Training set predictor dimensions: %s' % (train_x.shape,))
    print('Training set response dimension: %s' % (train_y.shape,))

    print(test_x.columns)
    print('Test set predictor dimensions: %s' % (test_x.shape,))
    print('Test set response dimension: %s' % (test_y.shape,))
    print(test_x.dtypes)

    # NOTE(review): class_weight='auto' was deprecated in later scikit-learn
    # releases in favour of 'balanced' - confirm against the installed version.
    clf_lr = linear_model.LogisticRegression(penalty='l2', class_weight='auto')
    clf_lr.fit(train_x, train_y)

    # Predict output labels on test set
    pr = clf_lr.predict(test_x)

    # display evaluation metrics
    _report_metrics(test_y, pr)


if __name__ == '__main__':
    main()

Successfully read airline data.
Data type <class 'pandas.core.frame.DataFrame'>
Number of rows and columns: (7453215, 29)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7453215 entries, 0 to 7453214
Data columns (total 29 columns):
Year                 int64
Month                int64
DayofMonth           int64
DayOfWeek            int64
DepTime              float64
CRSDepTime           int64
ArrTime              float64
CRSArrTime           int64
UniqueCarrier        object
FlightNum            int64
TailNum              object
ActualElapsedTime    float64
CRSElapsedTime       float64
AirTime              float64
ArrDelay             float64
DepDelay             float64
Origin               object
Dest                 object
Distance             int64
TaxiIn               int64
TaxiOut              int64
Cancelled            int64
CancellationCode     object
Diverted             int64
CarrierDelay         int64
WeatherDelay         int64
NASDelay             int64
SecurityDelay    