In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.scorer import make_scorer
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.utils import check_X_y
from sklearn.tree import export_graphviz

from sklearn.metrics import cohen_kappa_score, make_scorer, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from time import time

# Show up to 32 columns when a DataFrame is rendered in the notebook.
pd.set_option('display.max_columns', 32)

In [None]:
# Load the cleaned flights dataset from the working directory.
# low_memory=False turns off pandas' chunked parsing of the file.
df = pd.read_csv('./flights_clean.csv', low_memory=False)

# III. Methodology

In [None]:
# Preview the first rows of the loaded frame.
df.head()

### III.1 Data Preprocessing

See the Jupyter notebook titled "II - Analysis" for details on how the data was preprocessed.

### III.2 Implementation

To evaluate the trained models, the data must first be split into training and testing sets. A dummy classifier is then trained and tested on that split to provide a benchmark against which the results of the other models can be compared.

In [None]:
# Keep only the rows whose WEEK value is below 10 as the working sample.
is_before_week_10 = df['WEEK'] < 10
df_sample = df[is_before_week_10]

# Summary statistics of the sample, rendered as the cell output.
df_sample.describe()

##### Preparing the data for training and testing

In [None]:
# Features: every column of the sample except the target.
X = df_sample.drop(columns=["LABELS"])

# Target: select the label column by name rather than dropping a hand-written
# list of every feature column, so a column missing from such a list can never
# end up in the target. Kept two-dimensional (a one-column DataFrame).
y = df_sample[["LABELS"]]

In [None]:
# Use the label column as a one-dimensional Series. Selecting it by name
# (instead of re-slicing `y` in place) keeps this cell idempotent: it can be
# re-run without failing on a `y` that has already been flattened.
y = df_sample["LABELS"]

##### Training a Dummy Classifier

In [None]:
# One-hot encode the feature frame so the estimators receive numeric input
# (pandas expands the non-numeric columns into indicator columns).
X = pd.get_dummies(X)

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Report the size of each split. print(...) with a single argument behaves the
# same on Python 2 and Python 3, whereas the bare print statement is a
# SyntaxError on Python 3.
print("The training dataset contains {} datapoints, while the testing dataset contains {} datapoints.".format(X_train.shape[0], X_test.shape[0]))
print("The training dataset contains {} labels, while the testing dataset contains {} labels.".format(y_train.shape[0], y_test.shape[0]))

In [None]:
# Baseline model: always predicts the most frequent class seen during fit.
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train)

In [None]:
# Accuracy of the baseline on the held-out test split.
dummy.score(X_test, y_test)

In [None]:
# Baseline predictions on the test split, used for the F1 / kappa benchmark.
y_pred_dummy = dummy.predict(X_test)

In [None]:
# Benchmark metrics for the baseline; "NOT DELAYED" is treated as the positive
# class for F1. print(...) with a single argument works on Python 2 and 3.
print("F1 score for set: {:.4f}".format(f1_score(y_test, y_pred_dummy, pos_label="NOT DELAYED")))
print("Kappa score for set {:.4f}".format(cohen_kappa_score(y_test, y_pred_dummy)))

##### Selecting the best model

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data and reports the elapsed time.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)``; it is fitted in place.
    X_train : training features, passed unchanged to ``clf.fit``.
    y_train : training labels, passed unchanged to ``clf.fit``.

    Returns
    -------
    None. The wall-clock training time is printed to stdout.
    '''

    # Time only the fit call so the figure reflects training cost alone
    started_at = time()
    clf.fit(X_train, y_train)
    elapsed = time() - started_at

    # print(...) with a single argument behaves the same on Python 2 and 3
    print("Trained model in {:.4f} seconds".format(elapsed))

    
def predict_labels(clf, features, target, pos_label="NOT DELAYED"):
    ''' Predicts with a fitted classifier and prints timing, F1 and kappa.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict(X)``.
    features : samples to predict on, passed unchanged to ``clf.predict``.
    target : true labels for ``features`` (Series, array or list).
    pos_label : label treated as the positive class by the F1 score.
        Defaults to "NOT DELAYED", the value previously hard-coded here.

    Returns
    -------
    None. Prediction time, F1 score and Cohen's kappa are printed to stdout.
    '''

    # Time only the predict call
    started_at = time()
    y_pred = clf.predict(features)
    elapsed = time() - started_at

    # Accept any array-like target, not only pandas objects with `.values`
    y_true = np.asarray(target)

    # print(...) with a single argument behaves the same on Python 2 and 3
    print("Made predictions in {:.4f} seconds.".format(elapsed))
    print("F1 score for set: {:.4f}.".format(f1_score(y_true, y_pred, pos_label=pos_label)))
    print("Kappa score for set: {:.4f}".format(cohen_kappa_score(y_true, y_pred)))


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Trains a classifier, then reports its scores on both data splits.

    Parameters
    ----------
    clf : estimator to fit; forwarded to ``train_classifier`` and ``predict_labels``.
    X_train, y_train : training features and labels.
    X_test, y_test : testing features and labels.

    Returns
    -------
    None. All results are printed to stdout.
    '''

    # Indicate the classifier and the training set size
    # (print(...) with a single argument behaves the same on Python 2 and 3)
    print("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))

    # Train the classifier
    train_classifier(clf, X_train, y_train)

    # Score on the training data first, then on the held-out data, so the two
    # sets of numbers can be compared directly
    print("====== Training Set ======")
    predict_labels(clf, X_train, y_train)
    print("====== Testing Set ======")
    predict_labels(clf, X_test, y_test)

In [None]:
# Initialize the models
# The two candidate models, both created with random_state=3
clf_dt = DecisionTreeClassifier(random_state=3)
clf_rf = RandomForestClassifier(random_state=3)

# Leading slices of the training split, one features/labels pair per size
X_train_10k, y_train_10k = X_train[:10000], y_train[:10000]
X_train_20k, y_train_20k = X_train[:20000], y_train[:20000]
X_train_30k, y_train_30k = X_train[:30000], y_train[:30000]

training_subsets = [
    (X_train_10k, y_train_10k),
    (X_train_20k, y_train_20k),
    (X_train_30k, y_train_30k),
]

# Decision Tree first, then Random Forest; each model is refit on every
# subset in increasing size order and scored against the same test split
for candidate in (clf_dt, clf_rf):
    for X_subset, y_subset in training_subsets:
        train_predict(candidate, X_subset, y_subset, X_test, y_test)

**Classifier 1 - Decision Tree**  

| Training Set Size | Training Time | Prediction Time (test) | F1 Score (train) | F1 Score (test) | Kappa Score (train) | Kappa Score (test) |
| :---------------: | :---------------------: | :--------------------: | :--------------: | :-------------: | :--------------: | :-------------: |
| 10000               | 2.5820                  | 44.3490                 | 1.0000           | 0.6723      | 1.0000           | 0.1476      |
| 20000               | 7.8360                  | 43.9529                 | 1.0000           | 0.6804      | 1.0000           | 0.1660      |
| 30000               | 12.9322                 | 44.2589                 | 1.0000           | 0.6885      | 1.0000           | 0.1814      |

**Classifier 2 - Random Forest**  

| Training Set Size | Training Time | Prediction Time (test) | F1 Score (train) | F1 Score (test) | Kappa Score (train) | Kappa Score (test) |
| :---------------: | :---------------------: | :--------------------: | :--------------: | :-------------: | :--------------: | :-------------: |
| 10000               | 3.5879                  | 51.3358                | 0.9903           | 0.6988       | 0.9750           | 0.1887      |
| 20000               | 8.2012                  | 49.9041                | 0.9908           | 0.6998       | 0.9766           | 0.2013      |
| 30000               | 13.3635                 | 51.3536                | 0.9898           | 0.7007       | 0.9741           | 0.2114      |

In [None]:
# First 100,000 rows of the training split (features and matching labels)
X_train_100k, y_train_100k = X_train[:100000], y_train[:100000]

In [None]:
# Refit the Random Forest on the 100k-row subset and report train/test scores.
train_predict(clf_rf, X_train_100k, y_train_100k, X_test, y_test)

##### Tuning the model to improve results

In [None]:
# Wrap Cohen's kappa in a scorer object so it can be passed to the parameter search.
kappa_scorer = make_scorer(cohen_kappa_score)

In [None]:
# Candidate hyper-parameter values for the Random Forest search:
# tree depth limit and number of trees in the forest.
rf_parameters = {
    'max_depth' : [10, 20, 50, 100, 300],
    'n_estimators' : [10, 20, 50, 100, 300]
    }

In [None]:
# Perform grid search on the classifier
# Randomized search (not an exhaustive grid): sample 2 parameter combinations
# and rank them by Cohen's kappa. n_iter and scoring are passed by keyword
# because recent scikit-learn releases accept only the estimator and the
# parameter distributions positionally. random_state fixes which combinations
# are sampled so the search is repeatable.
rsearch_rf = RandomizedSearchCV(clf_rf, rf_parameters, n_iter=2, scoring=kappa_scorer, random_state=3)

# Fit the search object to the training data and find the best sampled parameters
rsearch_rf = rsearch_rf.fit(X_train_30k, y_train_30k)

# Keep the best estimator found by the search (this rebinds clf_rf)
clf_rf = rsearch_rf.best_estimator_

# Show the best parameters and the tuned model's scores
# (print(...) with a single argument behaves the same on Python 2 and 3)
print(rsearch_rf.best_params_)
print("====== Training Set ======")
predict_labels(clf_rf, X_train_30k, y_train_30k)
print("====== Testing Set ======")
predict_labels(clf_rf, X_test, y_test)

##### Exporting a visualization for the first tree in the Random Forest

In [None]:
# Write the first tree of the fitted forest to a Graphviz .dot file.
export_graphviz(clf_rf.estimators_[0], out_file='example_tree.dot')

In [None]:
from subprocess import call

In [None]:
# Render the .dot file to a 600-dpi PNG with the Graphviz `dot` command-line
# tool (must be installed and on PATH). `call` returns the exit status, which
# is shown as the cell output; a failure does not raise an exception.
call(['dot', '-Tpng', 'example_tree.dot', '-o', 'example_tree.png', '-Gdpi=600'])

##### Tuning the non-selected model to compare results

In [None]:
# Candidate tree depth limits for the Decision Tree search.
dt_parameters = {
    'max_depth' : [10, 20, 50, 100, 300]
}

In [None]:
# Perform grid search on the classifier
# Randomized search (not an exhaustive grid): sample 1 parameter setting and
# score it with Cohen's kappa. n_iter and scoring are passed by keyword because
# recent scikit-learn releases accept only the estimator and the parameter
# distributions positionally. random_state fixes which setting is sampled so
# the search is repeatable.
rsearch_dt = RandomizedSearchCV(clf_dt, dt_parameters, n_iter=1, scoring=kappa_scorer, random_state=3)

# Fit the search object to the training data and find the best sampled parameters
rsearch_dt = rsearch_dt.fit(X_train_30k, y_train_30k)

# Keep the best estimator found by the search (this rebinds clf_dt)
clf_dt = rsearch_dt.best_estimator_

# Show the best parameters and the tuned model's scores
# (print(...) with a single argument behaves the same on Python 2 and 3)
print(rsearch_dt.best_params_)
print("====== Training Set ======")
predict_labels(clf_dt, X_train_30k, y_train_30k)
print("====== Testing Set ======")
predict_labels(clf_dt, X_test, y_test)