In [1]:
# display the records in the file data/flights.csv, first 10 rows
import pandas as pd
df = pd.read_csv('data/flights.csv')
df.head(5)


Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Carrier,OriginAirportID,OriginAirportName,OriginCity,OriginState,DestAirportID,DestAirportName,DestCity,DestState,CRSDepTime,DepDelay,DepDel15,CRSArrTime,ArrDelay,ArrDel15,Cancelled
0,2013,9,16,1,DL,15304,Tampa International,Tampa,FL,12478,John F. Kennedy International,New York,NY,1539,4,0.0,1824,13,0,0
1,2013,9,23,1,WN,14122,Pittsburgh International,Pittsburgh,PA,13232,Chicago Midway International,Chicago,IL,710,3,0.0,740,22,1,0
2,2013,9,7,6,AS,14747,Seattle/Tacoma International,Seattle,WA,11278,Ronald Reagan Washington National,Washington,DC,810,-3,0.0,1614,-7,0,0
3,2013,7,22,1,OO,13930,Chicago O'Hare International,Chicago,IL,11042,Cleveland-Hopkins International,Cleveland,OH,804,35,1.0,1027,33,1,0
4,2013,5,16,4,DL,13931,Norfolk International,Norfolk,VA,10397,Hartsfield-Jackson Atlanta International,Atlanta,GA,545,-1,0.0,728,-9,0,0


In [3]:
# clean the data by replacing empty or null values with 0
df.fillna(0, inplace=True)
df.head(5)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Carrier,OriginAirportID,OriginAirportName,OriginCity,OriginState,DestAirportID,DestAirportName,DestCity,DestState,CRSDepTime,DepDelay,DepDel15,CRSArrTime,ArrDelay,ArrDel15,Cancelled
0,2013,9,16,1,DL,15304,Tampa International,Tampa,FL,12478,John F. Kennedy International,New York,NY,1539,4,0.0,1824,13,0,0
1,2013,9,23,1,WN,14122,Pittsburgh International,Pittsburgh,PA,13232,Chicago Midway International,Chicago,IL,710,3,0.0,740,22,1,0
2,2013,9,7,6,AS,14747,Seattle/Tacoma International,Seattle,WA,11278,Ronald Reagan Washington National,Washington,DC,810,-3,0.0,1614,-7,0,0
3,2013,7,22,1,OO,13930,Chicago O'Hare International,Chicago,IL,11042,Cleveland-Hopkins International,Cleveland,OH,804,35,1.0,1027,33,1,0
4,2013,5,16,4,DL,13931,Norfolk International,Norfolk,VA,10397,Hartsfield-Jackson Atlanta International,Atlanta,GA,545,-1,0.0,728,-9,0,0


In [4]:
# convert categorical data to numerical data
df = pd.get_dummies(df)
df.head(5)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,OriginAirportID,DestAirportID,CRSDepTime,DepDelay,DepDel15,CRSArrTime,...,DestState_OR,DestState_PA,DestState_PR,DestState_RI,DestState_TN,DestState_TX,DestState_UT,DestState_VA,DestState_WA,DestState_WI
0,2013,9,16,1,15304,12478,1539,4,0.0,1824,...,False,False,False,False,False,False,False,False,False,False
1,2013,9,23,1,14122,13232,710,3,0.0,740,...,False,False,False,False,False,False,False,False,False,False
2,2013,9,7,6,14747,11278,810,-3,0.0,1614,...,False,False,False,False,False,False,False,False,False,False
3,2013,7,22,1,13930,11042,804,35,1.0,1027,...,False,False,False,False,False,False,False,False,False,False
4,2013,5,16,4,13931,10397,545,-1,0.0,728,...,False,False,False,False,False,False,False,False,False,False


In [5]:
# extract the columns that we use to make the prediction (ArrDel15 is the output column)
X = df.drop('ArrDel15', axis=1)
y = df['ArrDel15']



In [6]:
# split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# scale the X_train and X_test data
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)


In [None]:
# train the model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)


In [8]:
# make predictions
y_pred = model.predict(X_test)


In [9]:
# evaluate the model
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))


1.0


In [10]:
# print the confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))


[[42766     0]
 [    0 11622]]


In [12]:
# save the model to a file for use in an external application
import joblib
joblib.dump(model, 'model.pkl')
print('Model saved to model.pkl')


Model saved to model.pkl
