In [23]:
# import libraries
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

In [24]:
data = pd.read_csv("merge_df_fl_num.csv", sep=',', index_col=0)
data.head(3)

Unnamed: 0,fl_date,mkt_carrier_fl_num,origin_airport_id,dest_airport_id,carrier,origin,destination,distance,crs_dep_time,crs_arr_time,...,day_of_week,day_of_month,month,arr_delay,binary_delay,avg_month_payload_carrier,avg_month_psngr_carrier,avg_domest_cost_month_carrier,avg_domest_gallons_month_carrier,avg_dep_scheduled_monthly_airport
0,2018-06-19,1673,14771,14747,Alaska Airlines,"San Francisco, CA","Seattle, WA",679,1830,2035,...,1,19,6,2,1,553850.0,3591.942804,80408619.0,43054527.0,16430
1,2019-06-21,1138,14771,14100,Alaska Airlines,"San Francisco, CA","Philadelphia, PA",2521,2205,645,...,4,21,6,-20,0,553850.0,3591.942804,80408619.0,43054527.0,16430
2,2018-06-03,1743,14771,14747,Alaska Airlines,"San Francisco, CA","Seattle, WA",679,800,1005,...,6,3,6,2,1,553850.0,3591.942804,80408619.0,43054527.0,16430


In [25]:
# save the columns in case we need them 
date, carrier, origin, destination = data[['fl_date', 'carrier', 'origin', 'destination']]


In [26]:
# data with categorical data dropped
data_dropped = data.copy()
data_dropped.drop(labels=['fl_date', 'mkt_carrier_fl_num', 'origin_airport_id', 'dest_airport_id', 'avg_domest_cost_month_carrier', 'avg_domest_gallons_month_carrier', 'taxi_out', 'origin', 'destination', 'binary_delay'], axis=1, inplace = True)

# get target column
y = data_dropped.arr_delay
data_dropped.drop('arr_delay', inplace=True, axis = 1)
data_dropped = pd.get_dummies(data_dropped)

data_dropped['crs_dep_time'] = pd.qcut(data_dropped['crs_dep_time'], 24, labels=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24])
data_dropped['crs_arr_time'] = pd.qcut(data_dropped['crs_arr_time'], 24, labels=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24])
data_dropped.columns
data_dropped.head(3)

Unnamed: 0,distance,crs_dep_time,crs_arr_time,day_of_week,day_of_month,month,avg_month_payload_carrier,avg_month_psngr_carrier,avg_dep_scheduled_monthly_airport,carrier_Alaska Airlines,...,carrier_Delta Airlines,carrier_ExpressJet,carrier_Frontier Airlines,carrier_JetBlue Airways,carrier_Republic Airways,carrier_SkyWest Airlines,carrier_Southwest Airlines,carrier_Spirit Airlines,carrier_United Airlines,carrier_ZW
0,679,20,20,1,19,6,553850.0,3591.942804,16430,1,...,0,0,0,0,0,0,0,0,0,0
1,2521,24,2,4,21,6,553850.0,3591.942804,16430,1,...,0,0,0,0,0,0,0,0,0,0
2,679,5,5,6,3,6,553850.0,3591.942804,16430,1,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# split data
X_train, X_test, y_train, y_test = train_test_split(data_dropped, y, test_size=0.3, random_state = 4)

In [28]:
# scale data
scaler = StandardScaler()
scaler.fit_transform(X_train)
scaler.fit(X_test)

In [29]:
clf = svm.LinearSVC(random_state=0, dual =False)
clf.fit(X_train, y_train)

In [30]:
y_pred = clf.predict(X_test)
print(y_pred)

[-31 -16 -24 ...  -9 -11 -10]


In [31]:
# use metrics to evaluate model
print(f'Mean Squared Error: {mean_squared_error(y_test, y_pred)}')
print(f'Mean Absolute Error: {mean_absolute_error(y_test, y_pred)}')
print(f'r2_score: {r2_score(y_test, y_pred)}')

Mean Squared Error: 477.0317506554034
Mean Absolute Error: 16.08855228662977
r2_score: -0.3942824483266423


In [32]:
# data with dummy carriers
data_dummies = data.copy()
data_dummies.drop(labels=['fl_date', 'origin', 'destination'], axis = 1, inplace = True)
data_dummies = pd.get_dummies(data_dummies)

In [33]:
y = data_dummies.arr_delay
data_dummies.drop('arr_delay', inplace=True, axis = 1)

In [34]:
# split data
X_train, X_test, y_train, y_test = train_test_split(data_dropped, y, test_size=0.3, random_state = 4)

In [35]:
# scale data
scaler = StandardScaler()
scaler.fit_transform(X_train)
scaler.fit(X_test)

In [36]:
clf = svm.LinearSVC(random_state=0, dual =False)
clf.fit(X_train, y_train)

In [37]:
y_pred = clf.predict(X_test)

In [38]:
# use metrics to evaluate model
print(f'Mean Squared Error: {mean_squared_error(y_test, y_pred)}')
print(f'Mean Absolute Error: {mean_absolute_error(y_test, y_pred)}')
print(f'r2_score: {r2_score(y_test, y_pred)}')

Mean Squared Error: 477.0317506554034
Mean Absolute Error: 16.08855228662977
r2_score: -0.3942824483266423
