In [16]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [17]:
df = pd.read_csv('flight_df_t.csv')
df.drop(columns=df.columns[0], axis=1, inplace=True)
df = df[df['arr_delay'] < 200]

In [28]:
df['crs_dep_time'] = pd.qcut(df['crs_dep_time'], 24, labels=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24])

In [30]:
df.columns

Index(['fl_date', 'mkt_carrier_fl_num', 'origin_airport_id', 'dest_airport_id',
       'carrier', 'origin', 'destination', 'distance', 'crs_dep_time',
       'crs_arr_time', 'day_of_week', 'day_of_month', 'month', 'arr_delay',
       'binary_delay', 'avg_month_payload_carrier', 'avg_month_psngr_carrier',
       'avg_domest_cost_month_carrier', 'avg_domest_gallons_month_carrier',
       'avg_dep_scheduled_monthly_airport', 'avg_monthly_psngr_airport'],
      dtype='object')

## Data Sets

Numerical

In [34]:
numerical_test_data = df[['distance', 'crs_dep_time', 'day_of_week', 'day_of_month', 'month', 'avg_month_psngr_carrier',
       'avg_dep_scheduled_monthly_airport', 'avg_monthly_psngr_airport']]

Categorical

In [36]:
carrier_dummies = pd.get_dummies(df['carrier'], prefix='carrier', drop_first=True)
destination_dummies = pd.get_dummies(df['destination'], prefix='dest', drop_first=True)
origin_dummies = pd.get_dummies(df['origin'], prefix='origin', drop_first=True)

In [41]:
df_with_carrier_dummies = pd.concat([df, carrier_dummies], axis=1)
df_with_dest_dummies = pd.concat([df, destination_dummies], axis=1)
df_with_origin_dummies = pd.concat([df, origin_dummies], axis=1)

Weather

In [51]:
chicago_weather = pd.read_csv('chicago_2019_weather.csv', index_col=0)

In [53]:
chicago_df = df[df['origin'] == 'Chicago, IL']
chicago_df = df[df['fl_date'] > '2018-12-31']

In [54]:
chicago_weather_df = chicago_df.merge(right=chicago_weather, how='inner')

In [74]:
chicago_weather_df.columns
chicago = chicago_weather_df[['distance', 'arr_delay', 'crs_dep_time', 'day_of_week', 'day_of_month', 'month', 'avg_month_psngr_carrier',
       'avg_dep_scheduled_monthly_airport', 'avg_monthly_psngr_airport', 'weather']]

In [75]:
chicago['weather'] = chicago.weather.replace({'sunny': 0,
                                              'cloudy': 1,
                                              'rainy': 2,
                                              'snow': 3
                                          })

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chicago['weather'] = chicago.weather.replace({'sunny': 0,


Chicago Weather with Categorical

In [134]:
chicago_with_carrier = chicago_weather_df[['carrier', 'distance', 'arr_delay', 'crs_dep_time', 'day_of_week', 'day_of_month', 'month', 'avg_month_psngr_carrier',
       'avg_dep_scheduled_monthly_airport', 'avg_monthly_psngr_airport', 'weather']]

In [151]:
chicago_with_carrier.head(1)
chicago_with_carrier['weather'] = chicago_with_carrier.weather.replace({'sunny': 0,
                                              'cloudy': 1,
                                              'rainy': 2,
                                              'snow': 3
                                          })

In [152]:
chicago_carrier_dummies = pd.get_dummies(chicago_with_carrier['carrier'], prefix='carrier', drop_first=True)
chicago_dummies = pd.concat([chicago_with_carrier, chicago_carrier_dummies], axis=1)

In [153]:
chicago_dummies.drop('carrier', axis=1, inplace=True)

In [154]:
carrier_y = chicago_dummies['arr_delay']
carrier_X = chicago_dummies.drop('arr_delay', axis=1)

Unnamed: 0,distance,arr_delay,crs_dep_time,day_of_week,day_of_month,month,avg_month_psngr_carrier,avg_dep_scheduled_monthly_airport,avg_monthly_psngr_airport,weather
0,416,24,8,0,18,3,1624.565831,35220,3171023,1


In [155]:
scaler2 = StandardScaler()
scaled_X_carrier = pd.DataFrame(scaler2.fit_transform(carrier_X))
scaled_X_carrier.columns = carrier_X.columns

In [157]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X_carrier, carrier_y, test_size=0.30)

# Models

In [184]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [185]:
y = chicago['arr_delay']
X = chicago.drop('arr_delay', axis=1)

In [186]:
scaler = StandardScaler()
scaled_X = pd.DataFrame(scaler.fit_transform(X))
scaled_X.columns = X.columns

In [187]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.30)

In [84]:
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)

In [85]:
y_pred = dtr.predict(X_test)

In [89]:
print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

1998.1469011725294
-1.0214893407681798


# GridSearch

In [188]:
from sklearn.model_selection import GridSearchCV
import datetime, warnings, scipy
warnings.filterwarnings("ignore")

In [189]:
tree_reg = DecisionTreeRegressor()

In [190]:
parameters={"splitter":["best","random"],
            "max_depth" : [10,20,30,40],
           "min_samples_leaf":[5,6,7,8,9,10],
           "max_features":["auto","log2","sqrt",None],
           "max_leaf_nodes":[5,6,7]}

In [191]:
grid = GridSearchCV(estimator=tree_reg,  param_grid=parameters, n_jobs=-1)

In [192]:
grid.fit(X_train, y_train)

In [193]:
y_pred = grid.predict(X_test)

In [194]:
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

1042.325006160966
20.39834998519663
0.02050726573105166


In [195]:
print(grid.best_estimator_)

DecisionTreeRegressor(max_depth=10, max_features='auto', max_leaf_nodes=7,
                      min_samples_leaf=9)


# Linear Regression

In [132]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet