In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw flights table and store 'Dew Point' as 64-bit integers.
# NOTE(review): the cast implies the column is not parsed as int64 by
# read_csv -- the reason is not visible here; confirm against the CSV.
flights_df = pd.read_csv('M1_final.csv').astype({'Dew Point': 'int64'})

In [2]:
# Lift the column cap so wide frames are rendered without elided columns.
# This is a global pandas option and stays in effect for later cells.
pd.options.display.max_columns = None

flights_df

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,DEST,DEP_DELAY,CRS_ELAPSED_TIME,DISTANCE,CRS_DEP_M,DEP_TIME_M,CRS_ARR_M,Temperature,Dew Point,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Condition,sch_dep,sch_arr,TAXI_OUT
0,11,1,5,B6,N828JB,CHS,-1,124,636,324,323,448,48,34,58,W,25,38,29.86,Fair / Windy,9,17,14
1,11,1,5,B6,N992JB,LAX,-7,371,2475,340,333,531,48,34,58,W,25,38,29.86,Fair / Windy,9,17,15
2,11,1,5,B6,N959JB,FLL,40,181,1069,301,341,482,48,34,58,W,25,38,29.86,Fair / Windy,9,17,22
3,11,1,5,B6,N999JQ,MCO,-2,168,944,345,343,513,48,34,58,W,25,38,29.86,Fair / Windy,9,17,12
4,11,1,5,DL,N880DN,ATL,-4,139,760,360,356,499,46,32,58,W,24,35,29.91,Fair / Windy,9,17,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28815,1,31,5,B6,N216JB,ORH,2,57,150,1370,1372,1427,39,38,96,N,6,0,30.18,Cloudy,20,32,19
28816,1,31,5,AA,N104NN,BOS,2,75,187,1390,1392,25,39,38,96,N,6,0,30.18,Cloudy,19,23,22
28817,1,31,5,AS,N581AS,SEA,283,392,2422,1125,1408,1337,39,38,96,N,6,0,30.18,Cloudy,19,23,21
28818,1,31,5,B6,N957JB,SJU,5,224,1598,1417,1422,261,39,38,96,N,6,0,30.18,Cloudy,19,23,13


In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

def find_year(row):
    """Return the calendar year to pair with a month number.

    Parameters
    ----------
    row : int
        Month number (the function is applied element-wise to the MONTH
        column).

    Returns
    -------
    int
        2020 when the month equals 1, otherwise 2019.
    """
    return 2020 if row == 1 else 2019

def remove_windy(row):
    """Strip a trailing ' / Windy' qualifier from a weather-condition label.

    Parameters
    ----------
    row : object
        A single cell value from the Condition column. Normally a string;
        missing values (NaN / None) are tolerated.

    Returns
    -------
    object
        The label without the ' / Windy' suffix when it is present;
        otherwise the input unchanged. Non-string inputs are returned as-is.
    """
    suffix = ' / Windy'
    # Missing cells arrive as float NaN / None and cannot be sliced; pass them
    # through untouched so a later fill step can deal with them.
    if isinstance(row, str) and row.endswith(suffix):
        # Slice by the suffix length rather than a hard-coded 8.
        return row[:-len(suffix)]
    return row

# --- Feature engineering on a working copy of the raw flights table --------
# Column names pd.to_datetime expects when assembling a date from parts.
datetime_dict = {'YEAR': 'year', 'MONTH': 'month', 'DAY_OF_MONTH': 'day'}
# DAY_OF_WEEK codes -> short weekday labels.
week_dict = {1: 'Mon', 2: 'Tue', 3: 'Wed', 4: 'Thu', 5: 'Fri', 6: 'Sat', 7: 'Sun'}

# Work on a copy so flights_df keeps the data exactly as loaded.
test_df = flights_df.copy()

# Build a single DATE column: the year is derived from MONTH via find_year,
# combined with MONTH / DAY_OF_MONTH, and the three part-columns are dropped.
year = test_df['MONTH'].apply(find_year)
test_df.insert(0, 'YEAR', year)
date = pd.to_datetime(test_df[['YEAR', 'MONTH', 'DAY_OF_MONTH']].rename(columns=datetime_dict))
test_df.insert(0, 'DATE', date)
test_df = test_df.drop(['YEAR', 'MONTH', 'DAY_OF_MONTH'], axis=1)

test_df['DAY_OF_WEEK'] = test_df['DAY_OF_WEEK'].replace(week_dict)
test_df['Condition'] = test_df['Condition'].apply(remove_windy)

# 'CALM' is a wind label (it is the baseline dropped from wind_encoded below),
# so only fill missing values in the Wind column. A frame-wide fillna would
# write the string 'CALM' into any other column that happens to contain NaN.
test_df['Wind'] = test_df['Wind'].fillna('CALM')

# One-hot encode the categorical columns. One level per variable is dropped
# as the reference category (Mon, B6, LAX, CALM, Fair).
week_encoded = pd.get_dummies(test_df['DAY_OF_WEEK'], dtype='int64').drop(['Mon'], axis=1)
carrier_encoded = pd.get_dummies(test_df['OP_UNIQUE_CARRIER'], dtype='int64').drop(['B6'], axis=1)
dest_encoded = pd.get_dummies(test_df['DEST'], dtype='int64').drop(['LAX'], axis=1)
wind_encoded = pd.get_dummies(test_df['Wind'], dtype='int64').drop(['CALM'], axis=1)
condition_encoded = pd.get_dummies(test_df['Condition'], dtype='int64').drop(['Fair'], axis=1)

# Append the dummy columns; the original categorical columns are kept here
# and removed later when the feature matrix is built.
test_df = pd.concat([test_df, week_encoded, carrier_encoded, dest_encoded, condition_encoded, wind_encoded], axis=1)
# --- Re-estimate suspicious Humidity readings ------------------------------
# Rows recorded with Humidity 0 additionally get their Dew Point overwritten
# with 46 before it is used as a predictor.
# NOTE(review): the origin of the value 46 is not shown in this notebook -- confirm.
test_df.loc[test_df['Humidity'] == 0, 'Dew Point'] = 46

# Humidity values of exactly 0 or 10 are treated as bad readings. The mask is
# computed once: Humidity itself is not modified until the final assignment.
humidity_is_suspect = test_df['Humidity'].isin([0, 10])
humidity_predictors = ['Temperature', 'Dew Point']

# Fit Humidity ~ degree-3 polynomial in (Temperature, Dew Point) on the
# rows whose humidity is not flagged.
regression_df = test_df[~humidity_is_suspect]
X = np.array(regression_df[humidity_predictors])
y = np.array(regression_df['Humidity'])

poly = PolynomialFeatures(degree=3)
poly_X = poly.fit_transform(X)

model = LinearRegression()
model.fit(poly_X, y)

# Predict for the flagged rows, reusing the already-fitted polynomial
# expansion, and round to the nearest whole number.
humidity_outliers = test_df[humidity_is_suspect]
X = np.array(humidity_outliers[humidity_predictors])
poly_X = poly.transform(X)
y = np.around(model.predict(poly_X))

# Write the estimates back in place of the flagged readings.
test_df.loc[humidity_is_suspect, 'Humidity'] = y

In [4]:
# --- Exploratory fit: Dew Point from temperature, pressure and wind --------
# Predictors: three numeric weather columns plus every wind-direction dummy.
dew_point_features = ['Temperature', 'Pressure', 'Wind Speed', *wind_encoded.columns]

# Exclude rows whose Humidity is 0.
regression_df = test_df.loc[test_df['Humidity'] != 0]

X = np.array(regression_df[dew_point_features])
y = np.array(regression_df['Dew Point'])

model = LinearRegression()
model.fit(X, y)

# In-sample predictions: the model is evaluated on the same rows it was
# fitted on, so X is reused rather than rebuilt.
y_pred = model.predict(X)

# sns.residplot(x=y, y=y_pred, scatter_kws={"s": 10})

In [5]:
# Pairwise correlation matrix of the raw table; numeric_only=True leaves the
# non-numeric columns (carrier, tail number, destination, ...) out.
flights_df.corr(numeric_only=True)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,DEP_DELAY,CRS_ELAPSED_TIME,DISTANCE,CRS_DEP_M,DEP_TIME_M,CRS_ARR_M,Temperature,Dew Point,Humidity,Wind Speed,Wind Gust,Pressure,sch_dep,sch_arr,TAXI_OUT
MONTH,1.0,-0.007254,0.029365,0.046455,-0.016052,-0.000144,0.001824,0.002632,-0.012361,0.135682,0.13504,0.012515,-0.035097,-0.051014,-0.085274,-0.006191,-0.019705,0.018505
DAY_OF_MONTH,-0.007254,1.0,-0.029732,-0.036618,0.002761,-0.002762,0.001754,0.004446,0.008519,-0.132457,-0.040129,0.096215,-0.093211,-0.088991,0.016871,-0.003417,0.002223,-0.02399
DAY_OF_WEEK,0.029365,-0.029732,1.0,-0.006203,0.000413,0.00245,0.005411,0.008256,-0.003339,0.013081,-0.073816,0.001627,0.095943,0.127689,0.102289,0.024415,0.009621,0.028388
DEP_DELAY,0.046455,-0.036618,-0.006203,1.0,-0.030871,-0.031507,0.102384,0.07234,0.043691,-0.032261,-1.3e-05,-0.011776,0.067718,0.04434,-0.064939,-0.065452,-0.006949,0.034881
CRS_ELAPSED_TIME,-0.016052,0.002761,0.000413,-0.030871,1.0,0.994465,-0.031332,-0.021391,0.035186,-0.01605,0.007246,0.017198,-0.01344,-0.011934,0.010165,0.083426,-0.067669,0.070838
DISTANCE,-0.000144,-0.002762,0.00245,-0.031507,0.994465,1.0,-0.040963,-0.034604,0.00852,-0.009923,0.009789,0.01511,-0.010734,-0.009522,0.005692,0.059174,-0.088793,0.059856
CRS_DEP_M,0.001824,0.001754,0.005411,0.102384,-0.031332,-0.040963,1.0,0.946023,0.452476,0.086371,0.001578,0.003843,0.008063,-0.009887,-0.008285,-0.025621,0.495286,0.044864
DEP_TIME_M,0.002632,0.004446,0.008256,0.07234,-0.021391,-0.034604,0.946023,1.0,0.469758,0.098105,0.008907,-0.005923,0.006033,-0.014336,-0.00941,0.025609,0.544197,0.057896
CRS_ARR_M,-0.012361,0.008519,-0.003339,0.043691,0.035186,0.00852,0.452476,0.469758,1.0,0.10162,-0.000442,-0.033163,0.01418,0.004486,-0.012682,0.149903,0.424209,0.071377
Temperature,0.135682,-0.132457,0.013081,-0.032261,-0.01605,-0.009923,0.086371,0.098105,0.10162,1.0,0.787864,0.054401,0.010603,-0.039487,-0.37673,-0.022534,0.075958,-0.067574


In [6]:
from sklearn.preprocessing import StandardScaler, PowerTransformer

# --- Build the design matrix and target for the TAXI_OUT model -------------
# Columns removed from the predictors: the date, the raw categoricals (their
# dummy encodings remain in test_df), a few numeric columns, and the target.
non_feature_columns = [
    'DATE',
    'DAY_OF_WEEK',
    'OP_UNIQUE_CARRIER',
    'TAIL_NUM',
    'DEST',
    'DISTANCE',
    'CRS_DEP_M',
    'CRS_ARR_M',
    'Wind',
    'Condition',
    'TAXI_OUT',
]
feature_df = test_df.drop(columns=non_feature_columns)

target_df = test_df['TAXI_OUT']

X_transformer = PowerTransformer()
y_transformer = StandardScaler()

X = np.array(feature_df)
# StandardScaler expects a 2-D input, hence the single-column reshape.
y = np.array(target_df).reshape(-1, 1)

# Only the first seven columns (by position) are power-transformed; every
# remaining column is appended unchanged.
# NOTE(review): going by the column order in the cell-2 output, the first
# seven would be DEP_DELAY .. Wind Speed, which leaves Wind Gust, Pressure,
# sch_dep and sch_arr untransformed -- confirm the cut-off of 7 is intended.
power_transformed_part = X_transformer.fit_transform(feature_df.iloc[:, :7])
untransformed_part = feature_df.iloc[:, 7:]
X_transformed = np.hstack((power_transformed_part, untransformed_part))
y_transformed = y_transformer.fit_transform(y)

In [7]:
# Baseline: ordinary least squares on the transformed features.
# The score below is computed on the same rows used for fitting, so it is an
# in-sample figure, not an estimate of out-of-sample performance.
linear_regressor = LinearRegression().fit(X_transformed, y_transformed)
linear_regressor.score(X_transformed, y_transformed)

0.1520660884184244