<a href="https://colab.research.google.com/github/Aribaldi/Mlinreg/blob/master/LinearRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Clone repository

In [None]:
# Fetch the project sources and switch into the checkout; later cells use
# relative paths such as utils/downloader.sh and data/.
# NOTE(review): re-running this cell in the same session will attempt the
# clone again from inside the checkout.
!git clone https://github.com/Aribaldi/Mlinreg
%cd Mlinreg

Download


In [None]:
# Month/year bounds handed to utils/downloader.sh in the download cell.
START_MONTH, START_YEAR = 1, 2020
END_MONTH, END_YEAR = 3, 2020

In [None]:
import os

In [None]:
if not os.path.exists('data/'):
    os.makedirs('data/')

!sh utils/downloader.sh {START_MONTH} {START_YEAR} {END_MONTH} {END_YEAR}

Preprocess 


In [None]:
# Mapping from the column names found in the downloaded CSVs to the names
# used throughout the rest of the notebook. Insertion order also defines
# the column order of the combined raw.csv file.
columns = dict([
    # date
    ('YEAR', 'Year'),
    ('MONTH', 'Month'),
    ('DAY_OF_MONTH', 'DayofMonth'),
    ('DAY_OF_WEEK', 'DayOfWeek'),
    # departure / arrival times
    ('DEP_TIME', 'DepTime'),
    ('CRS_DEP_TIME', 'CRSDepTime'),
    ('ARR_TIME', 'ArrTime'),
    ('CRS_ARR_TIME', 'CRSArrTime'),
    # flight identification
    ('OP_UNIQUE_CARRIER', 'UniqueCarrier'),
    ('OP_CARRIER_FL_NUM', 'FlightNum'),
    ('TAIL_NUM', 'TailNum'),
    # durations and delays
    ('ACTUAL_ELAPSED_TIME', 'ActualElapsedTime'),
    ('CRS_ELAPSED_TIME', 'CRSElapsedTime'),
    ('AIR_TIME', 'AirTime'),
    ('ARR_DELAY', 'ArrDelay'),
    ('DEP_DELAY', 'DepDelay'),
    # route
    ('ORIGIN', 'Origin'),
    ('DEST', 'Dest'),
    ('DISTANCE', 'Distance'),
    ('TAXI_IN', 'TaxiIn'),
    ('TAXI_OUT', 'TaxiOut'),
    # cancellation / diversion
    ('CANCELLED', 'Cancelled'),
    ('CANCELLATION_CODE', 'CancellationCode'),
    ('DIVERTED', 'Diverted'),
    # delay breakdown
    ('CARRIER_DELAY', 'CarrierDelay'),
    ('WEATHER_DELAY', 'WeatherDelay'),
    ('NAS_DELAY', 'NASDelay'),
    ('SECURITY_DELAY', 'SecurityDelay'),
    ('LATE_AIRCRAFT_DELAY', 'LateAircraftDelay'),
])

In [None]:
import pandas as pd
import glob

In [None]:
# Merge every downloaded CSV in data/ into a single data/raw.csv whose
# columns are renamed and ordered according to `columns`.
extension = 'csv'
data_dir = 'data'

# Files this notebook itself writes into data/. They must not be fed back
# into the merge, otherwise re-running the cell would mix already renamed
# or sampled rows into the combined file.
generated_files = {'raw.csv', 'train.csv', 'test.csv'}

# Build paths instead of os.chdir() so a failure cannot leave the kernel in
# the wrong working directory; sort so the row order of raw.csv (and hence
# the seeded sampling done later) does not depend on directory listing order.
all_filenames = sorted(
    os.path.basename(path)
    for path in glob.glob(os.path.join(data_dir, '*.{}'.format(extension)))
    if os.path.basename(path) not in generated_files
)
if not all_filenames:
    raise FileNotFoundError(
        'No downloaded .{} files found in {!r}; run the download cell first.'.format(extension, data_dir)
    )

combined_csv = pd.concat(
    [pd.read_csv(os.path.join(data_dir, f)) for f in all_filenames],
    ignore_index=True,
)
# Keep only the mapped columns, in the order defined by `columns`.
combined_csv = combined_csv.rename(columns=columns)[list(columns.values())]
combined_csv.to_csv(os.path.join(data_dir, 'raw.csv'), index=False, encoding='utf-8-sig')

In [None]:
# Load the combined file written by the merge cell above.
df = pd.read_csv('data/raw.csv')

In [None]:
# Display the loaded frame as the cell output.
df

EDA

In [None]:
#TODO

Select features and split data

In [None]:
# Number of rows drawn for the training and test splits.
TRAIN_NUM_SAMPLES, TEST_NUM_SAMPLES = 1000, 200
# Seed passed to DataFrame.sample when drawing the splits.
RANDOM_STATE = 0

In [None]:
# Reload data/raw.csv (the same file read in the Preprocess section) so this
# section starts from the saved combined data.
df = pd.read_csv('data/raw.csv')

In [None]:
# Column the model predicts.
target = 'DepDelay'
# Columns used as model inputs.
features = [
    'Month',
    'DayOfWeek',
    'Distance',
]

In [None]:
# Work only with the modelling columns and with rows that have no missing
# value in any of them.
model_columns = [*features, target]
complete_rows = df[model_columns].dropna()

# Draw the training sample, then draw the test sample from the rows that
# were not selected for training so the two sets do not overlap.
train = complete_rows.sample(TRAIN_NUM_SAMPLES, random_state=RANDOM_STATE)
remaining_rows = complete_rows[~complete_rows.index.isin(train.index)]
test = remaining_rows.sample(TEST_NUM_SAMPLES, random_state=RANDOM_STATE)

In [None]:
# Display the sampled test rows as the cell output.
test

In [None]:
# Persist both splits (without the index column) so the modelling cells can
# reload them from disk.
for split_frame, csv_path in ((train, 'data/train.csv'), (test, 'data/test.csv')):
    split_frame.to_csv(csv_path, index=False)

Initialize model

In [None]:
# Keyword arguments forwarded to SGDRegressor.
# The loss is intentionally left at the estimator default (squared error):
# the old spelling 'squared_loss' was deprecated in scikit-learn 1.0 and
# removed in 1.2, while the new spelling 'squared_error' does not exist in
# older releases. Relying on the default works with both.
MODEL_KWARGS = {
    'penalty' : 'l2',   # L2 regularisation
    'alpha' : 0.0001    # regularisation strength; also recorded in the CV results
}

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler

In [None]:
# Feature standardiser; fitted on the training features further below.
scaler = StandardScaler()
# SGD shuffles the data between epochs, so seed it with the notebook-wide
# RANDOM_STATE to make the metrics reproducible. MODEL_KWARGS is unpacked
# last so an explicit 'random_state' entry there would take precedence.
reg = SGDRegressor(**{'random_state': RANDOM_STATE, **MODEL_KWARGS})

Initialize metrics

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

Cross Validation

In [None]:
# Reload the saved training split.
train = pd.read_csv('data/train.csv')

# Target vector.
y_train = train[target].to_numpy()
# Feature matrix, standardised; this call also fits `scaler`, which is
# reused later to transform the test features.
x_train = scaler.fit_transform(train[features].to_numpy())

In [None]:
from sklearn.model_selection import KFold

In [None]:
# 5-fold cross-validation of `reg` on the training data.
# NOTE(review): x_train was standardised with statistics from the whole
# training set before this split, so the validation folds are not fully
# isolated from the scaler fit.
k_fold = KFold(5)

# Collect one record per fold and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row is quadratic; building from records also yields numeric dtypes so
# that describe() reports mean/std.
fold_records = []
for train_idx, val_idx in k_fold.split(x_train, y_train):
    reg.fit(x_train[train_idx], y_train[train_idx])
    y_pred = reg.predict(x_train[val_idx])
    r2 = r2_score(y_true=y_train[val_idx], y_pred=y_pred)
    mse = mean_squared_error(y_true=y_train[val_idx], y_pred=y_pred)
    fold_records.append({'alpha' : MODEL_KWARGS['alpha'], 'MSE' : mse, 'R2' : r2})

results = pd.DataFrame(fold_records, columns=['alpha', 'MSE', 'R2'])

In [None]:
# Display the per-fold alpha, MSE and R2 values.
results

In [None]:
# Summary statistics of the two metrics across the folds.
results[['MSE', 'R2']].describe()

Train

In [None]:
# Fit the regressor on the full standardised training set; this is the
# model evaluated on the test split below.
reg.fit(x_train, y_train)

Test

In [None]:
# Reload the saved test split.
test = pd.read_csv('data/test.csv')

# Target vector.
y_test = test[target].to_numpy()
# Feature matrix, transformed with the scaler already fitted on the
# training features (transform only, no re-fit).
x_test = scaler.transform(test[features].to_numpy())

In [None]:
# Predict on the test split and score the predictions.
y_pred = reg.predict(x_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

In [None]:
# Report the test-set metrics computed in the previous cell.
print(f'MSE: {mse},\nR2: {r2}.')