In [114]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [48]:
data = pd.read_csv('../datasets/flights.csv')

## Преобразование признаков

In [50]:
numeric = ['Day', 'Day Of Week', 'Origin Airport Delay Rate',
           'Destination Airport Delay Rate', 'Scheduled Time', 'Distance',
           'Scheduled Departure Hour', 'Scheduled Departure Minute']

features = data.drop(['Arrival Delay'], axis=1)
target = data['Arrival Delay']

features[numeric] = StandardScaler().fit_transform(features[numeric])
features = pd.get_dummies(features, drop_first=True)
features_train, features_valid, target_train, target_valid = train_test_split(features, target, random_state=12345)
features_train.shape, features_valid.shape

((58431, 22), (19478, 22))

## Линейная регрессия
---

### Обучение модели

In [75]:
model = LinearRegression()
model.fit(features_train, target_train)
predicted_valid = model.predict(features_valid)
mse = mean_squared_error(target_valid, predicted_valid)
mse, mse**0.5

(2129.8240528555293, 46.1500168240005)

### Проверка модели на адекватность

#### R2

In [91]:
r2_score(target_valid, predicted_valid)

0.09710497146204988

#### MAE

In [58]:
print("Linear Regression")
print(mean_absolute_error(target_valid, predicted_valid))
print("Median")
print(mean_absolute_error(target_valid, [target_valid.median() for _ in range(len(target_valid))]))

Linear Regression
27.436250978085834
Median
27.22281548413595


## Случайный лес
---

### Поиск наилучшей модели

#### R2

In [133]:
best_r2 = 0
best_depth = 0
for depth in range(1, 16):
    model = RandomForestRegressor(n_estimators=10, max_depth=depth, random_state=12345)
    model.fit(features_train, target_train)
    r2 = model.score(features_valid, target_valid)
    if r2 > best_r2:
        best_r2 = r2
        best_depth = depth
best_depth, best_r2

KeyboardInterrupt: 

In [None]:
%%time

best_r2 = 0
best_n = 0
for n in range(50, 200, 10):
    model = RandomForestRegressor(n_estimators=n, max_depth=11, random_state=12345)
    model.fit(features_train, target_train)
    r2 = model.score(features_valid, target_valid)
    if r2 > best_r2:
        best_r2 = r2
        best_n = n
best_n, best_r2

In [137]:
model = RandomForestRegressor(n_estimators=150, max_depth=11, random_state=12345)
model.fit(features_train, target_train) 
model.score(features_train, target_train), model.score(features_valid, target_valid)

(0.37975279322458466, 0.16512230031108122)

#### MAE

In [None]:
best_mae = 100
best_depth = 100
for depth in range(1, 16):
    model = RandomForestRegressor(n_estimators=10, max_depth=depth, random_state=12345)
    model.fit(features_train, target_train)
    mae = mean_absolute_error(target_valid, model.predict(features_valid))
    if mae < best_mae:
        best_mae = mae
        best_depth = depth
best_depth, best_mae

In [None]:
%%time

best_mae = 100
best_n = 100
for n in range(50, 200, 10):
    model = RandomForestRegressor(n_estimators=n, max_depth=11, random_state=12345)
    model.fit(features_train, target_train)
    mae = mean_absolute_error(target_valid, model.predict(features_valid))
    if mae < best_mae:
        best_mae = mae
        best_n = n
best_n, best_mae

In [None]:
model = RandomForestRegressor(n_estimators=80, max_depth=11, random_state=12345)
model.fit(features_train, target_train) 
mean_absolute_error(target_train, model.predict(features_train)), mean_absolute_error(target_valid, model.predict(features_valid))

### cross_val

In [95]:
model = RandomForestRegressor(n_estimators=20, max_depth=11, random_state=12345)

In [97]:
scores = cross_val_score(model, features_train, target_train, scoring='r2')
scores.mean()

0.1516308303711201

In [103]:
predictes = cross_val_predict(model, features_train, target_train)
predictes

array([-0.90671435,  1.77251997, -1.45439924, ..., 20.35549953,
        0.74818487, -2.55475086])

In [105]:
r2_score(target_train, predictes)

0.14989645039777244

### GridSearchCV

In [139]:
%%time

params = {
    'n_estimators': [150],
    'max_depth': [11]
}
clf = GridSearchCV(model, params)
clf.fit(features_train, target_train)
clf.cv_results_

CPU times: total: 1min 8s
Wall time: 1min 8s


{'mean_fit_time': array([10.76186585]),
 'std_fit_time': array([0.19691745]),
 'mean_score_time': array([0.12728381]),
 'std_score_time': array([0.00444961]),
 'param_max_depth': masked_array(data=[11],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[150],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 11, 'n_estimators': 150}],
 'split0_test_score': array([0.18040455]),
 'split1_test_score': array([0.13468957]),
 'split2_test_score': array([0.17506374]),
 'split3_test_score': array([0.14368533]),
 'split4_test_score': array([0.18535532]),
 'mean_test_score': array([0.1638397]),
 'std_test_score': array([0.02058749]),
 'rank_test_score': array([1])}

In [140]:
clf.best_params_

{'max_depth': 11, 'n_estimators': 150}

In [141]:
clf.best_score_

0.1638397007070916