### ML

Используя наработки первой части (1) Analisys Flight Delay DS перейдем к машинному обучению

#### Подготовка данных для обучения

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

data = pd.read_csv('flight_delays_train.csv')
print(data.shape)
data

(100000, 9)


Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y
...,...,...,...,...,...,...,...,...,...
99995,c-5,c-4,c-3,1618,OO,SFO,RDD,199,N
99996,c-1,c-18,c-3,804,CO,EWR,DAB,884,N
99997,c-1,c-24,c-2,1901,NW,DTW,IAH,1076,N
99998,c-4,c-27,c-4,1515,MQ,DFW,GGG,140,N


In [2]:
data['HourDepTime']=data['DepTime'].map(lambda x: str(x)[:-2])
data['HourDepTime'].replace(['25','24',''],[0,0,0],inplace=True)
data['HourDepTime']=data['HourDepTime'].astype(float)
data

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,HourDepTime
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N,19.0
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N,15.0
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N,14.0
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N,10.0
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y,18.0
...,...,...,...,...,...,...,...,...,...,...
99995,c-5,c-4,c-3,1618,OO,SFO,RDD,199,N,16.0
99996,c-1,c-18,c-3,804,CO,EWR,DAB,884,N,8.0
99997,c-1,c-24,c-2,1901,NW,DTW,IAH,1076,N,19.0
99998,c-4,c-27,c-4,1515,MQ,DFW,GGG,140,N,15.0


Для обучения будем использовать только 1 признак HourDepTime, который показал линейную зависимость с целью.  
Остальные признаки, как показано в блокноте "(3) Дополнение, проверка других признаков и алгоритмов" существенного прироста AUC-score не привносят.

In [3]:
y = data['dep_delayed_15min']
X = data.drop(['Month','DayofMonth','UniqueCarrier','Distance','DayOfWeek','dep_delayed_15min','DepTime', 'Origin', 'Dest'], axis=1)
X

Unnamed: 0,HourDepTime
0,19.0
1,15.0
2,14.0
3,10.0
4,18.0
...,...
99995,16.0
99996,8.0
99997,19.0
99998,15.0


#### Обучение с помощью RandomForestClassifier

In [4]:
import sklearn
from sklearn.model_selection import train_test_split

# Разделяем на тренировочную выборку и тестовую
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 11)

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score

forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=11)
forest_params = {'criterion':['gini','entropy'],'max_depth': range(1,10), 'max_features': range(1,2)}
forest_grid = GridSearchCV(forest, forest_params, cv=5, n_jobs=-1, verbose=True)
forest_grid.fit(X_train, y_train)
print('Best cross-validation parameters:',forest_grid.best_params_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best cross-validation parameters: {'criterion': 'gini', 'max_depth': 3, 'max_features': 1}


In [6]:
forest = RandomForestClassifier(criterion='gini', max_depth=3, max_features = 1, n_estimators=100, n_jobs=-1, random_state=11)
forest.fit(X_train, y_train)

y_test_predict_forest = forest.predict(X_test)


In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print('FOREST')
print()

# Print Confusion matrix:
print('Confusion matrix:')
cmatrix = confusion_matrix(y_test, y_test_predict_forest)
print(cmatrix)
print()

# Print classification_report:
print('Classification_report:')
print(classification_report(y_test, y_test_predict_forest))

FOREST

Confusion matrix:
[[24238    50]
 [ 5644    68]]

Classification_report:
              precision    recall  f1-score   support

           N       0.81      1.00      0.89     24288
           Y       0.58      0.01      0.02      5712

    accuracy                           0.81     30000
   macro avg       0.69      0.50      0.46     30000
weighted avg       0.77      0.81      0.73     30000



In [8]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, forest.predict_proba(X_test)[:,1])

0.6916026127183417

#### Запись данных для реальной тестовой выборке

In [47]:
data_test = pd.read_csv('flight_delays_test.csv')
print(data_test.shape)
data_test

(100000, 8)


Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258
...,...,...,...,...,...,...,...,...
99995,c-6,c-5,c-2,852,WN,CRP,HOU,187
99996,c-11,c-24,c-6,1446,UA,ORD,LAS,1515
99997,c-1,c-30,c-2,1509,OO,ORD,SGF,438
99998,c-1,c-5,c-5,804,DL,LGA,ATL,761


Подготовим тестовые данные с одним признаком для применения на алгоритме

In [49]:
X_test_real= data_test
X_test_real['HourDepTime']=X_test_real['DepTime'].map(lambda x: str(x)[:-2])
X_test_real['HourDepTime'].replace(['25','24',''],[0,0,0],inplace=True)
X_test_real['HourDepTime']=X_test_real['HourDepTime'].astype(int)
X_test_real = X_test_real.drop(['Month','DayofMonth','UniqueCarrier','Distance','DayOfWeek','DepTime', 'Origin', 'Dest'], axis=1)
X_test_real

Unnamed: 0,HourDepTime
0,6
1,7
2,6
3,16
4,15
...,...
99995,8
99996,14
99997,15
99998,8


In [50]:
y_test_real_predict_proba= forest.predict_proba(X_test_real)[:,1]
y_test_real_predict_proba

array([0.05597634, 0.05597634, 0.05597634, ..., 0.22566406, 0.05684373,
       0.05684373])

Теперь запишем полученную вероятность для класса Y в датасет с тестовой выборкой

In [55]:
data_test['predict_Y_dep_delay'] = y_test_real_predict_proba
data_test

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,HourDepTime,predict_Y_dep_delay
0,c-7,c-25,c-3,615,YV,MRY,PHX,598,6,0.055976
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235,7,0.055976
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577,6,0.055976
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377,16,0.230697
4,c-6,c-6,c-3,1505,UA,ORD,STL,258,15,0.225664
...,...,...,...,...,...,...,...,...,...,...
99995,c-6,c-5,c-2,852,WN,CRP,HOU,187,8,0.056844
99996,c-11,c-24,c-6,1446,UA,ORD,LAS,1515,14,0.194710
99997,c-1,c-30,c-2,1509,OO,ORD,SGF,438,15,0.225664
99998,c-1,c-5,c-5,804,DL,LGA,ATL,761,8,0.056844


In [56]:
data_test.to_csv('predictY_flight_delays_test.csv') 