In [1]:
!pip install xgboost
!pip install catboost

Collecting catboost
  Downloading catboost-1.1.1-cp39-none-win_amd64.whl (74.0 MB)
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.1.1 graphviz-0.20.1


In [62]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier 
from catboost import Pool, CatBoostClassifier
from catboost.utils import get_confusion_matrix

In [13]:
data = pd.read_csv('data/AirPass.csv')
data.drop(data.columns[[0]], axis=1, inplace=True)
data

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,94171,Female,disloyal Customer,23,Business travel,Eco,192,2,1,2,...,2,3,1,4,2,3,2,3,0.0,neutral or dissatisfied
103900,73097,Male,Loyal Customer,49,Business travel,Business,2347,4,4,4,...,5,5,5,5,5,5,4,0,0.0,satisfied
103901,68825,Male,disloyal Customer,30,Business travel,Business,1995,1,1,1,...,4,3,2,4,5,5,4,7,14.0,neutral or dissatisfied
103902,54173,Female,disloyal Customer,22,Business travel,Eco,1000,1,1,1,...,1,4,5,1,5,4,1,0,0.0,neutral or dissatisfied


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   id                                 103904 non-null  int64  
 1   Gender                             103904 non-null  object 
 2   Customer Type                      103904 non-null  object 
 3   Age                                103904 non-null  int64  
 4   Type of Travel                     103904 non-null  object 
 5   Class                              103904 non-null  object 
 6   Flight Distance                    103904 non-null  int64  
 7   Inflight wifi service              103904 non-null  int64  
 8   Departure/Arrival time convenient  103904 non-null  int64  
 9   Ease of Online booking             103904 non-null  int64  
 10  Gate location                      103904 non-null  int64  
 11  Food and drink                     1039

In [24]:
data.isnull().sum()

id                                   0
Gender                               0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Inflight wifi service                0
Departure/Arrival time convenient    0
Ease of Online booking               0
Gate location                        0
Food and drink                       0
Online boarding                      0
Seat comfort                         0
Inflight entertainment               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Inflight service                     0
Cleanliness                          0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
satisfaction                         0
dtype: int64

In [25]:
data['Arrival Delay in Minutes'].mean()

15.133392362180475

In [22]:
data['Arrival Delay in Minutes'].fillna(0, inplace=True)

In [30]:
data_gr_gender = data.groupby(['Class', 'satisfaction'])['id'].count().reset_index()
data_gr_gender

Unnamed: 0,Class,satisfaction,id
0,Business,neutral or dissatisfied,15185
1,Business,satisfied,34480
2,Eco,neutral or dissatisfied,38044
3,Eco,satisfied,8701
4,Eco Plus,neutral or dissatisfied,5650
5,Eco Plus,satisfied,1844


In [32]:
data['satisfaction'] = data['satisfaction'].map({'neutral or dissatisfied':0 , 'satisfied':1})
data['Customer Type'] = data['Customer Type'].map({'Loyal Customer':1, 'disloyal Customer':0})
data['Type of Travel'] = data['Type of Travel'].map({'Personal Travel':0, 'Business travel':1})

In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   id                                 103904 non-null  int64  
 1   Gender                             103904 non-null  object 
 2   Customer Type                      103904 non-null  int64  
 3   Age                                103904 non-null  int64  
 4   Type of Travel                     103904 non-null  int64  
 5   Class                              103904 non-null  object 
 6   Flight Distance                    103904 non-null  int64  
 7   Inflight wifi service              103904 non-null  int64  
 8   Departure/Arrival time convenient  103904 non-null  int64  
 9   Ease of Online booking             103904 non-null  int64  
 10  Gate location                      103904 non-null  int64  
 11  Food and drink                     1039

In [34]:
categoricals = ['Class', 'Gender']
df_dummies = pd.get_dummies(data, columns=categoricals)
df_dummies

Unnamed: 0,id,Customer Type,Age,Type of Travel,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,...,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,Class_Business,Class_Eco,Class_Eco Plus,Gender_Female,Gender_Male
0,70172,1,13,0,460,3,4,3,1,5,...,5,5,25,18.0,0,0,0,1,0,1
1,5047,0,25,1,235,3,2,3,3,1,...,4,1,1,6.0,0,1,0,0,0,1
2,110028,1,26,1,1142,2,2,2,2,5,...,4,5,0,0.0,1,1,0,0,1,0
3,24026,1,25,1,562,2,5,5,5,2,...,4,2,11,9.0,0,1,0,0,1,0
4,119299,1,61,1,214,3,3,3,3,4,...,3,3,0,0.0,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,94171,0,23,1,192,2,1,2,3,2,...,3,2,3,0.0,0,0,1,0,1,0
103900,73097,1,49,1,2347,4,4,4,4,2,...,5,4,0,0.0,1,1,0,0,0,1
103901,68825,0,30,1,1995,1,1,1,3,4,...,5,4,7,14.0,0,1,0,0,0,1
103902,54173,0,22,1,1000,1,1,1,5,1,...,4,1,0,0.0,0,0,1,0,1,0


In [35]:
df_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 27 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   id                                 103904 non-null  int64  
 1   Customer Type                      103904 non-null  int64  
 2   Age                                103904 non-null  int64  
 3   Type of Travel                     103904 non-null  int64  
 4   Flight Distance                    103904 non-null  int64  
 5   Inflight wifi service              103904 non-null  int64  
 6   Departure/Arrival time convenient  103904 non-null  int64  
 7   Ease of Online booking             103904 non-null  int64  
 8   Gate location                      103904 non-null  int64  
 9   Food and drink                     103904 non-null  int64  
 10  Online boarding                    103904 non-null  int64  
 11  Seat comfort                       1039

In [38]:
X = df_dummies.drop('satisfaction', axis=1)
y = df_dummies['satisfaction']
X.shape, y.shape

((103904, 26), (103904,))

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=26)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((83123, 26), (20781, 26), (83123,), (20781,))

In [42]:
scaler = StandardScaler()

X_trs = scaler.fit_transform(X_train)
X_ts = scaler.transform(X_test)

In [43]:
X_ts

array([[ 0.94082514,  0.47381845,  0.37290362, ..., -0.27897268,
        -1.0182236 ,  1.0182236 ],
       [-0.86687265,  0.47381845,  1.49930106, ..., -0.27897268,
         0.98210256, -0.98210256],
       [-1.63303947, -2.11051299, -1.15104586, ..., -0.27897268,
         0.98210256, -0.98210256],
       ...,
       [ 1.52620083, -2.11051299, -1.15104586, ..., -0.27897268,
        -1.0182236 ,  1.0182236 ],
       [-0.60330678,  0.47381845,  0.83671433, ..., -0.27897268,
        -1.0182236 ,  1.0182236 ],
       [ 1.34108544,  0.47381845, -0.02464842, ..., -0.27897268,
        -1.0182236 ,  1.0182236 ]])

#### Логистическая регрессия

In [45]:
lr = LogisticRegression()

lr.fit(X_trs, y_train)

y_train_predict = lr.predict(X_trs)
y_test_predict = lr.predict(X_ts)

print('Логистическая регрессия, обучающая выборка f1 score: {:.3f}'.format(f1_score(y_train, y_train_predict)))
print('Логистическая регрессия, тестовая выборка f1 score: {:.3f}'.format(f1_score(y_test, y_test_predict)))


Логистическая регрессия, обучающая выборка f1 score: 0.854
Логистическая регрессия, тестовая выборка f1 score: 0.855


#### AdaBoost

In [48]:
aboost = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(random_state=26),
    learning_rate=0.01,
    random_state=26
    )

aboost.fit(X_trs, y_train)

y_train_predict = aboost.predict(X_trs)
y_test_predict = aboost.predict(X_ts)

print('AdaBoost, обучающая выборка f1 score: {:.3f}'.format(f1_score(y_train, y_train_predict)))
print('AdaBoost, тестовая выборка f1 score: {:.3f}'.format(f1_score(y_test, y_test_predict)))

AdaBoost, обучающая выборка f1 score: 1.000
AdaBoost, тестовая выборка f1 score: 0.940


#### Градиентный бустинг

In [51]:
params = {"n_estimators":2**np.arange(8), "learning_rate":0.1**np.arange(3)}

gsv = GridSearchCV(
    estimator=GradientBoostingClassifier(), 
    param_grid=params,
    cv=3
)

gsv.fit(X_trs, y_train)
gsv.best_params_

{'learning_rate': 1.0, 'n_estimators': 128}

In [53]:
gsv.best_score_

0.9563658696077831

In [52]:
gboost = GradientBoostingClassifier(learning_rate=1.0, n_estimators=128)
gboost.fit(X_trs, y_train)

y_train_predict = gboost.predict(X_trs)
y_test_predict = gboost.predict(X_ts)

print('GradientBoosting, обучающая выборка f1 score: {:.3f}'.format(f1_score(y_train, y_train_predict)))
print('GradientBoosting, тестовая выборка f1 score: {:.3f}'.format(f1_score(y_test, y_test_predict)))

GradientBoosting, обучающая выборка f1 score: 0.961
GradientBoosting, тестовая выборка f1 score: 0.951


#### XGBoost

In [55]:
xgboost = XGBClassifier(random_state = 26)
xgboost.fit(X_trs, y_train)

y_train_predict = xgboost.predict(X_trs)
y_test_predict = xgboost.predict(X_ts)

print('XGBoost, обучающая выборка f1 score: {:.3f}'.format(f1_score(y_train, y_train_predict)))
print('XGBoost, тестовая выборка f1 score: {:.3f}'.format(f1_score(y_test, y_test_predict)))

XGBoost, обучающая выборка f1 score: 0.975
XGBoost, тестовая выборка f1 score: 0.958


#### CatBoostClassifier

In [58]:
catboost = CatBoostClassifier(random_state = 26)
catboost.fit(X_trs, y_train)

y_train_predict = catboost.predict(X_trs)
y_test_predict = catboost.predict(X_ts)

print('CatBoost, обучающая выборка f1 score: {:.3f}'.format(f1_score(y_train, y_train_predict)))
print('CatBoost, тестовая выборка f1 score: {:.3f}'.format(f1_score(y_test, y_test_predict)))

Learning rate set to 0.068023
0:	learn: 0.6008191	total: 63.1ms	remaining: 1m 3s
1:	learn: 0.5274362	total: 119ms	remaining: 59.3s
2:	learn: 0.4526907	total: 213ms	remaining: 1m 10s
3:	learn: 0.4085036	total: 277ms	remaining: 1m 9s
4:	learn: 0.3754344	total: 318ms	remaining: 1m 3s
5:	learn: 0.3316901	total: 369ms	remaining: 1m 1s
6:	learn: 0.3110767	total: 418ms	remaining: 59.3s
7:	learn: 0.2929223	total: 467ms	remaining: 57.9s
8:	learn: 0.2769612	total: 517ms	remaining: 56.9s
9:	learn: 0.2562048	total: 564ms	remaining: 55.9s
10:	learn: 0.2412529	total: 661ms	remaining: 59.4s
11:	learn: 0.2275560	total: 756ms	remaining: 1m 2s
12:	learn: 0.2162922	total: 832ms	remaining: 1m 3s
13:	learn: 0.2098957	total: 879ms	remaining: 1m 1s
14:	learn: 0.1997358	total: 916ms	remaining: 1m
15:	learn: 0.1950370	total: 948ms	remaining: 58.3s
16:	learn: 0.1870090	total: 981ms	remaining: 56.7s
17:	learn: 0.1830433	total: 1.01s	remaining: 55s
18:	learn: 0.1798688	total: 1.04s	remaining: 53.7s
19:	learn: 0.1

In [63]:
get_confusion_matrix(catboost, Pool(X_trs, y_train))

array([[46664.,   542.],
       [ 1257., 34660.]])

#### Важные признаки

In [64]:
features = X_train.columns
feature_import_dict = {}

importance = catboost.feature_importances_

for i in range(0,len(importance)):
    feature_import_dict[features[i]] = importance[i]

feature_import = pd.DataFrame.from_dict(list(feature_import_dict.items()))

dfh = sorted(feature_import_dict.items(), key=lambda item: item[1])
dfh.reverse()
dfh

[('Inflight wifi service', 26.252262577201343),
 ('Type of Travel', 19.42465768665394),
 ('Online boarding', 7.020790514593494),
 ('Customer Type', 6.859378956382707),
 ('Class_Business', 4.817540039654346),
 ('Gate location', 3.7237965769860546),
 ('Checkin service', 3.6525972335458383),
 ('Baggage handling', 3.5999009347916355),
 ('Seat comfort', 3.2423044551635276),
 ('Age', 3.238331752716842),
 ('Inflight entertainment', 2.799303755739565),
 ('Inflight service', 2.576591252808898),
 ('id', 1.9646608624751996),
 ('Flight Distance', 1.7216473696774863),
 ('Cleanliness', 1.6811865139750486),
 ('Ease of Online booking', 1.4834644706420932),
 ('On-board service', 1.4372081097910077),
 ('Departure/Arrival time convenient', 1.3598360970576462),
 ('Leg room service', 1.1016563172020877),
 ('Arrival Delay in Minutes', 0.8615169192306723),
 ('Departure Delay in Minutes', 0.44066471697164283),
 ('Food and drink', 0.3003980041567558),
 ('Class_Eco', 0.1922933511803407),
 ('Class_Eco Plus', 0.1