In [167]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import make_scorer
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import confusion_matrix
from catboost import Pool

In [137]:
df = pd.read_csv('data/AirPass.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [138]:
df.isnull().sum().sum()

np.int64(310)

In [139]:
df['Arrival Delay in Minutes'] = df['Arrival Delay in Minutes'].fillna(df['Arrival Delay in Minutes'].median())
df['Arrival Delay in Minutes'].mean()

np.float64(15.133392362180475)

In [140]:
df.groupby('Gender')['satisfaction'].value_counts()*100/df.shape[0]

df.groupby('Type of Travel')['satisfaction'].value_counts()*100/df.shape[0]

df.groupby('Class')['satisfaction'].value_counts()*100/df.shape[0]

Class     satisfaction           
Business  satisfied                  33.184478
          neutral or dissatisfied    14.614452
Eco       neutral or dissatisfied    36.614567
          satisfied                   8.374076
Eco Plus  neutral or dissatisfied     5.437712
          satisfied                   1.774715
Name: count, dtype: float64

In [141]:
df['satisfaction'] = df['satisfaction'].map({'neutral or dissatisfied':0 , 'satisfied':1})
df['Customer Type'] = df['Customer Type'].map({'Loyal Customer':1, 'disloyal Customer':0})
df['Type of Travel'] = df['Type of Travel'].map({'Personal Travel':0, 'Business travel':1})
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})

In [142]:
df=pd.get_dummies(df)
df.shape

(103904, 27)

In [143]:
X = df.drop('satisfaction', axis=1)
y = df['satisfaction']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=26)
y_test.shape


(20781,)

In [144]:
scaler = StandardScaler()
scaler.fit(X_train) 
X_train = scaler.transform(X_train) 
X_test = scaler.transform(X_test) 
X_test[0][0]

np.float64(1.193295224074621)

In [145]:
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
preds_test = model_lr.predict(X_test)
f1_score(preds_test, y_test)

np.float64(0.8550407188991856)

In [146]:
model_ada = AdaBoostClassifier(DecisionTreeClassifier(random_state=26),random_state=26,learning_rate=0.01)

model_ada.fit(X_train, y_train)
preds_test = model_ada.predict(X_test)
f1_score(preds_test, y_test)



np.float64(0.9376063216813917)

In [147]:
model_for_gs = GradientBoostingClassifier()
params = {"n_estimators":2**np.arange(8), "learning_rate":0.1**np.arange(3)}
gs = GridSearchCV(model_for_gs, 
                  params, 
                  cv=3, 
                  scoring=make_scorer(f1_score),
                  verbose=5)
 
gs.fit(X_train, y_train)
 
print("Лучшие гиперпараметры:", gs.best_params_)
print("Лучшее значение метрики:", gs.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END .learning_rate=1.0, n_estimators=1;, score=0.873 total time=   0.3s
[CV 2/3] END .learning_rate=1.0, n_estimators=1;, score=0.870 total time=   0.3s
[CV 3/3] END .learning_rate=1.0, n_estimators=1;, score=0.871 total time=   0.3s
[CV 1/3] END .learning_rate=1.0, n_estimators=2;, score=0.880 total time=   0.6s
[CV 2/3] END .learning_rate=1.0, n_estimators=2;, score=0.878 total time=   0.6s
[CV 3/3] END .learning_rate=1.0, n_estimators=2;, score=0.875 total time=   0.7s
[CV 1/3] END .learning_rate=1.0, n_estimators=4;, score=0.901 total time=   1.3s
[CV 2/3] END .learning_rate=1.0, n_estimators=4;, score=0.896 total time=   1.4s
[CV 3/3] END .learning_rate=1.0, n_estimators=4;, score=0.897 total time=   1.3s
[CV 1/3] END .learning_rate=1.0, n_estimators=8;, score=0.920 total time=   2.8s
[CV 2/3] END .learning_rate=1.0, n_estimators=8;, score=0.920 total time=   2.5s
[CV 3/3] END .learning_rate=1.0, n_estimators=8;

In [148]:
model_xgb = XGBClassifier(random_state=26)
model_xgb.fit(X_train,y_train)
preds_test = model_xgb.predict(X_test)
f1_score(preds_test, y_test)

np.float64(0.9573191063687896)

In [149]:
model = CatBoostClassifier(random_state=26)
model.fit(X_train, y_train)
preds_class = model.predict(X_test)
f1_score(preds_class, y_test)

Learning rate set to 0.068023
0:	learn: 0.6018138	total: 19.2ms	remaining: 19.2s
1:	learn: 0.5117898	total: 42.2ms	remaining: 21.1s
2:	learn: 0.4550663	total: 65.8ms	remaining: 21.9s
3:	learn: 0.4109981	total: 88ms	remaining: 21.9s
4:	learn: 0.3586401	total: 105ms	remaining: 20.9s
5:	learn: 0.3283924	total: 134ms	remaining: 22.2s
6:	learn: 0.3068410	total: 157ms	remaining: 22.3s
7:	learn: 0.2872778	total: 179ms	remaining: 22.2s
8:	learn: 0.2702367	total: 198ms	remaining: 21.9s
9:	learn: 0.2568042	total: 226ms	remaining: 22.4s
10:	learn: 0.2460247	total: 250ms	remaining: 22.5s
11:	learn: 0.2371696	total: 274ms	remaining: 22.6s
12:	learn: 0.2227434	total: 298ms	remaining: 22.6s
13:	learn: 0.2113300	total: 326ms	remaining: 22.9s
14:	learn: 0.2019336	total: 368ms	remaining: 24.2s
15:	learn: 0.1969025	total: 415ms	remaining: 25.5s
16:	learn: 0.1919289	total: 466ms	remaining: 26.9s
17:	learn: 0.1882537	total: 513ms	remaining: 28s
18:	learn: 0.1816900	total: 554ms	remaining: 28.6s
19:	learn: 

np.float64(0.9603382099349168)

In [174]:
# Создаем предсказания на обучающей выборке
y_train_pred = model.predict(Pool(X_train, y_train))

# Получаем матрицу ошибок
cm = confusion_matrix(y_train, y_train_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[46684   522]
 [ 1238 34679]]


In [175]:
pd.DataFrame(
    {
        "feature_importance": clf.get_feature_importance(),
        "feature_names": df.drop(columns="satisfaction").columns,
    }
).sort_values(by=["feature_importance"], ascending=False)

NameError: name 'clf' is not defined