In [1]:
import numpy as np
import pandas as pd
import scipy as sp

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
# Remote CSV locations (Datamanim datarepo, airline folder) for the training set,
# the test set and the submission file. Plain string literals are used because
# nothing is interpolated into these URLs.
trainData = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/airline/train.csv'
testData = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/airline/test.csv'
subData = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/airline/submission.csv'

In [3]:
# Download each CSV from its URL and parse it into a DataFrame.
train = pd.read_csv(filepath_or_buffer=trainData)
test = pd.read_csv(filepath_or_buffer=testData)
sub = pd.read_csv(filepath_or_buffer=subData)

In [4]:
# Number of missing values in each column of the training frame.
train.isna().sum(axis=0)

id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             256
satisfaction                           0
dtype: int64

In [5]:
# (rows, columns) of the three loaded frames, in train / test / sub order.
tuple(frame.shape for frame in (train, test, sub))

((83123, 24), (20781, 23), (20781, 1))

In [6]:
# Target: the raw satisfaction label column.
y = train['satisfaction']
# Features: everything except the row identifier, the arrival-delay column
# and the target itself.
X = train.drop(columns=['id', 'Arrival Delay in Minutes', 'satisfaction'])

In [7]:
# One-hot encode the non-numeric feature columns.
X = pd.get_dummies(X)
# Binary target: 1 for 'satisfied', 0 for anything else. It is derived from the
# raw label column in `train` rather than from `y` itself, so re-running this
# cell cannot remap already-encoded labels to all zeros.
y = (train['satisfaction'] == 'satisfied').astype('int64')
# Keep the encoded column layout so other frames can be aligned to it later.
cols = X.columns

In [8]:
# Hold out 30% of the labelled rows for validation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2020,
)

In [9]:
# Random forest: fit on the training split, then score the hold-out split.
rf = RandomForestClassifier(random_state=2020)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_pred_prob = rf.predict_proba(X_test)

# Accuracy and F1 use the hard predictions; ROC AUC uses the second
# probability column returned by predict_proba.
print(accuracy_score(y_true=y_test, y_pred=y_pred))
print(f1_score(y_true=y_test, y_pred=y_pred))
print(roc_auc_score(y_true=y_test, y_score=y_pred_prob[:, 1]))

0.9618638970204917
0.9554044548651818
0.9939074831694907


In [10]:
# Gradient boosting: fit on the training split, report accuracy and F1 on the
# hold-out split.
gb = GradientBoostingClassifier(random_state=2020)
gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))
print(f1_score(y_true=y_test, y_pred=y_pred))

0.9402895296146289
0.9304952621014797


In [11]:
# AdaBoost: fit on the training split, report accuracy and F1 on the
# hold-out split.
ab = AdaBoostClassifier(random_state=2020)
ab.fit(X_train, y_train)

y_pred = ab.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))
print(f1_score(y_true=y_test, y_pred=y_pred))

0.9268155752496291
0.9151912263581022


In [12]:
# Single decision tree: fit on the training split, report accuracy and F1 on
# the hold-out split.
dc = DecisionTreeClassifier(random_state=2020)
dc.fit(X_train, y_train)

y_pred = dc.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))
print(f1_score(y_true=y_test, y_pred=y_pred))

0.9440590287524562
0.9358650177003356


In [13]:
# Give the scoring set the same preprocessing as the training features.
# errors='ignore' keeps this cell re-runnable: on a second run `test` no longer
# contains these columns and a plain drop would raise KeyError.
test = test.drop(columns=['id', 'Arrival Delay in Minutes'], errors='ignore')
# Align to the training column layout. reindex (rather than [cols]) also copes
# with a category that was seen in training but is absent here: the missing
# dummy column is created and filled with 0 instead of raising KeyError.
test = pd.get_dummies(test).reindex(columns=cols, fill_value=0)
# Predict with the random forest fitted earlier in the notebook.
test_pred = rf.predict(test)

In [14]:
# Store the predictions in `sub` under the integer column label 0, then write
# the frame to disk without the row index.
# NOTE(review): confirm that a column literally named 0 is the expected
# submission format.
sub[0] = test_pred
sub.to_csv(path_or_buf='17026.csv', index=False)