In [None]:
import pandas as pd
import sklearn as sk
from sklearn import ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn import tree

In [None]:
# Read the two CSV files and pool them into a single frame.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Concatenate once (the earlier duplicate concat was immediately overwritten).
# ignore_index=True gives the pooled frame a unique 0..n-1 index instead of
# repeating each file's own row labels.
# "Unnamed: 0" and "id" are dropped so they are not used as features
# (presumably they are row identifiers -- confirm against the CSV files).
df = pd.concat([df_train, df_test], ignore_index=True).drop(columns=["Unnamed: 0", "id"])
df.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [None]:
# Integer codes for every categorical column, including the target.
CATEGORY_CODES = {
    "Gender": {"Male": 0, "Female": 1},
    "Customer Type": {"Loyal Customer": 0, "disloyal Customer": 1},
    "Class": {"Eco": 0, "Eco Plus": 1, "Business": 2},
    "Type of Travel": {"Personal Travel": 0, "Business travel": 1},
    "satisfaction": {"satisfied": 1, "neutral or dissatisfied": 0},
}

for column, codes in CATEGORY_CODES.items():
    encoded = df[column].map(codes)
    # Series.map turns any label that is missing from `codes` into NaN.
    # Fail loudly instead of letting an unexpected label (or an accidental
    # second run of this cell on already-encoded data) slip through as NaN.
    unmapped = df.loc[df[column].notna() & encoded.isna(), column].unique()
    if len(unmapped):
        raise ValueError(f"Unexpected values in column {column!r}: {list(unmapped)}")
    df[column] = encoded

df.head()

In [None]:
# Replace every missing value in the frame with 0.0.
# (df.isna().sum() shows the per-column missing counts beforehand.)
df = df.fillna(value=0.0)

In [None]:
# Derived feature: element-wise sum of the departure and arrival delay columns.
df["Total delay"] = df['Departure Delay in Minutes'].add(df['Arrival Delay in Minutes'])

In [None]:
# Display the dtype of every column in df.
df.dtypes

Gender                                 int64
Customer Type                          int64
Age                                    int64
Type of Travel                         int64
Class                                  int64
Flight Distance                        int64
Inflight wifi service                  int64
Departure/Arrival time convenient      int64
Ease of Online booking                 int64
Gate location                          int64
Food and drink                         int64
Online boarding                        int64
Seat comfort                           int64
Inflight entertainment                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Inflight service                       int64
Cleanliness                            int64
Departure Delay in Minutes             int64
Arrival Delay in Minutes             float64
satisfaction                           int64
Total delay                          float64
dtype: object

In [None]:
# Target is the "satisfaction" column; the features are every other column.
y = df["satisfaction"]
X = df.drop(columns=["satisfaction"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

## DecisionTreeClassifier

In [None]:
# Fit a single depth-limited decision tree and report its accuracy on the
# held-out split.
criterion = "gini"
max_depth = 17

clf = tree.DecisionTreeClassifier(
    criterion=criterion,
    max_depth=max_depth,
    random_state=42,
).fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print(criterion, max_depth, accuracy)

gini 17 0.9533030489682784


## Ensembles

In [None]:
# Fit a random forest, report held-out accuracy, then list the features by
# their importance in the fitted forest.
criterion = "gini"
max_depth = 17
n_estimators = 200

clf = ensemble.RandomForestClassifier(
    n_estimators=n_estimators,
    criterion=criterion,
    max_depth=max_depth,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print(n_estimators, criterion, max_depth, accuracy)
# Recorded result: 200 gini 17 0.9609639667385279

# One row per feature, most important first.
importances = pd.DataFrame({"feature": X.columns, "importance": clf.feature_importances_})
print(importances.sort_values("importance", ascending=False))


In [None]:
# Fit a gradient-boosted tree ensemble, report held-out accuracy, then list
# the features by their importance in the fitted model.
n_estimators = 400

clf = ensemble.GradientBoostingClassifier(
    n_estimators=n_estimators,
    random_state=42,
).fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print(n_estimators, accuracy)

# Recorded result: 400 0.9573837388358485

# One row per feature, most important first.
importances = pd.DataFrame({"feature": X.columns, "importance": clf.feature_importances_})
print(importances.sort_values("importance", ascending=False))

In [None]:
# Stack a random forest and a gradient-boosted ensemble; a logistic regression
# is the final estimator that combines their predictions.
estimators = [
    # random_state is pinned here as well so the stacked result is reproducible,
    # matching the seeded standalone models elsewhere in this notebook.
    ('rf', sk.ensemble.RandomForestClassifier(n_estimators=200, criterion="gini", max_depth=17, random_state=42)),
    ('gbt', sk.ensemble.GradientBoostingClassifier(n_estimators=400, random_state=42)),
]
final_estimator = sk.linear_model.LogisticRegression()

clf = sk.ensemble.StackingClassifier(estimators, final_estimator=final_estimator, n_jobs=-1)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(sk.metrics.accuracy_score(y_test, y_pred))

# Accuracy recorded from an earlier run in which the forest was not seeded
# (so it may not reproduce exactly): 0.9606944872189713