# Hackathon Airline Passenger Satisfaction

### Data Preprocessing


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def clean_dataset(df):
    """Return a float64 copy of *df* with unusable rows removed.

    A row is removed when any of its cells is missing (NaN/None) or is
    positive or negative infinity. The input frame is left untouched; the
    cleaning is done on a copy.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose remaining cells must all be convertible to float.

    Returns
    -------
    pd.DataFrame
        The surviving rows (original index labels preserved), cast to
        ``np.float64``.

    Raises
    ------
    AssertionError
        If *df* is not a ``pd.DataFrame``.
    ValueError
        Propagated from ``astype`` when a surviving cell cannot be
        converted to float (e.g. a text column).
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # dropna() without inplace=True returns a new frame, so the caller's
    # DataFrame is not modified as a side effect.
    without_missing = df.dropna()
    # NaN rows are already gone; only the infinities still need filtering.
    # `axis` must be passed by keyword: pandas 2.0 removed the positional form.
    has_infinity = without_missing.isin([np.inf, -np.inf]).any(axis=1)
    return without_missing[~has_infinity].astype(np.float64)

In [None]:
# Load the training data from the working directory.
dataset = pd.read_csv('train.csv')

# Features: every column except the first two and the last three.
# Target: the third column from the end.
# NOTE(review): this positional slicing assumes a fixed column layout in
# train.csv -- confirm against the file's header before reusing.
X = dataset.iloc[:, 2:-3].values
Y = dataset.iloc[:, -3].values

In [None]:
# Inspect the raw feature matrix; the categorical columns are still text here.
print(X)

[['Male' 'Loyal Customer' 13 ... 5 25 18.0]
 ['Male' 'disloyal Customer' 25 ... 1 1 6.0]
 ['Female' 'Loyal Customer' 26 ... 5 0 0.0]
 ...
 ['Male' 'disloyal Customer' 30 ... 4 7 14.0]
 ['Female' 'disloyal Customer' 22 ... 1 0 0.0]
 ['Male' 'Loyal Customer' 27 ... 1 0 0.0]]


In [None]:
# Inspect the raw target labels (still text at this point).
print(Y)

['neutral or dissatisfied' 'neutral or dissatisfied' 'satisfied' ...
 'neutral or dissatisfied' 'neutral or dissatisfied'
 'neutral or dissatisfied']


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (the 'Male'/'Female' column) and pass every other
# column through unchanged. The encoded indicator columns are placed at the
# front of the result, so all remaining columns shift position.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [None]:
# Check the matrix after the first encoding pass: each row now starts with
# two indicator values.
print(X)

[[0.0 1.0 'Loyal Customer' ... 5 25 18.0]
 [0.0 1.0 'disloyal Customer' ... 1 1 6.0]
 [1.0 0.0 'Loyal Customer' ... 5 0 0.0]
 ...
 [0.0 1.0 'disloyal Customer' ... 4 7 14.0]
 [1.0 0.0 'disloyal Customer' ... 1 0 0.0]
 [0.0 1.0 'Loyal Customer' ... 1 0 0.0]]


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Second encoding pass: one-hot encode what is now column 2 of X (the
# 'Loyal Customer'/'disloyal Customer' column after the previous pass moved
# it). Its indicator columns go to the front; everything else passes through.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [2])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [None]:
# Check the matrix after the second encoding pass.
print(X)

[[1.0 0.0 0.0 ... 5 25 18.0]
 [0.0 1.0 0.0 ... 1 1 6.0]
 [1.0 0.0 1.0 ... 5 0 0.0]
 ...
 [0.0 1.0 0.0 ... 4 7 14.0]
 [0.0 1.0 1.0 ... 1 0 0.0]
 [1.0 0.0 0.0 ... 1 0 0.0]]


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Third encoding pass: one-hot encode what is now column 5 of X; all other
# columns pass through unchanged behind the new indicator columns.
# NOTE(review): the column index depends on the shifts introduced by the two
# earlier passes -- confirm it targets the intended categorical column and
# that no text column is left in X afterwards.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [5])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [None]:
# Check the matrix after the third encoding pass.
print(X)

[[0.0 1.0 1.0 ... 5 25 18.0]
 [1.0 0.0 0.0 ... 1 1 6.0]
 [1.0 0.0 1.0 ... 5 0 0.0]
 ...
 [1.0 0.0 0.0 ... 4 7 14.0]
 [1.0 0.0 0.0 ... 1 0 0.0]
 [1.0 0.0 1.0 ... 1 0 0.0]]


In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert the text labels in Y to integer class ids:
#   0 - neutral or dissatisfied
#   1 - satisfied
le = LabelEncoder()
le.fit(Y)
Y = le.transform(Y)

In [None]:
# Confirm the labels are now integer class ids.
print(Y)

[0 0 1 ... 0 0 0]


In [None]:
from sklearn.impute import SimpleImputer

# Replace every NaN in X with the mean of its column, writing the result
# back into the existing array.
# NOTE(review): the imputer is fitted on the full dataset before the
# train/test split, so test rows contribute to the column means -- consider
# fitting on the training split only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, :] = imputer.fit(X).transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Inspect the training features.
print(X_train)

[[0.0 1.0 1.0 ... 5.0 0.0 0.0]
 [1.0 0.0 1.0 ... 3.0 47.0 29.0]
 [1.0 0.0 0.0 ... 1.0 0.0 0.0]
 ...
 [1.0 0.0 1.0 ... 3.0 0.0 0.0]
 [1.0 0.0 0.0 ... 2.0 0.0 0.0]
 [1.0 0.0 1.0 ... 4.0 42.0 47.0]]


In [None]:
# Inspect the held-out test features.
print(X_test)

[[1.0 0.0 1.0 ... 5.0 0.0 0.0]
 [1.0 0.0 1.0 ... 1.0 17.0 17.0]
 [0.0 1.0 1.0 ... 1.0 0.0 0.0]
 ...
 [1.0 0.0 1.0 ... 4.0 0.0 5.0]
 [1.0 0.0 1.0 ... 2.0 0.0 0.0]
 [1.0 0.0 1.0 ... 3.0 0.0 0.0]]


In [None]:
# Inspect the training labels.
print(Y_train)

[0 0 1 ... 0 0 1]


In [None]:
# Inspect the held-out test labels.
print(Y_test)

[1 0 0 ... 1 0 1]


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise columns 6 onward; columns 0-5 are left as they are
# (presumably the one-hot indicator columns -- verify against the encoding
# cells). The scaler learns its statistics from the training split only and
# the same statistics are then applied to the test split.
sc = StandardScaler()
sc.fit(X_train[:, 6:])
X_train[:, 6:] = sc.transform(X_train[:, 6:])
X_test[:, 6:] = sc.transform(X_test[:, 6:])

In [None]:
# Inspect the training features after scaling.
print(X_train)

[[0.0 1.0 1.0 ... 1.3007846408260046 -0.3861211998420926
  -0.3921030467012177]
 [1.0 0.0 1.0 ... -0.221804168414145 0.8471947739562575 0.362768710522216]
 [1.0 0.0 0.0 ... -1.7443929776542946 -0.3861211998420926
  -0.3921030467012177]
 ...
 [1.0 0.0 1.0 ... -0.221804168414145 -0.3861211998420926
  -0.3921030467012177]
 [1.0 0.0 0.0 ... -0.9830985730342198 -0.3861211998420926
  -0.3921030467012177]
 [1.0 0.0 1.0 ... 0.5394902362059298 0.715990946956433 0.8313098012126231]]


In [None]:
# Inspect the test features after scaling.
print(X_test)

[[1.0 0.0 1.0 ... 1.3007846408260046 -0.3861211998420926
  -0.3921030467012177]
 [1.0 0.0 1.0 ... -1.7443929776542946 0.0599718119573106
  0.05040798339527789]
 [0.0 1.0 1.0 ... -1.7443929776542946 -0.3861211998420926
  -0.3921030467012177]
 ...
 [1.0 0.0 1.0 ... 0.5394902362059298 -0.3861211998420926
  -0.26195274373166016]
 [1.0 0.0 1.0 ... -0.9830985730342198 -0.3861211998420926
  -0.3921030467012177]
 [1.0 0.0 1.0 ... -0.221804168414145 -0.3861211998420926
  -0.3921030467012177]]


In [None]:
# Model: random forest classifier with 175 trees, entropy as the split
# criterion, and a fixed seed so training is repeatable.
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(
    n_estimators=175,
    criterion='entropy',
    random_state=0,
)
# Left as a bare expression so the notebook displays the fitted estimator.
classifier.fit(X_train, Y_train)

RandomForestClassifier(criterion='entropy', n_estimators=175, random_state=0)

In [277]:
# Evaluate the trained classifier on the held-out test split.
from sklearn.metrics import confusion_matrix, accuracy_score
Y_pred = classifier.predict(X_test)
# In scikit-learn's confusion matrix, rows are the true classes and columns
# are the predicted classes.
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
print("The Accuracy of the Model is: ")
# Bare expression: the notebook displays the accuracy as a percentage.
accuracy_score(Y_test, Y_pred)*100

[[11620   255]
 [  556  8350]]
The Accuracy of the Model is: 


96.09739666041095