<div>
    <img src="https://live-production.wcms.abc-cdn.net.au/ac56ffe2b5282f82358e6b396e2da2ba?impolicy=wcms_crop_resize&cropH=1915&cropW=3404&xPos=5&yPos=0&width=862&height=485" width="500"/>
</div>


---

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer        # fills in missing values
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline           # builds a pipeline
from sklearn.preprocessing import OneHotEncoder # converts categorical values to numeric ones
from sklearn.tree import DecisionTreeClassifier

## 0. Problem Statement

About Company:

**TSI Airlines** is the largest airline in Kyrgyzstan by size and number of passengers carried.

#### Problem
You need to create a model that will accurately predict passenger **satisfaction**.

In [2]:
# Load the training data from CSV and preview the first rows.
df = pd.read_csv("satisfaction_train.csv")

df.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,87265,Male,Loyal Customer,25,Business travel,Business,3904,4,4,1,...,5,3,3,4,4,5,5,0,0.0,satisfied
1,65499,Female,disloyal Customer,26,Business travel,Business,1067,5,5,5,...,4,3,2,5,3,5,4,0,0.0,satisfied
2,10789,Female,Loyal Customer,55,Business travel,Business,3953,4,5,5,...,4,4,4,4,4,4,4,0,11.0,neutral or dissatisfied
3,5490,Male,Loyal Customer,32,Business travel,Eco,910,1,1,1,...,1,1,1,4,4,3,1,0,0.0,neutral or dissatisfied
4,99452,Female,Loyal Customer,36,Business travel,Business,283,1,1,1,...,5,5,5,5,3,5,3,17,20.0,satisfied


In [3]:
# Columns imputed with a numeric statistic by fill_missing_values.
numerical_cols = [
    'Age', 'Flight Distance', 'Inflight wifi service',
    'Departure/Arrival time convenient', 'Ease of Online booking',
    'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
    'Inflight entertainment', 'On-board service', 'Leg room service',
    'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness',
    'Departure Delay in Minutes', 'Arrival Delay in Minutes',
]
# Columns encoded as numbers; the target 'satisfaction' is included so that it
# is encoded together with the categorical features.
categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'satisfaction']

# errors='ignore' keeps this cell re-runnable: a second run no longer raises
# KeyError once 'id' has already been removed.
# NOTE(review): the 'Unnamed: 0' column shown in the preview above is not
# dropped, so it is passed to the model as a feature - confirm this is intended.
df = df.drop(columns = ['id'], errors='ignore')

In [4]:
# Preliminary hold-out split on the raw frame; X, y and the four split
# variables are reassigned after preprocessing further down.
y = df["satisfaction"].copy()
X = df.drop(columns=["satisfaction"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## 1. Data Preprocessing

In [5]:
# Helper FUNCTIONS for automating data preparation

def fill_missing_values(df: pd.DataFrame, strategy: str ='mean') -> pd.DataFrame:
    """Return a copy of ``df`` with missing values filled column by column.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is copied first and never modified.
    strategy : str, default 'mean'
        'mean', 'median' or 'mode' (the first mode is used). Any other value
        fills missing entries with the constant 0.

    Returns
    -------
    pd.DataFrame
        A new frame with the same columns as ``df``.
    """
    filled = df.copy()
    for column in filled.columns:
        series = filled[column]
        if strategy == 'mean':
            fill_value = series.mean()
        elif strategy == 'median':
            fill_value = series.median()
        elif strategy == 'mode':
            modes = series.mode()
            if modes.empty:
                # Column has no non-missing values, so there is no mode to use;
                # leave it as is instead of failing on an empty lookup.
                continue
            fill_value = modes.iloc[0]  # take the first mode
        else:
            fill_value = 0

        # Assign the result back instead of calling fillna(inplace=True) on
        # df[column]: the in-place form acts on a temporary object and does not
        # update the frame when pandas Copy-on-Write is enabled.
        filled[column] = series.fillna(fill_value)
    return filled


def one_hot_encoder(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Convert the categorical ``columns`` of ``df`` to numeric columns.

    Columns with at most two distinct values are replaced by integer codes
    under the same column name. Codes follow the sorted order of the values,
    so the same value receives the same code in every frame regardless of row
    order. Columns with more distinct values are expanded with
    ``pd.get_dummies`` using the column name as prefix.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is not modified.
    columns : list
        Names of the categorical columns to encode.

    Returns
    -------
    pd.DataFrame
        The untouched columns first, followed by the encoded columns in the
        order given by ``columns``.

    Notes
    -----
    Dummy columns are derived from the values present in ``df``; frames that
    contain different sets of categories produce different columns.
    """
    # Columns that are not encoded are passed through unchanged.
    parts = [df.drop(columns = columns)]

    for column in columns:
        series = df[column]
        categories = series.unique()
        # Two or fewer distinct values: a single integer-coded column suffices.
        if len(categories) <= 2:
            mapping = {
                value: code
                for code, value in enumerate(_ordered_categories(categories))
            }
            # map() looks up every original value exactly once, so numeric
            # categories such as [1, 0] cannot overwrite each other.
            parts.append(series.map(mapping))
        else:  # otherwise expand into indicator columns
            parts.append(pd.get_dummies(series, prefix=column))

    # Join the untouched and the encoded columns in a single concat.
    return pd.concat(parts, axis=1)


def _ordered_categories(values) -> list:
    """Return ``values`` in a deterministic order: sorted present values, then missing ones."""
    present = [value for value in values if not pd.isna(value)]
    missing = [value for value in values if pd.isna(value)]
    try:
        present.sort()
    except TypeError:
        # Values of mixed, non-comparable types: order by their text form.
        present.sort(key=str)
    return present + missing

In [6]:
# Impute missing values (column mean for numeric columns, most frequent value
# for categorical ones), then encode the categorical columns as numbers.
# NOTE(review): the fill statistics are computed on the whole frame, before the
# train/test split performed further down.
# NOTE(review): this cell rebinds `df`, so it is not idempotent - re-run from
# the data-loading cell rather than re-running it on its own.
df[numerical_cols] = fill_missing_values( df[numerical_cols] )
df[categorical_cols] = fill_missing_values(df[categorical_cols], strategy='mode')
df = one_hot_encoder( df, categorical_cols )

In [7]:
# Separate the encoded target column from the feature matrix.
y = df['satisfaction'].copy()
X = df.drop(columns=['satisfaction'])

In [8]:
# Hold out 20% of the prepared rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## 2. Modeling

In [10]:
# Random forest with 300 unrestricted-depth trees.
model = RandomForestClassifier(
    max_depth=None,
    min_samples_split=2,
    n_estimators=300,
    random_state=1,  # fixed seed so that the scores below are reproducible
)

model.fit(X_train, y_train)
# (accuracy on the training split, accuracy on the held-out split)
model.score(X_train, y_train), model.score(X_test, y_test)

(1.0, 0.9615625)

## 3. Hyperparameter Tuning (Find Best Parameters)

In [7]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate values for each random-forest hyperparameter.
list_n_estimators = [
    10, 30, 50, 70, 100,
    120, 150, 200, 300,
]
list_max_depth = [
    1, 2, 4, 6, 8,
    10, 13, 17, 20, None,
]
list_min_samples_split = [
    2, 3, 5, 7,
    9, 11, 15,
]

In [None]:
# Parameter grid: estimator argument name -> list of candidate values.
grid = dict(
    n_estimators=list_n_estimators,
    max_depth=list_max_depth,
    min_samples_split=list_min_samples_split,
)

In [None]:
# Base estimator for the grid search; the seed makes each fit repeatable.
model = RandomForestClassifier(random_state=1)

In [None]:
# Exhaustive search over `grid` with 3-fold cross-validation on all CPU cores;
# error_score='raise' surfaces any failing fit instead of scoring it as NaN.
search = GridSearchCV(
    model,
    grid,
    cv=3,
    n_jobs=-1,
    error_score='raise',
)

In [None]:
%%time
# Runs the grid search on the training split. The grid defined above has
# 9 * 10 * 7 = 630 parameter combinations, each cross-validated with 3 folds,
# so this cell is expensive; %%time reports how long it took.
search.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
search.best_params_

In [None]:
# Final model.
# NOTE(review): the hyperparameters are hard-coded; presumably they are the
# values reported by search.best_params_ - confirm after running the search.
model = RandomForestClassifier(
    max_depth=None,
    min_samples_split=2,
    n_estimators=300,
    random_state=1,  # fixed seed so that the scores below are reproducible
)

model.fit(X_train, y_train)
# (accuracy on the training split, accuracy on the held-out split)
model.score(X_train, y_train), model.score(X_test, y_test)

## 4. Write Pipeline For Data Preparation and Prediction

In [14]:
# Helper FUNCTIONS for automating data preparation

def fill_missing_values(df: pd.DataFrame, strategy: str ='mean') -> pd.DataFrame:
    """Return a copy of ``df`` with missing values filled column by column.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is copied first and never modified.
    strategy : str, default 'mean'
        'mean', 'median' or 'mode' (the first mode is used). Any other value
        fills missing entries with the constant 0.

    Returns
    -------
    pd.DataFrame
        A new frame with the same columns as ``df``.
    """
    filled = df.copy()
    for column in filled.columns:
        series = filled[column]
        if strategy == 'mean':
            fill_value = series.mean()
        elif strategy == 'median':
            fill_value = series.median()
        elif strategy == 'mode':
            modes = series.mode()
            if modes.empty:
                # Column has no non-missing values, so there is no mode to use;
                # leave it as is instead of failing on an empty lookup.
                continue
            fill_value = modes.iloc[0]  # take the first mode
        else:
            fill_value = 0

        # Assign the result back instead of calling fillna(inplace=True) on
        # df[column]: the in-place form acts on a temporary object and does not
        # update the frame when pandas Copy-on-Write is enabled.
        filled[column] = series.fillna(fill_value)
    return filled


def one_hot_encoder(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Convert the categorical ``columns`` of ``df`` to numeric columns.

    Columns with at most two distinct values are replaced by integer codes
    under the same column name. Codes follow the sorted order of the values,
    so the same value receives the same code in every frame regardless of row
    order. Columns with more distinct values are expanded with
    ``pd.get_dummies`` using the column name as prefix.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is not modified.
    columns : list
        Names of the categorical columns to encode.

    Returns
    -------
    pd.DataFrame
        The untouched columns first, followed by the encoded columns in the
        order given by ``columns``.

    Notes
    -----
    Dummy columns are derived from the values present in ``df``; frames that
    contain different sets of categories produce different columns.
    """
    # Columns that are not encoded are passed through unchanged.
    parts = [df.drop(columns = columns)]

    for column in columns:
        series = df[column]
        categories = series.unique()
        # Two or fewer distinct values: a single integer-coded column suffices.
        if len(categories) <= 2:
            mapping = {
                value: code
                for code, value in enumerate(_ordered_categories(categories))
            }
            # map() looks up every original value exactly once, so numeric
            # categories such as [1, 0] cannot overwrite each other.
            parts.append(series.map(mapping))
        else:  # otherwise expand into indicator columns
            parts.append(pd.get_dummies(series, prefix=column))

    # Join the untouched and the encoded columns in a single concat.
    return pd.concat(parts, axis=1)


def _ordered_categories(values) -> list:
    """Return ``values`` in a deterministic order: sorted present values, then missing ones."""
    present = [value for value in values if not pd.isna(value)]
    missing = [value for value in values if pd.isna(value)]
    try:
        present.sort()
    except TypeError:
        # Values of mixed, non-comparable types: order by their text form.
        present.sort(key=str)
    return present + missing

## 5. Predict Test Data

### Read and Prepare test data using your pipeline

In [11]:
# Load the test data. This rebinds `df`, replacing the prepared training frame.
df = pd.read_csv("satisfaction_test.csv")

In [12]:
# Preview the first rows of the test data.
df.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,86338,Female,Loyal Customer,31,Business travel,Business,1669,1,1,1,...,4,4,3,2,5,3,4,4,2,14.0
1,102382,Male,Loyal Customer,38,Business travel,Eco,397,5,4,4,...,5,5,4,5,4,3,2,5,0,0.0
2,123516,Female,Loyal Customer,69,Personal Travel,Eco,2296,3,5,4,...,5,3,3,4,3,4,3,3,0,0.0
3,88120,Male,Loyal Customer,64,Business travel,Business,406,1,1,1,...,4,5,5,5,5,3,5,3,23,17.0
4,6557,Male,Loyal Customer,47,Business travel,Business,2022,5,5,5,...,5,4,4,4,4,5,4,3,0,


In [13]:
# Columns imputed with a numeric statistic by fill_missing_values.
numerical_cols = [
    'Age', 'Flight Distance', 'Inflight wifi service',
    'Departure/Arrival time convenient', 'Ease of Online booking',
    'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
    'Inflight entertainment', 'On-board service', 'Leg room service',
    'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness',
    'Departure Delay in Minutes', 'Arrival Delay in Minutes',
]
# Categorical feature columns; this list does not include 'satisfaction'.
categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class']

# errors='ignore' keeps this cell re-runnable: a second run no longer raises
# KeyError once 'id' has already been removed.
df = df.drop(columns = ['id'], errors='ignore')

In [15]:
# Apply the same preparation steps to the test frame: impute missing values,
# then encode the categorical columns as numbers.
# NOTE(review): the fill values are computed from the test frame itself, not
# taken from the training data.
df[numerical_cols] = fill_missing_values( df[numerical_cols] )
df[categorical_cols] = fill_missing_values(df[categorical_cols], strategy='mode')
df = one_hot_encoder( df, categorical_cols )


In [16]:
# Accuracy of the fitted model on the held-out split of the training data.
score = model.score(X_test, y_test)
print("RANDOM FOREST:", score)

RANDOM FOREST: 0.9615625


### Make a prediction using your best model:

In [249]:
def predict(data, model_name):
    """Load a pickled model from disk and return its predictions for ``data``.

    Parameters
    ----------
    data
        Feature table passed unchanged to the model's ``predict`` method.
    model_name
        Path of the pickle file to load. If it does not name an existing file,
        the legacy default ``'Almazmodel.pkl'`` is loaded instead, which is what
        earlier versions of this function always did.

    Returns
    -------
    Whatever ``model.predict(data)`` returns.

    Raises
    ------
    FileNotFoundError
        If neither ``model_name`` nor the legacy default file exists.
    """
    names_existing_file = (
        isinstance(model_name, (str, os.PathLike)) and os.path.isfile(model_name)
    )
    model_path = model_name if names_existing_file else 'Almazmodel.pkl'

    # SECURITY: unpickling executes code stored in the file; only load model
    # files that come from a trusted source.
    with open(model_path, 'rb') as file:
        model = pickle.load(file)

    return model.predict(data)

In [17]:
# NOTE(review): despite its name, `y_test_pred` holds predictions for the
# training rows (X_train) - confirm whether X_test was intended.
y_test_pred = model.predict(X_train)

# Accuracy on the training split. A second model.predict(X_train) call whose
# result was discarded has been removed.
model.score(X_train , y_train)

1.0

In [20]:
# Predict on the prepared test frame and store the result as a one-column CSV.
y_pred = model.predict(df)

df_predictions = pd.Series(y_pred, name='my_pred').to_frame()
df_predictions.to_csv('ALMAZ AKZHOLTEV LAB3 PRED.csv', index=False)

### Save predictions as `YourName.csv` and submit csv file and this notebook in ecourse

HINT: Use `df.to_csv('YourName.csv', index=False)`

In [252]:
# Save the predictions (not the prepared feature frame `df`) as the final
# submission file, as the instructions above ask for.
df_predictions.to_csv('ALMAZ AKZHOLTEV FINAL.csv', index=False)