<a href="https://colab.research.google.com/github/G2454/UTFPR-IA-25.2/blob/main/Projeto_Final_Airline_Passenger_Satisfaction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Base de dados : Airline Passenger Satisfaction

- https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction
- Classe: Satisfaction (Satisfaction, neutral or dissatisfaction)



In [1]:
import numpy as np
import pandas as pd

# Modelo machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# Validação Cruzada
from sklearn.model_selection import (
    KFold,
    LeaveOneOut,
    StratifiedKFold,
    cross_validate
)

# Métricas
from sklearn.metrics import (recall_score,
                             accuracy_score,
                             precision_score,
                             f1_score)
from sklearn.metrics import classification_report

In [2]:
def carregaBaseDados(nome):
  return pd.read_csv(nome)

# Pré processamento


In [4]:
# Lista das colunas numéricas

def getNumericColumns(dataframe):

  return dataframe.drop('satisfaction', axis=1).select_dtypes(include=['int64','float64']).columns.tolist()



# Limpeza de dados faltantes

def trataDadosFaltantes(dataframe, column):

   dataframe[column].fillna(dataframe[column].mean(), inplace = True)



# Remoção atributos irrelevantes e transformação dados categóricos

def preProcessamento(dataframe, rem_cols, transform_column):



  # Remoção de columnas irrelevantes

  dataframe.drop(rem_cols, axis=1, inplace=True)



  # Transforma os dados da classe satisfaction

  labelencoder = LabelEncoder()

  for col in transform_column:

      dataframe[col] = labelencoder.fit_transform(dataframe[col])



# Separa atributos da classe (X,y)

def separaClasse(dataframe, classe):

  X = dataframe.drop(classe, axis=1)

  y = dataframe[classe]

  return X,y



### Realizando testes com os dados:

# Gera o dataframe

df = carregaBaseDados('test.csv')



# Substituição de dados faltantes

trataDadosFaltantes(df, 'Arrival Delay in Minutes')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe[column].fillna(dataframe[column].mean(), inplace = True)


In [6]:

# Aplica pré-processamento

# Informar os atributos categóricos na ordem em que eles aparecem na figura 1 (cima para baixo)

# Identify categorical columns to transform. Exclude numerical ones and 'Unnamed: 0', 'id' already removed.
categorical_cols_to_encode = ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'satisfaction']

preProcessamento(df, ['Unnamed: 0', 'id'], categorical_cols_to_encode)


# Seleciona as colunas numéricas
numeric = getNumericColumns(df)

# Normalização dos dados
std=StandardScaler()
df[numeric] = std.fit_transform(df[numeric])


# Separa atributos e classe
X, y = separaClasse(df,'satisfaction')

# Seleção automática dos hiperparâmetros com GridSearchCV
DT = DecisionTreeClassifier()
params = {'criterion': ['gini', 'entropy', 'log_loss'],
              'splitter': ['best', 'random'],
              'max_features': [None, 1.0, 'sqrt', 'log2']
              }


# Gera objeto GridSearchCV
from sklearn.model_selection import GridSearchCV
g_search = GridSearchCV(estimator = DT, param_grid = params,
                        cv = 10)


# Treina o conjunto de hiperparâmetros
g_search.fit(X, y)


#Apresenta o melhor conjunto de hiperparâmetros
print(g_search.best_params_)


#Apresenta o melhor score (acurácia)
print(g_search.best_score_)

{'criterion': 'entropy', 'max_features': None, 'splitter': 'best'}
0.9398675204972399
