In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
import dtale
#import streamlit as st
import mitosheet
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# modelos
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.pipeline import Pipeline
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_absolute_error as mae
import scikitplot as skplt
# from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
# from catboost import CatBoostClassifier
# from lightgbm import LGBMClassifier

In [37]:
# IMPORTAMOS EL DATASET
df = pd.read_csv('/Users/gala/Desktop/Airlines/Aerolinea_equipo_4_Gala/Data/airline_passenger_satisfaction.csv')

# HEAD PARA VER LAS PRIMERAS FILAS
df.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [38]:
# VEMOS EL TAMAÑO DEL DATASET
df.shape

(103904, 25)

In [39]:
# LA CANTIDAD DE NULOS POR COLUMNA
df.isna().sum()

Unnamed: 0                             0
id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             310
satisfaction    

In [40]:
df['Arrival Delay in Minutes'].fillna(df['Departure Delay in Minutes'], inplace=True)

In [41]:
df.isna().sum()

Unnamed: 0                           0
id                                   0
Gender                               0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Inflight wifi service                0
Departure/Arrival time convenient    0
Ease of Online booking               0
Gate location                        0
Food and drink                       0
Online boarding                      0
Seat comfort                         0
Inflight entertainment               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Inflight service                     0
Cleanliness                          0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
satisfaction                         0
dtype: int64

In [42]:
# Eliminamos las columnas que no vamos a usar
df.drop(['Unnamed: 0','id'],axis=1,inplace=True)

In [43]:
# Dividimos las columnas en categoricas y numericas
cat = ["Gender", "Customer Type", 'Type of Travel', 'Class', "Inflight wifi service", "Departure/Arrival time convenient", 'Ease of Online booking', 'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort', 'Inflight entertainment', 'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness']
#train_cat_col = df_train.select_dtypes(include = "object").columns
num = ["Age", "Flight Distance", "Departure Delay in Minutes", "Arrival Delay in Minutes"]

In [44]:
# Define un mapeo de valores numéricos a las categorías
satisfaction_mapping = {'neutral or dissatisfied': 0, 'satisfied': 1}

# Aplica el mapeo a la columna 'satisfaction'
df['satisfaction'] = df['satisfaction'].map(satisfaction_mapping)

# Asegúrate de que ahora los valores sean numéricos
df['satisfaction'] = df['satisfaction'].astype(int)

In [45]:
df.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,...,5,4,3,4,4,5,5,25,18.0,0
1,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,...,1,1,5,3,1,4,1,1,6.0,0
2,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,...,5,4,3,4,4,4,5,0,0.0,1
3,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,...,2,2,5,3,1,4,2,11,9.0,0
4,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,...,3,3,4,4,3,3,3,0,0.0,1


In [46]:
# Scale numerical data to have mean=0 and variance=1
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# One-hot encode categorical data
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore',sparse=False))])
#categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))])

In [47]:
# Combine preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num),
        ('cat', categorical_transformer, cat)],
        remainder='passthrough')
    


In [53]:
features = df.drop('satisfaction', axis=1)
target = df['satisfaction']


In [54]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [55]:
# Apply preprocessing
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Print new shape
print('Training set shape:', X_train.shape)


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.



Training set shape: (83123, 96)


#Получите имена колонок после трансформации
feature_names = preprocessor.get_feature_names_out()

#Выведите имена колонок
print('Feature names after transformation:', feature_names)


In [57]:
# Modelos

# testing with some of the regression models
models = [ # RandomForestClassifier(max_depth=2, random_state=0),                                                                 
           GradientBoostingClassifier(learning_rate=0.25, loss='exponential', max_depth=5, min_samples_split=4, n_estimators=50)
        #  LogisticRegression(),
        #  KNeighborsClassifier(),
        #  SVC(),
        #  XGBClassifier(),
        #  GaussianNB(),
        #  CatBoostClassifier(),
         ]


from pandas.io.parsers.readers import MANDATORY_DIALECT_ATTRS
#Cross validation
scores = {}

for i, model in enumerate(models):
  model.fit(X_train, y_train)
  model_scores = cross_val_score(model, X_train, y_train.values.ravel(), cv=5)
  scores[f"model{i}"] = model_scores

for model_name, model_scores in scores.items():
  print (f"{model_name}:{model_scores}")
# np.random.seed(1)
# gbc = GradientBoostingClassifier()
# hyperparams = {
#     'n_estimators': [ 50, 100, 150], # 4 variants with diff num of trees
#     'max_depth': [2, 4, 5], # depth of each tree, Nonr means that model can choose by default
#     "min_samples_split": [4, 5],
#     "learning_rate": [0.09, 0.05, 0.25],
#     "loss": ['log_loss', 'exponential'],
#     'random_state': [0, 42]
# }
# cross_val = GridSearchCV(gbc, hyperparams, cv=5) #GridSearchCV module which used to adjust hyperparameters
# cross_val.fit(X_train, y_train.values.ravel())


#For evaluation and Define a prediction function for the given model
for mdl in models:
  y_pred = mdl.predict(X_val) #Define a prediction function for the given model
  accuracy = round(accuracy_score(y_val, y_pred), 3)
  precision = round(precision_score(y_val, y_pred), 3)
  recall = round(recall_score(y_val, y_pred), 3)

  print('Max Depth: {}  Estimators: {}  Accuracy: {}  Precision: {}  Recall: {}'.format(mdl.max_depth,
                                                                                                mdl.n_estimators,
                                                                                                accuracy,
                                                                                                precision,
                                                                                                recall))

KeyboardInterrupt: 