In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
#pd.set_option("display.max.rows",None)
pd.set_option("display.max.columns",None)

import pickle
from pprint import pprint
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
#from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier


from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, MinMaxScaler, FunctionTransformer, PowerTransformer
from sklearn.compose import make_column_transformer, ColumnTransformer



In [2]:
# Helper that makes basic exploration of the dataset easier
def summary(df):
    """Print the dataset dimensions and return a per-column overview.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of ``df`` with, for each column:
        ``Unique`` (number of distinct non-null values), ``Missing``
        (null count), ``Missing %`` (null count as a percentage of the rows,
        rounded to 2 decimals), ``Duplicated`` (number of fully duplicated
        rows in ``df``; the same value is repeated on every line) and
        ``Types`` (column dtype).
    """
    n_rows, n_cols = df.shape
    # Compute the null counts once and reuse them for both missing-value columns.
    missing = df.isnull().sum()
    print(f"Dataset has {n_cols} features and {n_rows} rows.")

    # Named `report` so the local does not shadow the function itself.
    report = pd.DataFrame(index=df.columns)
    report["Unique"] = df.nunique().values
    report["Missing"] = missing.values
    report["Missing %"] = ((missing / n_rows) * 100).round(2)
    report["Duplicated"] = df.duplicated().sum()
    report["Types"] = df.dtypes
    return report

In [3]:
# Load the airline passenger satisfaction dataset into a DataFrame
df =pd.read_csv('airline_passenger_satisfaction.csv')

In [4]:
# (rows, columns) of the raw dataset
df.shape


(103904, 25)

In [5]:
# Column labels of the raw dataset
df.columns


Index(['Unnamed: 0', 'id', 'Gender', 'Customer Type', 'Age', 'Type of Travel',
       'Class', 'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')

In [6]:
# Per-column overview (unique values, missing values, duplicates, dtypes) of the raw data
summary(df)

Dataset has 25 features and 103904 rows.


Unnamed: 0,Unique,Missing,Missing %,Duplicated,Types
Unnamed: 0,103904,0,0.0,0,int64
id,103904,0,0.0,0,int64
Gender,2,0,0.0,0,object
Customer Type,2,0,0.0,0,object
Age,75,0,0.0,0,int64
Type of Travel,2,0,0.0,0,object
Class,3,0,0.0,0,object
Flight Distance,3802,0,0.0,0,int64
Inflight wifi service,6,0,0.0,0,int64
Departure/Arrival time convenient,6,0,0.0,0,int64


In [7]:
# Peek at the first 10 rows
df.head(10)


Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,5,3,5,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,1,3,1,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,5,5,5,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,2,2,2,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,4,5,5,3,3,4,4,3,3,3,0,0.0,satisfied
5,5,111157,Female,Loyal Customer,26,Personal Travel,Eco,1180,3,4,2,1,1,2,1,1,3,4,4,4,4,1,0,0.0,neutral or dissatisfied
6,6,82113,Male,Loyal Customer,47,Personal Travel,Eco,1276,2,4,2,3,2,2,2,2,3,3,4,3,5,2,9,23.0,neutral or dissatisfied
7,7,96462,Female,Loyal Customer,52,Business travel,Business,2035,4,3,4,4,5,5,5,5,5,5,5,4,5,4,4,0.0,satisfied
8,8,79485,Female,Loyal Customer,41,Business travel,Business,853,1,2,2,2,4,3,3,1,1,2,1,4,1,2,0,0.0,neutral or dissatisfied
9,9,65725,Male,disloyal Customer,20,Business travel,Eco,1061,3,3,3,4,2,3,3,2,2,3,4,4,3,2,0,0.0,neutral or dissatisfied


**Observaciones**
- Las columnas `Unnamed: 0` e `id` no parecen aportar información y se podrían eliminar. Además, tienen una cardinalidad muy alta.
- La columna `Baggage handling` tiene 5 valores en vez de 6 como todas las demás: empieza en 1 en vez de 0.
- `Arrival Delay in Minutes` es la única columna de tipo flotante; conviene pasarla a int.
- `Arrival Delay in Minutes` es la única que tiene missing values (310, un 0.3 %). Son muy pocos, así que los imputamos.

In [8]:
# Number of rows where 'Baggage handling' is rated 0
(df['Baggage handling'] == 0).sum()

0

# Exploratory Data Analysis (EDA) #

## Target Variable ##

In [None]:
# Bar chart with the number of rows per value of the target column
plt.figure(figsize=(8,4))
sns.countplot(data=df, x='satisfaction')

In [None]:
# Number of rows labelled 'neutral or dissatisfied'
(df['satisfaction'] == 'neutral or dissatisfied').sum()

In [None]:
# Number of rows labelled 'satisfied'
(df['satisfaction'] == 'satisfied').sum()

**La Variable objetivo está desbalanceada. De momento lo voy a dejar así.**

In [None]:
# Categorical and numeric column names
cat_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class']
# select_dtypes returns a DataFrame; keep only its column labels so that
# num_cols is a list of names, like cat_cols.
num_cols = df.select_dtypes(include=['number']).columns.tolist()
df.describe()

## Age 

In [None]:
# Descriptive statistics of the 'Age' column
df['Age'].describe()

In [None]:
# Histogram of 'Age', coloured by the value of the target column
plt.figure(figsize=(16,6))
sns.histplot(data=df, x="Age",  hue='satisfaction')
plt.title('Age Feature Distribution')

### Columna `Customer Type`: hay que cambiar el valor `disloyal Customer` por `Disloyal Customer`

In [9]:
# Capitalise the label: only rows equal to 'disloyal Customer' are rewritten
is_disloyal = df['Customer Type'].eq('disloyal Customer')
df.loc[is_disloyal, 'Customer Type'] = 'Disloyal Customer'

In [10]:
# Check the relabelled 'Customer Type' values on the first rows
df.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,5,3,5,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,Disloyal Customer,25,Business travel,Business,235,3,2,3,3,1,3,1,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,5,5,5,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,2,2,2,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,4,5,5,3,3,4,4,3,3,3,0,0.0,satisfied


#### Hay que hacer algo con la distribución de la edad.

# ****** Para probar Streamlit sólo voy a dejar 4 columnas ***********

In [11]:
# The four input columns kept for the Streamlit trial
features = ["Inflight wifi service", "Food and drink", "Customer Type", "Gender"] 

In [12]:
# List every column currently in df.
# NOTE(review): `col_drops` is not referenced again in the visible notebook and is
# easy to confuse with `cols_drop` used by the drop cell below.
col_drops= df.columns
col_drops

Index(['Unnamed: 0', 'id', 'Gender', 'Customer Type', 'Age', 'Type of Travel',
       'Class', 'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')

## Pre-procesamiento ##

### Imputamos missing values ###

Elimino `Unnamed: 0`, `id` y el resto de columnas que no se usan en la prueba de Streamlit.

In [13]:
# Drop every column except the four demo features and the target
cols_drop = ['Unnamed: 0', 'id', 'Age', 'Type of Travel', 'Class', 'Flight Distance','Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Online boarding', 'Seat comfort', 'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
# Reassign instead of using inplace=True, and tolerate columns that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=cols_drop, errors="ignore")


In [14]:
# Confirm that only the kept columns remain
df.head()

Unnamed: 0,Gender,Customer Type,Inflight wifi service,Food and drink,satisfaction
0,Male,Loyal Customer,3,5,neutral or dissatisfied
1,Male,Disloyal Customer,3,1,neutral or dissatisfied
2,Female,Loyal Customer,2,5,satisfied
3,Female,Loyal Customer,2,2,neutral or dissatisfied
4,Male,Loyal Customer,3,4,satisfied


In [None]:
# Per-column overview of the reduced frame
summary(df)

In [None]:
# Categorical feature columns of the reduced frame
cat_cols = ['Customer Type', 'Gender'] 

In [None]:
# Numeric feature columns of the reduced frame
num_cols = ['Inflight wifi service', 'Food and drink']
num_cols

In [None]:
# Imputers applied by fill_missingno to the columns in cat_cols / num_cols
imputer1 = SimpleImputer(strategy="most_frequent")     # categorical features: fill with the most frequent value
imputer2 = SimpleImputer(strategy="mean")     # numerical features: fill with the column mean

In [None]:
def fill_missingno(df):
    """Impute missing values of ``df`` in place; returns ``None``.

    The columns in the global ``cat_cols`` are filled by the global
    ``imputer1`` and those in ``num_cols`` by ``imputer2``. Each imputer is
    re-fitted on ``df`` itself on every call.
    """
    for imputer, columns in ((imputer1, cat_cols), (imputer2, num_cols)):
        df[columns] = imputer.fit_transform(df[columns])

In [None]:
# Impute the categorical and numeric columns of df in place
fill_missingno(df)

In [None]:
# Overview after imputation
summary(df)

In [None]:
# Remaining null count per column
df.isnull().sum()

In [None]:
# Convert the categorical columns to booleans (currently disabled)
#df[cat_cols] = df[cat_cols].astype(bool)

In [None]:
# Overview of the frame that goes into modelling
summary(df)

# Defino una función para evaluar los modelos

In [None]:
## Single shared helper to train and evaluate a model
def model_prediction(model):
    """Fit ``model`` on the training split and report its accuracy.

    The estimator is chained after a fresh clone of the notebook's
    ``preprocessor`` and trained on the raw ``X_train`` frame. The earlier
    version read ``X_train_transformed`` / ``X_test_transformed``, which are
    never created in this notebook, so every call raised ``NameError``.

    Relies on these notebook globals existing when the function is called:
    ``preprocessor``, ``X_train``, ``X_test``, ``y_train``, ``y_test``,
    ``training_score`` and ``testing_score``.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn classifier; it is fitted in place.

    Returns
    -------
    array-like
        Predicted labels for ``X_test``.

    Side effects
    ------------
    Appends the train/test accuracy (in percent) to ``training_score`` and
    ``testing_score``, prints both values and shows the test confusion matrix.
    """
    from sklearn.base import clone  # local import: only this helper needs it

    # Clone the preprocessor so fitting here never alters the instance that is
    # already part of the notebook's main `pipeline`.
    estimator = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('model', model),
    ])
    estimator.fit(X_train, y_train)

    x_train_pred = estimator.predict(X_train)
    x_test_pred = estimator.predict(X_test)
    a = accuracy_score(y_train, x_train_pred) * 100
    b = accuracy_score(y_test, x_test_pred) * 100

    training_score.append(a)
    testing_score.append(b)

    print(f"Accuracy_Score of {model} model on Training Data is:",a)
    print(f"Accuracy_Score of {model} model on Testing Data is:",b)
    print("\n------------------------------------------------------------------------")
    # Precision / recall / F1 reporting was disabled in the original cell.
    print("\n------------------------------------------------------------------------")
    print(f"Confusion Matrix of {model} model is:")
    cm = confusion_matrix(y_test, x_test_pred)
    plt.figure(figsize=(8,4))
    sns.heatmap(cm,annot=True,fmt="g",cmap="summer")
    plt.show()

    return x_test_pred
    

## Testing Pipeline y Pickle

In [None]:
# Separate the target (dependent variable) from the independent variables
df_f = pd.DataFrame(df)
X, y = df_f.drop(columns="satisfaction"), df_f["satisfaction"]

In [None]:
# Split into training and test sets: 20% of the rows are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Numeric and categorical feature columns handed to the preprocessing pipelines
num_cols = ['Inflight wifi service', 'Food and drink']
cat_cols = ['Customer Type', 'Gender'] 

In [None]:

# Preprocessing pipelines, one per column type
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="mean")),
    ('scaler', MinMaxScaler())
    ])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
    # keyword was removed in 1.4, where it raises a TypeError.
    ('encoder', OneHotEncoder(sparse_output=False)),
    ])

In [None]:
# Preprocessing step: route each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ('numeric', num_transformer, num_cols),
    ('categorical', cat_transformer, cat_cols),
])

In [None]:
# Final estimator: L1-penalised logistic regression using the liblinear solver
model = LogisticRegression(penalty='l1', solver='liblinear')

# Full pipeline: preprocessing followed by the classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('logistic', model),
])

In [None]:
# Fit the pipeline (preprocessing + model) on the training data
pipeline.fit(X_train, y_train)

In [None]:
# Get the model's predictions on the training data
y_train_pred = pipeline.predict(X_train)

In [None]:
# Save the pipeline to disk with pickle
with open('data_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

# Load the pipeline back from the file.
# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
with open('data_pipeline.pkl', 'rb') as file:
    loaded_pipeline = pickle.load(file)

In [None]:
       
# Evaluate the pipeline that was fitted on the training split against the
# held-out test split. Do NOT call fit() here: re-fitting on X_test discards the
# trained model and turns the test accuracy into an in-sample score.
pipeline.score(X_test, y_test)

In [None]:
# Get the model's predictions on the test data
y_test_pred = pipeline.predict(X_test)

# *************************** Datos de prueba finales *************************************

# Aplicar el pipeline a los datos de prueba finales
#X_final_transformed = loaded_pipeline.transform(df_test)

In [None]:
# Show the predicted labels for the test split
y_test_pred

# Probando los modelos

In [None]:
# Running lists of train/test accuracy (in %), appended to by each evaluation below
training_score = []
testing_score = []

In [None]:
# Accuracy (in %) of the pipeline's stored train and test predictions
a = accuracy_score(y_train,y_train_pred)*100
b = accuracy_score(y_test,y_test_pred)*100
    
# Record both scores in the running lists
training_score.append(a)
testing_score.append(b)
    
print(f"Accuracy_Score of {model} model on Training Data is:",a)
print(f"Accuracy_Score of {model} model on Testing Data is:",b)

## 1. Logistic-Regression Model


In [None]:
# Logistic regression with the same solver and penalty as the pipeline's model
model_prediction(LogisticRegression(solver= 'liblinear',penalty='l1'))

In [None]:
# KNeighborsClassifier with its default constructor arguments
model_prediction(KNeighborsClassifier())

In [None]:
# SVC
#model_prediction(SVC())

In [None]:
# AdaBoost with 200 estimators and a fixed seed
model_prediction(AdaBoostClassifier(n_estimators=200,  random_state=1))