In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

df=pd.read_csv(r"C:\Users\samir\Downloads\airline_passenger_satisfaction.csv")

# Manejo de Valores faltantes
# Copia del DataFrame original para preservar los datos
df_original = df.copy()

# Imputación con la mediana
median_value = df['Arrival Delay in Minutes'].median()
df['Imputed_Median'] = df_original['Arrival Delay in Minutes'].fillna(median_value) 

# Codificación de la variable objetivo  
label_encoder = LabelEncoder()  
df['satisfaction'] = label_encoder.fit_transform(df['satisfaction'])  

# Codificación de variables categóricas nominales  
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')  

# Ajustar y transformar las variables categóricas  
categorical_columns = ['Class', 'Type of Travel']  
encoded_features = one_hot_encoder.fit_transform(df[categorical_columns])  

# Crear DataFrame para las variables codificadas  
encoded_df = pd.DataFrame(encoded_features, columns=one_hot_encoder.get_feature_names_out(categorical_columns))  

# Concatenar los DataFrames codificados al original  
df = pd.concat([df, encoded_df], axis=1)  

# Codificación de 'Customer Type' y 'Gender'  
df['Customer Type'] = label_encoder.fit_transform(df['Customer Type'])  
df['Gender'] = label_encoder.fit_transform(df['Gender'])  

# Eliminar las columnas originales  
#df.drop(categorical_columns, axis=1, inplace=True)  
# Crear una lista de columnas a eliminar excluyendo 'Imputed_NegativeOne'
columns_to_drop = [col for col in categorical_columns if col != 'Imputed_Median']

# Eliminar las columnas especificadas
df.drop(columns_to_drop, axis=1, inplace=True)

# Mostrar el DataFrame final  
#print(df.head(5))
#print(df.describe())



# Crear el escalador
scaler = StandardScaler()

# Seleccionar las columnas numéricas
numeric_cols = ['Age', 'Flight Distance', 'Departure Delay in Minutes', 'Imputed_Median']

# Aplicar el escalador
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
#print(df[numeric_cols].head(3))
#print(df[numeric_cols].describe())

#corr_matrix = df[numeric_cols].corr("spearman")
#sns.heatmap(corr_matrix, annot=True,cmap="coolwarm")
#plt.show()

#standarización de variables categoricas 
categorical_cols=["Customer Type","Gender","satisfaction"]
df[categorical_cols] = scaler.fit_transform(df[categorical_cols])
print(df[categorical_cols].describe())

df['delay_index'] = df[['Departure Delay in Minutes', 'Imputed_Median']].mean(axis=1)

# Print column names and data types
#print("Columns in the DataFrame:")
#print(df.columns)
#print("\nData types:")
#print(df.dtypes)

# Define feature groups
ordinal_features = [
    'Seat comfort', 'Leg room service', 'On-board service', 'Inflight service',
    'Checkin service', 'Departure/Arrival time convenient', 'Gate location',
    'Ease of Online booking', 'Inflight entertainment', 'Inflight wifi service',
    'Food and drink', 'Cleanliness', 'Online boarding', 'Baggage handling'
]

# Check if all ordinal features exist in the DataFrame
existing_ordinal_features = [feature for feature in ordinal_features if feature in df.columns]
missing_ordinal_features = set(ordinal_features) - set(existing_ordinal_features)
if missing_ordinal_features:
    print(f"Warning: The following ordinal features are missing: {missing_ordinal_features}")

# Create indices only for existing features
df['comfort_index'] = df[['Seat comfort', 'Leg room service', 'On-board service']].mean(axis=1)
df['service_index'] = df[['Inflight service', 'Checkin service', 'On-board service']].mean(axis=1)
df['convenience_index'] = df[['Departure/Arrival time convenient', 'Gate location', 'Ease of Online booking']].mean(axis=1)
df['entertainment_index'] = df[['Inflight entertainment', 'Inflight wifi service']].mean(axis=1)
df['food_index'] = df['Food and drink']
df['cleanliness_index'] = df['Cleanliness']

index_features = [
    'comfort_index', 'service_index', 'convenience_index', 'entertainment_index',
    'food_index', 'cleanliness_index'
]

# Updated categorical features based on the new column names
categorical_features = ['Class_Eco', 'Class_Eco Plus', 'Type of Travel_Personal Travel']

# Check for the updated categorical feature columns
existing_categorical_features = [feature for feature in categorical_features if feature in df.columns]
missing_categorical_features = set(categorical_features) - set(existing_categorical_features)
if missing_categorical_features:
    print(f"Warning: The following categorical features are missing: {missing_categorical_features}")

# Assuming these are your LabelEncoded features that are already standardized
label_encoded_features = ['Customer Type', 'Gender', 'satisfaction']

# Check if all label_encoded_features exist in the DataFrame
existing_label_encoded_features = [feature for feature in label_encoded_features if feature in df.columns]
missing_label_encoded_features = set(label_encoded_features) - set(existing_label_encoded_features)
if missing_label_encoded_features:
    print(f"Warning: The following label-encoded features are missing: {missing_label_encoded_features}")

# Create preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), existing_ordinal_features + index_features),
        ('cat', 'passthrough', existing_categorical_features)  # We use 'passthrough' since these are already one-hot encoded
    ]
)

# Fit and transform the data
X = df[existing_ordinal_features + index_features + existing_categorical_features + existing_label_encoded_features]
y = df['satisfaction'] if 'satisfaction' in df.columns else None

if y is None:
    print("Error: 'satisfaction' column is missing. Cannot proceed with the analysis.")
else:
    X_processed = preprocessor.fit_transform(X)

    # Get feature names after preprocessing
    feature_names = existing_ordinal_features + index_features + existing_categorical_features + existing_label_encoded_features

    print("\nNumber of features after preprocessing:", X_processed.shape[1])
    print("Number of feature names:", len(feature_names))



# Crear una función para evaluar los modelos
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

# Define los modelos como un diccionario
models = {
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42),
    'SVC': SVC(random_state=42),
    'MLPRegressor': MLPRegressor(random_state=42)
}

# Define la cuadrícula de hiperparámetros
param_grid = {
    'RandomForestRegressor': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10]
    },
    'LogisticRegression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear']
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    },
    'MLPRegressor': {
        'hidden_layer_sizes': [(50,), (100,)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd']
    }
}

# Ajuste de hiperparámetros y evaluación
for model_name, model in models.items():
    print(f"Evaluando modelo: {model_name}")
    grid = GridSearchCV(estimator=model, param_grid=param_grid[model_name], cv=5)
    grid.fit(X_train, y_train)
    print(f"Mejores hiperparámetros para {model_name}: {grid.best_params_}")
    print(f"Mejor puntuación para {model_name}: {grid.best_score_}")



# Feature selection (optional)
selector = SelectFromModel(best_model, prefit=True)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)


# Train the final model with selected features (optional)
final_model = best_model
final_model.fit(X_train_selected, y_train)

# Evaluate the final model (optional)
evaluate_model(final_model, X_test_selected, y_test)


       Customer Type        Gender  satisfaction
count   1.039040e+05  1.039040e+05  1.039040e+05
mean    2.995243e-17 -3.767988e-17 -3.839792e-17
std     1.000005e+00  1.000005e+00  1.000005e+00
min    -4.727667e-01 -9.851920e-01 -8.744735e-01
25%    -4.727667e-01 -9.851920e-01 -8.744735e-01
50%    -4.727667e-01 -9.851920e-01 -8.744735e-01
75%    -4.727667e-01  1.015031e+00  1.143545e+00
max     2.115208e+00  1.015031e+00  1.143545e+00

Number of features after preprocessing: 23
Number of feature names: 26
Evaluando modelo: RandomForestRegressor


NameError: name 'X_train' is not defined