In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

# Feature engineering + train/test split.
#
# Assuming 'df' is your original dataframe (it must already exist when this
# cell runs; it is not created here).
# NOTE: this cell mutates `df` in place (adds index/interaction columns and
# overwrites the raw rating columns with standardized values). Re-running it
# without first reloading `df` would rebuild the indices from already
# standardized columns, so run it once per freshly loaded `df`.

# Create composite indices as fixed-weight sums of the raw rating columns
# (the weights in each multi-column index add up to 1). These are row-wise
# computations, so doing them before the split cannot leak information.
df['comfort_index'] = (df['Seat comfort']*0.4 + df['Leg room service']*0.3 + df['On-board service']*0.3)
df['service_index'] = (df['Inflight service']*0.5 + df['Checkin service']*0.3 + df['On-board service']*0.2)
df['convenience_index'] = (df['Departure/Arrival time convenient']*0.4 + df['Gate location']*0.3 + df['Ease of Online booking']*0.3)
df['entertainment_index'] = (df['Inflight entertainment']*0.6 + df['Inflight wifi service']*0.4)
df['food_index'] = df['Food and drink']
df['cleanliness_index'] = df['Cleanliness']

# List of all features to standardize
features_to_standardize = [
    'comfort_index', 'service_index', 'convenience_index', 'entertainment_index',
    'food_index', 'cleanliness_index', 'delay_index',
    'Seat comfort', 'Leg room service', 'On-board service', 'Inflight service',
    'Checkin service', 'Departure/Arrival time convenient', 'Gate location',
    'Ease of Online booking', 'Inflight entertainment', 'Inflight wifi service',
    'Food and drink', 'Cleanliness', 'Online boarding', 'Baggage handling'
]

# Fail early with a readable message if a listed column is absent.
# 'delay_index' in particular is not created in this cell, so it has to come
# from an earlier step.
missing_features = [col for col in features_to_standardize if col not in df.columns]
if missing_features:
    raise KeyError(
        f"Columns required for standardization are missing from df: {missing_features}"
    )

# Decide which rows are train/test BEFORE fitting the scaler, so that the
# scaling statistics (mean/std) are learned from training rows only and no
# test-set information leaks into the features.
row_positions = np.arange(len(df))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

# Standardize features: fit on the training rows, then apply the same
# transformation to every row.
scaler = StandardScaler()
scaler.fit(df[features_to_standardize].iloc[train_pos])
df[features_to_standardize] = scaler.transform(df[features_to_standardize])

# Create interaction terms (products of the standardized columns)
df['comfort_service_interaction'] = df['comfort_index'] * df['service_index']
df['entertainment_wifi_interaction'] = df['entertainment_index'] * df['Inflight wifi service']

# Polynomial features for key indices
df['comfort_index_squared'] = df['comfort_index']**2
df['service_index_squared'] = df['service_index']**2

# Combine all features for the final dataset
final_features = features_to_standardize + [
    'comfort_service_interaction', 'entertainment_wifi_interaction',
    'comfort_index_squared', 'service_index_squared'
] + [col for col in df.columns if col.startswith(('Class_', 'Type_'))]  # One-hot encoded columns

# Create the final dataset
X = df[final_features]
y = df['satisfaction']  # Assuming 'satisfaction' is your target variable

# Split the data into training and testing sets using the row positions
# chosen above (positional indexing, so it does not depend on df's index).
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

print("Shape of training set:", X_train.shape)
print("Shape of testing set:", X_test.shape)

# Feature importance analysis using a Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance

# Fit a 100-tree forest on the training split (seeded for reproducibility).
# NOTE(review): a regressor is used for 'satisfaction'; if that target is a
# class label, a classifier may be the better fit -- confirm against the data.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Permutation importance is measured on the held-out split: each feature is
# shuffled 10 times and the mean importance over the repeats is kept below.
perm_importance = permutation_importance(
    rf_model,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
)

# One row per feature, ordered from most to least important.
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': perm_importance.importances_mean}
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("\nTop 10 most important features:")
print(feature_importance.head(10))

In [None]:
Shape of training set: (83123, 27)
Shape of testing set: (20781, 27)

Top 10 most important features:
                   feature  importance
16   Inflight wifi service    0.709679
25               Class_Eco    0.289279
19         Online boarding    0.177271
13           Gate location    0.094272
14  Ease of Online booking    0.088922
15  Inflight entertainment    0.076277
9         On-board service    0.075628
2        convenience_index    0.063505
8         Leg room service    0.062677
10        Inflight service    0.062672