In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Passenger-satisfaction model comparison.
#
# Loads the cleaned passenger survey CSV, imputes missing values, label-encodes
# the listed feature columns and the 'satisfaction' target, makes an 80/20
# train/test split, standardizes the features, then fits five classifiers and
# prints accuracy plus a classification report for each one.

# File path
file_path = r"C:\Users\ashwi\GUVI_Projects\Flight Project\Passenger_Satisfaction_Cleaned.csv"  # Update with your file path

# One seed shared by the split and by every stochastic estimator, so the
# printed metrics are reproducible from run to run.
RANDOM_STATE = 42

# Load the dataset
df = pd.read_csv(file_path)

# Separate numeric and categorical columns
numeric_cols = df.select_dtypes(include=['number']).columns
categorical_cols = df.select_dtypes(exclude=['number']).columns

# Fill missing values for numeric columns with the mean
# NOTE(review): the means are computed over the full dataset, i.e. before the
# train/test split below, so test rows contribute to the imputed values.
# Consider imputing with training-set statistics only if the data really
# contains missing values -- TODO confirm.
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Fill missing values for categorical columns with 'Unknown'
df[categorical_cols] = df[categorical_cols].fillna('Unknown')

# Feature columns to pass through Label Encoding
label_cols = ['Inflight wifi service', 'Departure/Arrival time convenient', 'Ease of Online booking', 
              'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort', 'Inflight entertainment',
              'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Inflight service', 
              'Cleanliness', 'Gender_Female', 'Gender_Male', 'Customer Type_Loyal Customer', 'Customer Type_disloyal Customer',
              'Type of Travel_Business travel', 'Type of Travel_Personal Travel', 'Class_Business', 'Class_Eco', 'Class_Eco Plus']

# Fit a dedicated encoder per column and keep it, so every column's
# value -> code mapping stays available (classes_ / inverse_transform)
# instead of being overwritten by the next fit.
label_encoders = {}
for col in label_cols:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Encode the target column 'satisfaction'. `le` is left fitted on the target,
# so le.classes_ tells which original label each integer code stands for.
le = LabelEncoder()
df['satisfaction'] = le.fit_transform(df['satisfaction'])

# Feature columns and target column
X = df.drop(columns=['satisfaction'])  # Features
y = df['satisfaction']  # Target

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize models. The two tree ensembles are randomized, so they get the
# shared seed; the remaining estimators are left at their defaults.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
}

# Train and evaluate each model
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    print(f"Model: {model_name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("-" * 50)


Model: Logistic Regression
Accuracy: 0.8767624272171696

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.91      0.89     11713
           1       0.87      0.84      0.86      9068

    accuracy                           0.88     20781
   macro avg       0.88      0.87      0.87     20781
weighted avg       0.88      0.88      0.88     20781

--------------------------------------------------
Model: Random Forest
Accuracy: 0.9629950435493961

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     11713
           1       0.97      0.94      0.96      9068

    accuracy                           0.96     20781
   macro avg       0.96      0.96      0.96     20781
weighted avg       0.96      0.96      0.96     20781

--------------------------------------------------
Model: Support Vector Machine
Accuracy: 0.9537558346566575

Classification Report:
       