## Load, Explore and Summarize

In [None]:
import pandas as pd
import numpy as np

# Load the train, test, and sample submission datasets.
# NOTE(review): these are absolute paths under /content/drive/MyDrive -- presumably a
# mounted Google Drive in Colab; confirm the mount before running.
train_path = '/content/drive/MyDrive/Code/DS_1101/Fly High With FDS/train.csv'
test_path = '/content/drive/MyDrive/Code/DS_1101/Fly High With FDS/test.csv'
submission_path = '/content/drive/MyDrive/Code/DS_1101/Fly High With FDS/sample_submission.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission_df = pd.read_csv(submission_path)

# DataFrame.info() prints its summary and returns None, so it is called only for
# its printed output; storing the result would just store None.
train_df.info()
test_df.info()

# First few rows of each dataset
train_head = train_df.head()
test_head = test_df.head()
sample_submission_head = sample_submission_df.head()

# display() renders each frame with its rich repr instead of one text tuple
display(train_head, test_head, sample_submission_head)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116892 entries, 0 to 116891
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Id                                 116892 non-null  int64  
 1   satisfaction                       116892 non-null  object 
 2   Gender                             116892 non-null  object 
 3   Customer Type                      116892 non-null  object 
 4   Age                                116892 non-null  int64  
 5   Type of Travel                     116892 non-null  object 
 6   Class                              116892 non-null  object 
 7   Flight Distance                    116892 non-null  int64  
 8   Seat comfort                       116892 non-null  int64  
 9   Departure/Arrival time convenient  116892 non-null  int64  
 10  Food and drink                     116892 non-null  int64  
 11  Gate location                      1168

(None,
        Id  satisfaction  Gender   Customer Type  Age   Type of Travel  \
 0   86347     satisfied    Male  Loyal Customer   50  Business travel   
 1  115822     satisfied    Male  Loyal Customer   51  Business travel   
 2   16351  dissatisfied    Male  Loyal Customer   14  Personal Travel   
 3  107284     satisfied  Female  Loyal Customer   52  Business travel   
 4    5788  dissatisfied  Female  Loyal Customer   26  Personal Travel   
 
       Class  Flight Distance  Seat comfort  Departure/Arrival time convenient  \
 0  Business             1548             5                                  5   
 1  Business             4020             5                                  5   
 2       Eco             2328             2                                  5   
 3  Business             3761             2                                  2   
 4       Eco             3348             1                                  5   
 
    ...  Online support  Ease of Online booking  On-b

In [None]:
# Show every column name on one line so the list is not truncated
print(list(train_df.columns))

['Id', 'satisfaction', 'Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class', 'Flight Distance', 'Seat comfort', 'Departure/Arrival time convenient', 'Food and drink', 'Gate location', 'Inflight wifi service', 'Inflight entertainment', 'Online support', 'Ease of Online booking', 'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Cleanliness', 'Online boarding', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']


##Preprocessing and Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Drop the ID column (an identifier, not a feature) from both train and test
train_df_clean = train_df.drop(columns=['Id'])
test_df_clean = test_df.drop(columns=['Id'])

# Impute missing 'Arrival Delay in Minutes' with the training median.
# The imputer returns an (n_samples, 1) array, so flatten it before assigning
# it to a single column.
delay_col = 'Arrival Delay in Minutes'
imputer = SimpleImputer(strategy='median')
train_df_clean[delay_col] = imputer.fit_transform(train_df_clean[[delay_col]]).ravel()
test_df_clean[delay_col] = imputer.transform(test_df_clean[[delay_col]]).ravel()

# Encode categorical variables using LabelEncoder.
# Each encoder is fitted on the training column only and reused for the test
# column; the fitted encoders are kept in `label_encoders`, keyed by column name.
categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class']

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_df_clean[col] = le.fit_transform(train_df_clean[col])
    test_df_clean[col] = le.transform(test_df_clean[col])
    label_encoders[col] = le

# Encode the target variable 'satisfaction'.
# Series.map turns any label missing from the mapping into NaN, so fail loudly
# on unexpected labels instead of training on a partly-NaN target.
target_map = {'satisfied': 1, 'dissatisfied': 0}
unexpected_labels = set(train_df_clean['satisfaction'].unique()) - set(target_map)
if unexpected_labels:
    raise ValueError(
        f"Unexpected 'satisfaction' labels: {sorted(map(str, unexpected_labels))}"
    )
train_df_clean['satisfaction'] = train_df_clean['satisfaction'].map(target_map)

X = train_df_clean.drop(columns=['satisfaction'])
y = train_df_clean['satisfaction']

# Stratified 80/20 split so both parts keep the same class balance
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

display(X_train.head(), y_train.head())

(       Gender  Customer Type  Age  Type of Travel  Class  Flight Distance  \
 94821       1              1   31               0      0             2702   
 83138       1              0   49               0      0             2301   
 7527        1              0   24               1      1             1215   
 15564       0              0   53               0      1              238   
 42362       1              0   26               0      0             3123   
 
        Seat comfort  Departure/Arrival time convenient  Food and drink  \
 94821             2                                  2               2   
 83138             2                                  2               2   
 7527              4                                  5               4   
 15564             5                                  2               2   
 42362             4                                  4               2   
 
        Gate location  ...  Online support  Ease of Online booking  \
 94821  

##Basic Random Forest Classifier Model Evaluation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Baseline: a 100-tree random forest with a fixed seed
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score the hold-out validation split
y_val_pred = rf_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_classification_report = classification_report(y_val, y_val_pred)

# classification_report returns a multi-line string; print it so the table is
# rendered line by line rather than as an escaped repr inside a tuple.
print(f"Validation Accuracy: {val_accuracy}")
print(val_classification_report)

(0.9586380940159973,
 '              precision    recall  f1-score   support\n\n           0       0.94      0.97      0.95     10585\n           1       0.97      0.95      0.96     12794\n\n    accuracy                           0.96     23379\n   macro avg       0.96      0.96      0.96     23379\nweighted avg       0.96      0.96      0.96     23379\n')

First attempt with Basic RFC 5/10/24; score - 0.97 ~ 0.988033

##Prediction and Submission File Prep

In [None]:
# Where the baseline submission is written
submission_file_path = '/content/drive/MyDrive/Code/DS_1101/Fly High With FDS/submission.csv'

# Predict on the cleaned test features with the baseline forest
test_predictions = rf_model.predict(test_df_clean)

# Pair each test Id with its prediction, then turn the 0/1 codes back into the
# text labels used by the training data.
# NOTE(review): the header written here is 'ID' while the test file's column is
# 'Id' -- confirm which spelling sample_submission.csv expects.
submission_df = pd.DataFrame(
    {'ID': test_df['Id'], 'satisfaction': test_predictions}
).assign(
    satisfaction=lambda frame: frame['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})
)

submission_df.to_csv(submission_file_path, index=False)

(submission_df.head(), submission_file_path)

(       ID  satisfaction
 0   46587  dissatisfied
 1  124920     satisfied
 2   18490     satisfied
 3   78644  dissatisfied
 4   92713     satisfied,
 '/content/drive/MyDrive/Code/DS_1101/Fly High With FDS/submission.csv')

##Tune-maxxing the Random Forest

In [None]:
!pip install category_encoders
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder

# Define the preprocess_data function
def preprocess_data(train_df, test_df):
    """Encode the target and build model-ready feature matrices.

    Numerical columns are median-imputed and standardised; categorical
    (object) columns are mode-imputed and target-encoded. The preprocessor is
    fitted on the training frame only and then applied to the test frame.

    Args:
        train_df: Training frame containing 'Id', 'satisfaction' and the features.
        test_df: Test frame containing 'Id' and the same feature columns.

    Returns:
        (X, X_test, y): the transformed training features, the transformed
        test features, and the integer target (1 = satisfied, 0 = dissatisfied).

    Raises:
        ValueError: if 'satisfaction' is missing or holds a label outside the
            known set.
    """
    # This dataset labels negatives as 'dissatisfied'; 'neutral or dissatisfied'
    # is kept as an alias so the previously accepted label still maps to 0.
    label_map = {
        'satisfied': 1,
        'dissatisfied': 0,
        'neutral or dissatisfied': 0,
    }
    y = train_df['satisfaction'].map(label_map)

    # Series.map yields NaN for anything not in label_map. Filling those with
    # the mode would silently relabel rows (and can collapse the target to a
    # single class), so refuse to continue instead.
    unmapped = y.isna()
    if unmapped.any():
        bad_labels = train_df.loc[unmapped, 'satisfaction'].unique().tolist()
        raise ValueError(f"Unmapped 'satisfaction' labels: {bad_labels}")
    y = y.astype(int)

    # Separate features from the target and the identifier
    X = train_df.drop(['satisfaction', 'Id'], axis=1)
    X_test = test_df.drop('Id', axis=1)

    # Identify numerical and categorical features by dtype
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    # Numerical columns: median imputation followed by standardisation
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Categorical columns: most-frequent imputation followed by target encoding
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('target_encoder', TargetEncoder())
    ])

    # Route each column group through its own pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # y is passed to fit_transform because the target encoder needs the labels
    X = preprocessor.fit_transform(X, y)

    # The test data is only transformed with the statistics learned above
    X_test = preprocessor.transform(X_test)

    return X, X_test, y

# Load and preprocess.
# The encoded matrices carry a `_tuned` suffix so this cell does not rebind the
# shared X / y / X_train / X_val / y_train / y_val names: other cells fit on
# X_train and predict on test_df_clean, which use a different feature encoding.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
X_tuned, X_test_tuned, y_tuned = preprocess_data(train_df, test_df)

# Split data.
# NOTE: preprocess_data fits its target encoder on every training row before
# this split, so the validation labels have already influenced the encoded
# features; read the validation accuracy below as optimistic.
X_train_tuned, X_val_tuned, y_train_tuned, y_val_tuned = train_test_split(
    X_tuned, y_tuned, test_size=0.2, random_state=42, stratify=y_tuned
)

# Hyperparameter tuning with RandomizedSearchCV: 50 sampled candidates rather
# than the full grid
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30],          # None lets trees grow to full depth
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced', 'balanced_subsample']
}

rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    rf, param_distributions=param_dist, n_iter=50,
    scoring='accuracy', cv=5, n_jobs=-1, verbose=2, random_state=42
)

random_search.fit(X_train_tuned, y_train_tuned)

# Evaluate and print best parameters
print("Best Hyperparameters:", random_search.best_params_)
best_rf = random_search.best_estimator_

y_pred = best_rf.predict(X_val_tuned)
accuracy = accuracy_score(y_val_tuned, y_pred)
print(f"Validation Accuracy: {accuracy}")
print(classification_report(y_val_tuned, y_pred))

# Refit the winning configuration on all training rows (train + validation)
# before predicting the test set
best_rf.fit(X_tuned, y_tuned)

# Make predictions on the test set
test_predictions = best_rf.predict(X_test_tuned)

# Prepare submission
submission_df = pd.DataFrame({
    'ID': test_df['Id'],
    'satisfaction': test_predictions
})
submission_df['satisfaction'] = submission_df['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})
submission_df.to_csv('random_forest_tuned_submission.csv', index=False)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10, 'class_weight': 'balanced_subsample', 'bootstrap': False}
Validation Accuracy: 1.0
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00     23379

    accuracy                           1.00     23379
   macro avg       1.00      1.00      1.00     23379
weighted avg       1.00      1.00      1.00     23379



#Before implementing more complex models-

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

## Hyperparameter Tuning (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Scale inside this cell: no cell above it defines X_train_scaled / test_scaled,
# so reading them here only worked when a later cell had been run first.
# Dedicated names keep this cell from rebinding the shared scaled matrices.
rf_scaler = StandardScaler()
X_train_rf_scaled = rf_scaler.fit_transform(X_train)
test_rf_scaled = rf_scaler.transform(test_df_clean)

# Initialize the RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Use GridSearchCV to search for the best hyperparameters
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the grid search
grid_search_rf.fit(X_train_rf_scaled, y_train)

# Best parameters and best score
print("Best Parameters: ", grid_search_rf.best_params_)
print("Best Cross-Validation Score: ", grid_search_rf.best_score_)

# With the default refit=True, GridSearchCV has already refitted the best
# configuration on everything passed to fit(), so best_estimator_ is ready to
# use and no second fit() is needed.
best_rf_model = grid_search_rf.best_estimator_

# Predict on the test set
test_predictions_rf = best_rf_model.predict(test_rf_scaled)

# Submission
submission_rf = pd.DataFrame({
    'ID': test_df['Id'],
    'satisfaction': test_predictions_rf
})
submission_rf['satisfaction'] = submission_rf['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})
submission_rf.to_csv('submission_rf_optimized.csv', index=False)

##XGBoost Implementation

In [None]:
!pip install xgboost
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Initialize the XGBoost classifier.
# `use_label_encoder` is not passed: the installed XGBoost reports it as an
# unused parameter and only emits a warning for it.
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)

# Set a parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

# Perform grid search (3-fold CV, accuracy)
grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Check the best parameters from the grid search
best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

# best_estimator_ is the model refitted by the search with the best parameters
xgb_best = grid_search.best_estimator_

# Evaluate on the validation set
y_val_pred_xgb = xgb_best.predict(X_val)
xgb_accuracy = accuracy_score(y_val, y_val_pred_xgb)
print(f"XGBoost Validation Accuracy: {xgb_accuracy}")

test_predictions_xgb = xgb_best.predict(test_df_clean)

# Prepare submission file
submission_df = pd.DataFrame({
    'ID': test_df['Id'],
    'satisfaction': test_predictions_xgb
})

# Convert satisfaction back to original labels
submission_df['satisfaction'] = submission_df['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})

# Save submission
submission_df.to_csv('submission_xgboost.csv', index=False)

submission_df.head()

Fitting 3 folds for each of 108 candidates, totalling 324 fits


Parameters: { "use_label_encoder" } are not used.



Best hyperparameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 300, 'subsample': 1}
XGBoost Validation Accuracy: 0.9609478591898712


Unnamed: 0,Id,satisfaction
0,46587,dissatisfied
1,124920,satisfied
2,18490,satisfied
3,78644,dissatisfied
4,92713,satisfied


##Binary Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Candidate settings for a single decision tree
param_grid_dt = {
    'max_depth': [3, 6, 9],
    'min_samples_split': [10, 20, 50],
    'min_samples_leaf': [5, 10, 20],
}

# Exhaustive 3-fold search over the grid, ranked by accuracy
dt_model = DecisionTreeClassifier(random_state=42)
grid_search_dt = GridSearchCV(
    dt_model,
    param_grid_dt,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search_dt.fit(X_train, y_train)

# Score the search winner on the hold-out validation split
dt_best = grid_search_dt.best_estimator_
y_val_pred_dt = dt_best.predict(X_val)
dt_accuracy = accuracy_score(y_val, y_val_pred_dt)
print(f"Decision Tree Validation Accuracy: {dt_accuracy}")

# Predict the test set and build the submission: one row per test Id, with the
# 0/1 codes turned back into the text labels
test_predictions_dt = dt_best.predict(test_df_clean)
submission_dt = pd.DataFrame(
    {'ID': test_df['Id'], 'satisfaction': test_predictions_dt}
)
submission_dt['satisfaction'] = submission_dt['satisfaction'].map(
    {1: 'satisfied', 0: 'dissatisfied'}
)

# Write the CSV without the index column
submission_dt.to_csv('submission_decision_tree.csv', index=False)
print("Decision Tree submission file saved as 'submission_decision_tree.csv'")

  _data = np.array(data, dtype=dtype, copy=copy,


Decision Tree Validation Accuracy: 0.910004705077206
Decision Tree submission file saved as 'submission_decision_tree.csv'




##Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Standardise the features: the scaler learns its statistics from the training
# split only and is then applied to the validation split and the test set
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(test_df_clean)

# Logistic regression with the liblinear solver and a fixed seed
logreg_model = LogisticRegression(solver='liblinear', random_state=42)
logreg_model.fit(X_train_scaled, y_train)

# Hold-out accuracy
y_val_pred_logreg = logreg_model.predict(X_val_scaled)
logreg_accuracy = accuracy_score(y_val, y_val_pred_logreg)
print(f"Logistic Regression Validation Accuracy: {logreg_accuracy}")

# Test-set predictions
test_predictions_logreg = logreg_model.predict(test_scaled)

# Submission: one row per test Id, 0/1 codes turned back into text labels
submission_logreg = pd.DataFrame(
    {'ID': test_df['Id'], 'satisfaction': test_predictions_logreg}
).assign(
    satisfaction=lambda frame: frame['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})
)

submission_logreg.to_csv('submission_logistic_regression.csv', index=False)
print("Logistic Regression submission file saved as 'submission_logistic_regression.csv'")

Logistic Regression Validation Accuracy: 0.8364344069464049
Logistic Regression submission file saved as 'submission_logistic_regression.csv'


##Support Vector Machines (SVMs)

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Standardise all three feature sets up front; the scaler is fitted on the
# training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(test_df_clean)

# Linear-kernel SVM (chosen for speed) with a fixed seed
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_scaled, y_train)

# Hold-out accuracy
y_val_pred_svm = svm_model.predict(X_val_scaled)
svm_accuracy = accuracy_score(y_val, y_val_pred_svm)
print(f"SVM Validation Accuracy: {svm_accuracy}")

# Test-set predictions
test_predictions_svm = svm_model.predict(test_scaled)

# Submission: one row per test Id, 0/1 codes turned back into text labels
submission_svm = pd.DataFrame(
    {'ID': test_df['Id'], 'satisfaction': test_predictions_svm}
)
submission_svm['satisfaction'] = submission_svm['satisfaction'].map(
    {1: 'satisfied', 0: 'dissatisfied'}
)

submission_svm.to_csv('submission_svm.csv', index=False)
print("SVM submission file saved as 'submission_svm.csv'")

SVM Validation Accuracy: 0.8404123358569656
SVM submission file saved as 'submission_svm.csv'


##Ensemble Methods; Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# The three voters: a linear model, a shallow tree and a forest
logreg = LogisticRegression(solver='liblinear', random_state=42)
dt = DecisionTreeClassifier(max_depth=6, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Hard voting: each model casts one vote and the majority class wins
voting_clf = VotingClassifier(
    estimators=[
        ('lr', logreg),
        ('dt', dt),
        ('rf', rf),
    ],
    voting='hard',
)
voting_clf.fit(X_train, y_train)

# Hold-out accuracy
y_val_pred_voting = voting_clf.predict(X_val)
voting_accuracy = accuracy_score(y_val, y_val_pred_voting)
print(f"Voting Classifier Validation Accuracy: {voting_accuracy}")

# Test-set predictions
test_predictions_voting = voting_clf.predict(test_df_clean)

# Submission: one row per test Id, 0/1 codes turned back into text labels
submission_voting = pd.DataFrame(
    {'ID': test_df['Id'], 'satisfaction': test_predictions_voting}
).assign(
    satisfaction=lambda frame: frame['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})
)
submission_voting.to_csv('submission_voting.csv', index=False)
print("Voting Classifier submission file saved as 'submission_voting.csv'")

Voting Classifier Validation Accuracy: 0.9304076307797596
Voting Classifier submission file saved as 'submission_voting.csv'


##Ensemble Stacking

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Local imports keep this cell runnable on its own; both modules are already
# used elsewhere in the notebook.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Build the stacking inputs from the raw frames.
# - 'Id' is an identifier, so it is dropped instead of being scaled as a feature.
# - The target is encoded to 1/0 here: if it stays as text, the model predicts
#   text and the {1: ..., 0: ...} mapping at the end turns every row into NaN.
# - Names carry a `_stack` suffix so the shared X_train / X_val / y_train /
#   y_val / X_train_scaled / X_val_scaled / test_scaled are not rebound.
X_stack = train_df.drop(columns=['satisfaction', 'Id'])
y_stack = train_df['satisfaction'].map({'satisfied': 1, 'dissatisfied': 0})
if y_stack.isna().any():
    unmapped_labels = train_df.loc[y_stack.isna(), 'satisfaction'].unique().tolist()
    raise ValueError(f"Unmapped 'satisfaction' labels: {unmapped_labels}")
y_stack = y_stack.astype(int)

# Stratified 80/20 split, matching the other splits in this notebook
X_train_stack, X_val_stack, y_train_stack, y_val_stack = train_test_split(
    X_stack, y_stack, test_size=0.2, random_state=42, stratify=y_stack
)

# Identify categorical and numerical features
categorical_features = ['Gender', 'Customer Type', 'Type of Travel', 'Class']
numerical_features = X_train_stack.select_dtypes(include=['int64', 'float64']).columns

# Numerical columns: median-impute (the raw frame still has missing arrival
# delays) and then standardise. Categorical columns: one-hot encode.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit the preprocessing on the training split only, then reuse it
X_train_stack_prep = preprocessor.fit_transform(X_train_stack)
X_val_stack_prep = preprocessor.transform(X_val_stack)
test_stack_prep = preprocessor.transform(test_df.drop(columns=['Id']))

# Define base models
base_estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('catboost', CatBoostClassifier(verbose=0)),
    ('lightgbm', LGBMClassifier())
]

# Stacking classifier with logistic regression as the meta-model
stacking_clf = StackingClassifier(estimators=base_estimators, final_estimator=LogisticRegression(), cv=5)

# Train the stacking model
stacking_clf.fit(X_train_stack_prep, y_train_stack)

# Predict on the validation set
y_val_pred_stacking = stacking_clf.predict(X_val_stack_prep)
stacking_accuracy = accuracy_score(y_val_stack, y_val_pred_stacking)
print(f"Stacking Classifier Validation Accuracy: {stacking_accuracy}")

# Predict on test data
test_predictions_stacking = stacking_clf.predict(test_stack_prep)

# Prepare submission for Stacking
submission_stacking = pd.DataFrame({
    'ID': test_df['Id'],
    'satisfaction': test_predictions_stacking
})

# Predictions are 1/0 (see the target encoding above), so this mapping applies
submission_stacking['satisfaction'] = submission_stacking['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})

# Save to CSV for Kaggle submission
submission_stacking.to_csv('submission_stacking.csv', index=False)

print("Submission file saved successfully!")

[LightGBM] [Info] Number of positive: 51179, number of negative: 42334
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010372 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1208
[LightGBM] [Info] Number of data points in the train set: 93513, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547293 -> initscore=0.189739
[LightGBM] [Info] Start training from score 0.189739
[LightGBM] [Info] Number of positive: 40943, number of negative: 33867
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013784 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1208
[LightGBM] [Info] Number of data points in the train set: 74810, number of used features: 28
[LightGBM] [Info] [b

##AdaBoost

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over depth-3 decision trees; the weak learner is passed through the
# `estimator` keyword
ada_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=3),
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
)
ada_model.fit(X_train, y_train)

# Hold-out accuracy
y_val_pred_ada = ada_model.predict(X_val)
ada_accuracy = accuracy_score(y_val, y_val_pred_ada)
print(f"AdaBoost Validation Accuracy: {ada_accuracy}")

# Test-set predictions
test_predictions_ada = ada_model.predict(test_df_clean)

# Submission: one row per test Id, 0/1 codes turned back into text labels
submission_ada = pd.DataFrame(
    {'Id': test_df['Id'], 'satisfaction': test_predictions_ada}
).assign(
    satisfaction=lambda frame: frame['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})
)

submission_ada.to_csv('submission_adaboost.csv', index=False)
print("AdaBoost submission file saved as 'submission_adaboost.csv'")



AdaBoost Validation Accuracy: 0.9275418110269901
AdaBoost submission file saved as 'submission_adaboost.csv'




##CatBoost

In [None]:
!pip install catboost
from catboost import CatBoostClassifier

# CatBoost: 500 boosting iterations, depth-6 trees, silent training, fixed seed
cat_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    verbose=0,
    random_state=42,
)
cat_model.fit(X_train, y_train)

# Hold-out accuracy
y_val_pred_cat = cat_model.predict(X_val)
cat_accuracy = accuracy_score(y_val, y_val_pred_cat)
print(f"CatBoost Validation Accuracy: {cat_accuracy}")

# Test-set predictions
test_predictions_cat = cat_model.predict(test_df_clean)

# Submission: one row per test Id, 0/1 codes turned back into text labels
submission_cat = pd.DataFrame(
    {'ID': test_df['Id'], 'satisfaction': test_predictions_cat}
)
submission_cat['satisfaction'] = submission_cat['satisfaction'].map(
    {1: 'satisfied', 0: 'dissatisfied'}
)
submission_cat.to_csv('submission_catboost.csv', index=False)
print("CatBoost submission file saved as 'submission_catboost.csv'")

CatBoost Validation Accuracy: 0.9613755934813294
CatBoost submission file saved as 'submission_catboost.csv'


## LightGBM

In [None]:
!pip install lightgbm
import lightgbm as lgb

# LightGBM: 500 trees of depth <= 6 with a 0.05 learning rate and a fixed seed
lgb_model = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
)
lgb_model.fit(X_train, y_train)

# Hold-out accuracy
y_val_pred_lgb = lgb_model.predict(X_val)
lgb_accuracy = accuracy_score(y_val, y_val_pred_lgb)
print(f"LightGBM Validation Accuracy: {lgb_accuracy}")

# Test-set predictions
test_predictions_lgb = lgb_model.predict(test_df_clean)

# Submission: one row per test Id, 0/1 codes turned back into text labels
submission_lgb = pd.DataFrame(
    {'ID': test_df['Id'], 'satisfaction': test_predictions_lgb}
).assign(
    satisfaction=lambda frame: frame['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})
)
submission_lgb.to_csv('submission_lightgbm.csv', index=False)
print("LightGBM submission file saved as 'submission_lightgbm.csv'")

[LightGBM] [Info] Number of positive: 51176, number of negative: 42337
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009630 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 934
[LightGBM] [Info] Number of data points in the train set: 93513, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547261 -> initscore=0.189609
[LightGBM] [Info] Start training from score 0.189609
LightGBM Validation Accuracy: 0.9579109457205184
LightGBM submission file saved as 'submission_lightgbm.csv'


##K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier using all available cores.
# This cell reuses X_train_scaled / X_val_scaled and the fitted `scaler` that an
# earlier cell leaves in the kernel.
knn_model = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn_model.fit(X_train_scaled, y_train)

# Scale the test features with that same fitted scaler
test_scaled = scaler.transform(test_df_clean)

# Hold-out accuracy
y_val_pred_knn = knn_model.predict(X_val_scaled)
knn_accuracy = accuracy_score(y_val, y_val_pred_knn)
print(f"KNN Validation Accuracy: {knn_accuracy}")

# Test-set predictions
test_predictions_knn = knn_model.predict(test_scaled)

# Submission: one row per test Id, 0/1 codes turned back into text labels
submission_knn = pd.DataFrame(
    {'ID': test_df['Id'], 'satisfaction': test_predictions_knn}
)
submission_knn['satisfaction'] = submission_knn['satisfaction'].map(
    {1: 'satisfied', 0: 'dissatisfied'}
)
submission_knn.to_csv('submission_knn.csv', index=False)
print("KNN submission file saved as 'submission_knn.csv'")

KNN Validation Accuracy: 0.9236066555455751
KNN submission file saved as 'submission_knn.csv'


##Neural Networks; Multilayer Perceptron(MLP) using Keras

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd

# The categorical columns of X_train / X_val / test_df_clean are already
# integer-encoded by the preprocessing cell, and `label_encoders` holds the
# encoders fitted on the original strings. Re-fitting LabelEncoder here would
# replace those encoders with ones fitted on integer codes and write into
# X_train in place (X_val would be left out), so nothing is re-encoded here.

# Scale the data; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(test_df_clean)

# Build the neural network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

# Declare the input width with an explicit Input object: passing input_shape to
# the first Dense layer triggers a Keras warning.
n_features = X_train_scaled.shape[1]
model = Sequential([
    Input(shape=(n_features,)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')   # single sigmoid unit for the binary target
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model, tracking validation metrics after every epoch
history = model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_data=(X_val_scaled, y_val))

# evaluate() returns [loss, accuracy] for the metrics compiled above
nn_accuracy = model.evaluate(X_val_scaled, y_val, verbose=0)[1]
print(f"Neural Network Validation Accuracy: {nn_accuracy}")

# Make predictions on the test set (sigmoid outputs)
test_predictions_nn = model.predict(test_scaled)

# Threshold at 0.5 and flatten the (n, 1) output into a 1-D array
test_predictions_nn = (test_predictions_nn > 0.5).astype(int).flatten()

# Prepare the submission DataFrame
submission_nn = pd.DataFrame({
    'Id': test_df['Id'],
    'satisfaction': test_predictions_nn
})

# Convert numeric labels back to original text ('satisfied', 'dissatisfied')
submission_nn['satisfaction'] = submission_nn['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})

# Save submission to CSV
submission_nn.to_csv('submission_nn.csv', index=False)
print("Neural Network submission file saved as 'submission_nn.csv'")

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2923/2923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.8819 - loss: 0.2732 - val_accuracy: 0.9302 - val_loss: 0.1691
Epoch 2/10
[1m2923/2923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.9342 - loss: 0.1570 - val_accuracy: 0.9383 - val_loss: 0.1462
Epoch 3/10
[1m2923/2923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 0.9405 - loss: 0.1388 - val_accuracy: 0.9439 - val_loss: 0.1331
Epoch 4/10
[1m2923/2923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9472 - loss: 0.1250 - val_accuracy: 0.9435 - val_loss: 0.1281
Epoch 5/10
[1m2923/2923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.9475 - loss: 0.1200 - val_accuracy: 0.9494 - val_loss: 0.1234
Epoch 6/10
[1m2923/2923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9507 - loss: 0.1146 - val_accuracy: 0.9509 - val_loss: 0.1172
Epoch 7/10
[1m2923/2923[

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
import pandas as pd

# Categorical feature columns that are integer-encoded before scaling
categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class']

# Single source of truth for the output file, so the file written and the
# message printed below cannot disagree.
nn_submission_file = 'submission_nn_keras.csv'

# Rebind to copies so the assignments below do not write into views of frames
# created by earlier cells.
X_train = X_train.copy()
X_val = X_val.copy()
test_df_clean = test_df_clean.copy()

# One encoder per column: fitted on the training split, then reused unchanged
# for the validation and test frames so all three share the same integer codes.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_val[col] = le.transform(X_val[col])
    test_df_clean[col] = le.transform(test_df_clean[col])
    label_encoders[col] = le

# Scale the data after encoding (statistics learned from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(test_df_clean)

# Build the neural network using TensorFlow Keras. An explicit Input layer
# declares the feature count; `input_shape=` on a Dense layer triggers a
# Keras UserWarning.
model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_data=(X_val_scaled, y_val))

# Evaluate the model (index 1 of the result is the 'accuracy' metric)
nn_accuracy = model.evaluate(X_val_scaled, y_val, verbose=0)[1]
print(f"Neural Network Validation Accuracy: {nn_accuracy}")

# Make predictions on the test set
test_predictions_nn = model.predict(test_scaled)

# Convert probabilities to binary predictions (0 or 1)
test_predictions_nn = (test_predictions_nn > 0.5).astype(int).flatten()

# Prepare the submission DataFrame
submission_nn = pd.DataFrame({
    'ID': test_df['Id'],
    'satisfaction': test_predictions_nn
})

# Convert numeric labels back to original text ('satisfied', 'dissatisfied')
submission_nn['satisfaction'] = submission_nn['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})

# Save submission to CSV under the same name that is reported to the user
submission_nn.to_csv(nn_submission_file, index=False)
print(f"Neural Network submission file saved as '{nn_submission_file}'")

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2923/2923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 0.8790 - loss: 0.2826 - val_accuracy: 0.9242 - val_loss: 0.1781
Epoch 2/10
[1m2923/2923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.9351 - loss: 0.1568 - val_accuracy: 0.9385 - val_loss: 0.1434
Epoch 3/10
[1m2923/2923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.9429 - loss: 0.1351 - val_accuracy: 0.9429 - val_loss: 0.1303
Epoch 4/10
[1m2923/2923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.9476 - loss: 0.1256 - val_accuracy: 0.9465 - val_loss: 0.1226
Epoch 5/10
[1m2923/2923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.9514 - loss: 0.1161 - val_accuracy: 0.9461 - val_loss: 0.1191
Epoch 6/10
[1m2923/2923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.9530 - loss: 0.1093 - val_accuracy: 0.9498 - val_loss: 0.1146
Epoch 7/10
[1m2923/2923

## **High-Performance Models**
*   Stacking with XGBoost, LightGBM, and RandomForest, using a Logistic Regression meta-model
*   Deep Neural Network (DNN); requires a T4 GPU
*   XGBoost




In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam

# Shared settings for this cell
SEED = 42
GENDER_CODES = {'Male': 1, 'Female': 0}
LABEL_NAMES = {1: 'satisfied', 0: 'dissatisfied'}
DELAY_COL = 'Arrival Delay in Minutes'

# Step 3: Preprocessing
# Build the feature frame first and encode 'Gender' on that copy. Writing the
# mapped values back into `train_df` would make this cell non-idempotent (a
# second run maps the already-numeric column to NaN) and would hand later
# cells a training frame whose 'Gender' no longer matches the test frame.
X = train_df.drop(['Id', 'satisfaction'], axis=1)
X['Gender'] = X['Gender'].map(GENDER_CODES)
y = train_df['satisfaction'].map({'satisfied': 1, 'dissatisfied': 0})

# Train/validation split (copies, so the column assignments below are safe)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)
X_train, X_val = X_train.copy(), X_val.copy()

# Median-impute the arrival delay, as the other modelling cells in this
# notebook do. Without it any missing value would be passed straight to the
# DNN. The median is learned from the training split only.
imputer = SimpleImputer(strategy='median')
X_train[DELAY_COL] = imputer.fit_transform(X_train[[DELAY_COL]]).ravel()
X_val[DELAY_COL] = imputer.transform(X_val[[DELAY_COL]]).ravel()

# Define the categorical columns
categorical_cols = ['Customer Type', 'Type of Travel', 'Class']

# Encode categorical features before scaling; keep every fitted encoder so the
# test data can later be encoded with exactly the same mapping.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_val[col] = le.transform(X_val[col])
    label_encoders[col] = le

# Standard scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Step 4: Base Models (seeded so repeated runs give the same trees)
xgb_model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=7,
    learning_rate=0.01,
    random_state=SEED)
lgb_model = lgb.LGBMClassifier(
    n_estimators=500,
    max_depth=7,
    learning_rate=0.01,
    random_state=SEED)
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=SEED)

# Step 5: Stacking Classifier with a logistic-regression meta-model
estimators = [
    ('xgb', xgb_model),
    ('lgb', lgb_model),
    ('rf', rf_model)
]

stacking_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Train the stacking model
stacking_model.fit(X_train_scaled, y_train)

# Evaluate on validation set
y_val_pred_stack = stacking_model.predict(X_val_scaled)
print(f"Stacking Model Validation Accuracy: {accuracy_score(y_val, y_val_pred_stack)}")

# Step 6: Build DNN Model
def build_dnn(input_shape):
    """Return an uncompiled 512-256-128 MLP with 30% dropout and a sigmoid output.

    input_shape: number of input features (an int).
    """
    model = Sequential()
    # Explicit Input layer instead of `input_shape=` on the first Dense layer.
    model.add(Input(shape=(input_shape,)))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    return model

# Compile the model
dnn_model = build_dnn(input_shape=X_train_scaled.shape[1])
dnn_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Step 7: Train the DNN Model
dnn_model.fit(X_train_scaled, y_train, epochs=20, batch_size=128, validation_data=(X_val_scaled, y_val))

# Evaluate the DNN model on validation set (flatten the (n, 1) probability column)
dnn_val_pred = (dnn_model.predict(X_val_scaled) > 0.5).astype("int32").ravel()
print(f"DNN Validation Accuracy: {accuracy_score(y_val, dnn_val_pred)}")

# Step 8: Train a standalone XGBoost model.
# NOTE: no GPU-specific parameters are set here; the library's default device is used.
xgb_gpu_model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=7,
    learning_rate=0.01,
    random_state=SEED
)
xgb_gpu_model.fit(X_train_scaled, y_train)

# Evaluate the XGBoost model
xgb_val_pred = xgb_gpu_model.predict(X_val_scaled)
print(f"XGBoost Validation Accuracy: {accuracy_score(y_val, xgb_val_pred)}")

# Step 9: Prepare test data with the transformers fitted on the training split
X_test = test_df.drop(['Id'], axis=1)
X_test['Gender'] = X_test['Gender'].map(GENDER_CODES)
X_test[DELAY_COL] = imputer.transform(X_test[[DELAY_COL]]).ravel()

# Reuse the training encoders; re-fitting on the test data could assign
# different integers to the same category.
for col in categorical_cols:
    X_test[col] = label_encoders[col].transform(X_test[col])

X_test_scaled = scaler.transform(X_test)

# Step 10: Predictions for Submission
stacking_test_pred = stacking_model.predict(X_test_scaled)
dnn_test_pred = (dnn_model.predict(X_test_scaled) > 0.5).astype("int32")
xgb_test_pred = xgb_gpu_model.predict(X_test_scaled)

# Step 11: Create Submission Files
def _build_submission(predictions):
    """Pair the test ids with 0/1 `predictions` translated to the text labels."""
    frame = pd.DataFrame({'ID': test_df['Id'], 'satisfaction': predictions})
    # Same label format as every other submission written by this notebook.
    frame['satisfaction'] = frame['satisfaction'].map(LABEL_NAMES)
    return frame

submission_stacking = _build_submission(stacking_test_pred)
submission_dnn = _build_submission(dnn_test_pred.flatten())
submission_xgb = _build_submission(xgb_test_pred)

# Save the submissions as CSV files
submission_stacking.to_csv("submission_stacking.csv", index=False)
submission_dnn.to_csv("submission_dnn.csv", index=False)
submission_xgb.to_csv("submission_xgb.csv", index=False)

print("Submissions saved successfully!")

## GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score  # used below; imported here so the cell does not rely on another cell
import pandas as pd
import numpy as np

print(train_df.columns.tolist())

# Data Cleaning: 'Id' is dropped from both feature frames
train_df_clean = train_df.drop(columns=['Id'])
test_df_clean = test_df.drop(columns=['Id'])

# Impute missing arrival delays with the median learned from the training data
imputer = SimpleImputer(strategy='median')
train_df_clean['Arrival Delay in Minutes'] = imputer.fit_transform(train_df_clean[['Arrival Delay in Minutes']])
test_df_clean['Arrival Delay in Minutes'] = imputer.transform(test_df_clean[['Arrival Delay in Minutes']])

# Encode the binary target as 1 / 0
train_df_clean['satisfaction'] = train_df_clean['satisfaction'].map({'satisfied': 1, 'dissatisfied': 0})

# Ordinal Encoding for ratings columns (e.g., Seat comfort, Online support)
ratings_cols = ['Seat comfort', 'Departure/Arrival time convenient', 'Food and drink',
                'Gate location', 'Inflight wifi service', 'Inflight entertainment',
                'Online support', 'Ease of Online booking', 'On-board service',
                'Leg room service', 'Baggage handling', 'Checkin service',
                'Cleanliness', 'Online boarding']
ordinal_encoder = OrdinalEncoder()
train_df_clean[ratings_cols] = ordinal_encoder.fit_transform(train_df_clean[ratings_cols])
test_df_clean[ratings_cols] = ordinal_encoder.transform(test_df_clean[ratings_cols])

# One-Hot Encoding for nominal categorical columns. The encoder is left at its
# default (sparse) output and densified explicitly below, which works
# regardless of whether the installed version spells the option `sparse` or
# `sparse_output`.
categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class']
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
encoded_train = one_hot_encoder.fit_transform(train_df_clean[categorical_cols])
encoded_test = one_hot_encoder.transform(test_df_clean[categorical_cols])
one_hot_names = one_hot_encoder.get_feature_names_out(categorical_cols)

# Append the encoded columns as ordinary dense columns. Each helper frame gets
# the index of the frame it is joined to, so rows line up by construction, and
# the result is not a mix of pandas-sparse and dense columns (which
# scikit-learn would have to convert, with a warning, on every fit).
train_df_clean = train_df_clean.join(
    pd.DataFrame(encoded_train.toarray(), columns=one_hot_names, index=train_df_clean.index)
)
test_df_clean = test_df_clean.join(
    pd.DataFrame(encoded_test.toarray(), columns=one_hot_names, index=test_df_clean.index)
)

# Drop the original categorical columns
train_df_clean = train_df_clean.drop(columns=categorical_cols)
test_df_clean = test_df_clean.drop(columns=categorical_cols)

# Define features and target
X = train_df_clean.drop(columns=['satisfaction'])
y = train_df_clean['satisfaction']

# Stratified 80/20 split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Random Forest Classifier with hyperparameter tuning
rf_model = RandomForestClassifier(random_state=42)

# 3 * 3 * 3 * 3 * 2 = 162 candidate settings
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Exhaustive grid search with 5-fold stratified cross-validation
cv = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best hyperparameters
print(grid_search.best_params_)

# Make predictions on validation set
y_val_pred = grid_search.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy}")

# Predictions for test set and submission
test_predictions = grid_search.predict(test_df_clean)
submission_df = pd.DataFrame({
    'ID': test_df['Id'],
    'satisfaction': test_predictions
})
submission_df['satisfaction'] = submission_df['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})
submission_df.to_csv('submission.csv', index=False)

['Id', 'satisfaction', 'Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class', 'Flight Distance', 'Seat comfort', 'Departure/Arrival time convenient', 'Food and drink', 'Gate location', 'Inflight wifi service', 'Inflight entertainment', 'Online support', 'Ease of Online booking', 'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Cleanliness', 'Online boarding', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
Fitting 5 folds for each of 162 candidates, totalling 810 fits


## RandomizedSearchCV

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold

# --- Preprocessing ---
# 'Id' only identifies a row, so it is removed from both feature frames.
train_df_clean, test_df_clean = (frame.drop(columns=['Id']) for frame in (train_df, test_df))

# Fill gaps in 'Arrival Delay in Minutes' with the median learned from the training frame.
imputer = SimpleImputer(strategy='median')
train_df_clean['Arrival Delay in Minutes'] = imputer.fit_transform(train_df_clean[['Arrival Delay in Minutes']])
test_df_clean['Arrival Delay in Minutes'] = imputer.transform(test_df_clean[['Arrival Delay in Minutes']])

# Integer-encode the categorical columns. Each encoder is fitted on the union
# of the train and test values, so every category seen in either frame has a code.
categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class']

label_encoders = {
    col: LabelEncoder().fit(pd.concat([train_df_clean[col], test_df_clean[col]], axis=0).unique())
    for col in categorical_cols
}
for col, le in label_encoders.items():
    train_df_clean[col] = le.transform(train_df_clean[col])
    test_df_clean[col] = le.transform(test_df_clean[col])

# Target: 'satisfied' -> 1, 'dissatisfied' -> 0
train_df_clean['satisfaction'] = train_df_clean['satisfaction'].map({'satisfied': 1, 'dissatisfied': 0})

# Feature matrix and target vector
y = train_df_clean['satisfaction']
X = train_df_clean.drop(columns=['satisfaction'])

# Stratified 80/20 train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- Model and search space ---
rf_model = RandomForestClassifier(random_state=42)

param_distributions = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# 50 randomly drawn settings, each scored with 3-fold cross-validation on all CPU cores
search_settings = dict(n_iter=50, cv=3, n_jobs=-1, verbose=2, random_state=42)
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_distributions,
    **search_settings
)

# Run the search on the training split
random_search.fit(X_train, y_train)

# Report the winning configuration
print("Best hyperparameters found by RandomizedSearchCV:")
print(random_search.best_params_)

# Score the refitted best model on the validation split
y_val_pred = random_search.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy}")
print("\nClassification Report on Validation Set:")
print(classification_report(y_val, y_val_pred))

# --- Submission ---
test_predictions = random_search.predict(test_df_clean)

# Ids come from the original test frame; predictions are still 0/1 here
submission_df = pd.DataFrame({'ID': test_df['Id'], 'satisfaction': test_predictions})

# Translate the numeric classes back to 'satisfied' / 'dissatisfied'
submission_df['satisfaction'] = submission_df['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})

# Write the file to Drive and report where it went
submission_file_path = '/content/drive/MyDrive/Code/DS_1101/Fly High With FDS/Black-Mesa-Survivors_08_RandomizedSearchCV.csv'
submission_df.to_csv(submission_file_path, index=False)

print(f"Submission file saved to: {submission_file_path}")

Fitting 3 folds for each of 50 candidates, totalling 150 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best hyperparameters found by RandomizedSearchCV:
{'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 30, 'bootstrap': False}
Validation Accuracy: 0.9581675862953933

Classification Report on Validation Set:
              precision    recall  f1-score   support

           0       0.94      0.97      0.95     10585
           1       0.97      0.95      0.96     12794

    accuracy                           0.96     23379
   macro avg       0.96      0.96      0.96     23379
weighted avg       0.96      0.96      0.96     23379

Submission file saved to: /content/drive/MyDrive/Code/DS_1101/Fly High With FDS/Black-Mesa-Survivors_08_RandomizedSearchCV.csv


## Full Random Forest Implementation with Extended Preprocessing and Feature Engineering


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

# EDA and Data Visualization
def plot_feature_distribution(df, feature, title):
    """Display a count plot of column `feature` of `df`, headed by `title`.

    The figure is 8x6 inches and the x tick labels are rotated by 45 degrees.
    Nothing is returned; the plot is shown immediately.
    """
    _, axis = plt.subplots(figsize=(8, 6))
    sns.countplot(data=df, x=feature, ax=axis)
    axis.set_title(title)
    # `axis` is the current axes, so this rotates its x tick labels.
    plt.xticks(rotation=45)
    plt.show()

# Data Preprocessing
def preprocess_data(train_df, test_df):
    """Clean, encode and scale the raw train/test frames for the Random Forest run.

    Steps, applied to copies of the inputs with 'Id' removed:
      1. median-impute 'Arrival Delay in Minutes' (median learned from train);
      2. add 'Flight Distance Category' by binning 'Flight Distance';
      3. label-encode the categorical columns (encoders fitted on the union of
         train and test values);
      4. map 'satisfaction' to 1 / 0 in the training frame;
      5. standardise three numeric columns with statistics learned from train.

    Args:
        train_df: raw training frame; must contain 'Id' and 'satisfaction'.
        test_df: raw test frame; must contain 'Id'.

    Returns:
        Tuple ``(train_df_clean, test_df_clean)``; the first still contains the
        encoded 'satisfaction' column.
    """
    # Drop 'Id' column as it is not useful for modeling
    train_df_clean = train_df.drop(columns=['Id'])
    test_df_clean = test_df.drop(columns=['Id'])

    # Handle missing values (Impute 'Arrival Delay in Minutes' with the median)
    imputer = SimpleImputer(strategy='median')
    train_df_clean['Arrival Delay in Minutes'] = imputer.fit_transform(train_df_clean[['Arrival Delay in Minutes']])
    test_df_clean['Arrival Delay in Minutes'] = imputer.transform(test_df_clean[['Arrival Delay in Minutes']])

    # Feature engineering: bin 'Flight Distance' into short / medium / long.
    # One bin definition is shared by both frames. `include_lowest=True` closes
    # the first interval on the left, so a distance of exactly 0 is labelled
    # 'short' instead of falling outside every bin and becoming NaN.
    distance_bins = [0, 1000, 3000, np.inf]
    distance_labels = ['short', 'medium', 'long']
    for frame in (train_df_clean, test_df_clean):
        frame['Flight Distance Category'] = pd.cut(frame['Flight Distance'],
                                                   bins=distance_bins,
                                                   labels=distance_labels,
                                                   include_lowest=True)

    # Encode categorical variables using LabelEncoder
    categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'Flight Distance Category']

    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        # Fit on the values of both frames so neither transform meets an unseen label
        le.fit(pd.concat([train_df_clean[col], test_df_clean[col]], axis=0).unique())
        train_df_clean[col] = le.transform(train_df_clean[col])
        test_df_clean[col] = le.transform(test_df_clean[col])
        label_encoders[col] = le

    # Encode the target variable 'satisfaction'
    train_df_clean['satisfaction'] = train_df_clean['satisfaction'].map({'satisfied': 1, 'dissatisfied': 0})

    # Standardize 'Flight Distance', 'Departure Delay' and 'Arrival Delay'
    # (the scaler is fitted on the training frame only)
    scaler = StandardScaler()
    numerical_cols = ['Flight Distance', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
    train_df_clean[numerical_cols] = scaler.fit_transform(train_df_clean[numerical_cols])
    test_df_clean[numerical_cols] = scaler.transform(test_df_clean[numerical_cols])

    return train_df_clean, test_df_clean

# Random Forest with Hyperparameter Tuning and Cross-Validation
def random_forest_model(X, y, X_test, test_df, output_path):
    """Tune a Random Forest, report hold-out metrics and write a submission CSV.

    The function splits ``X``/``y`` 80/20 (stratified), runs a 50-candidate
    randomized search with 5-fold stratified CV on the training part, prints
    and plots validation diagnostics, keeps the features whose importance is at
    least the median, refits the best configuration on those features, scores
    that reduced model on the hold-out split, and finally predicts the test set.

    Args:
        X: training features as a DataFrame (column names are used for plots).
        y: binary target aligned with ``X`` (1 = satisfied, 0 = dissatisfied).
        X_test: test features with the same columns as ``X``.
        test_df: original test frame; only its 'Id' column is read.
        output_path: destination of the submission CSV.

    Returns:
        None. Results are printed, plotted and written to ``output_path``.
    """
    # Local import: `clone` is not among the imports at the top of this cell.
    from sklearn.base import clone

    # Split the data into training and validation sets (80-20 split)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Define Random Forest model
    rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)

    # Hyperparameter search space. 'auto' is deliberately absent from
    # `max_features`: recent scikit-learn releases reject it for
    # RandomForestClassifier (it was an alias of 'sqrt'), so every candidate
    # that sampled it would fail to fit.
    param_distributions = {
        'n_estimators': [100, 200, 300, 400],
        'max_depth': [10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
        'max_features': ['sqrt', 'log2']
    }

    # Stratified K-Folds Cross-Validation
    skf = StratifiedKFold(n_splits=5)

    # Randomized Search CV for hyperparameter tuning
    random_search = RandomizedSearchCV(
        estimator=rf_model,
        param_distributions=param_distributions,
        n_iter=50,
        cv=skf,
        scoring='accuracy',
        n_jobs=-1,
        verbose=2,
        random_state=42
    )

    # Fit the RandomizedSearchCV model
    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_

    # Best hyperparameters
    print("Best hyperparameters found by RandomizedSearchCV:")
    print(random_search.best_params_)

    # Evaluate the full-feature model on the validation set
    y_val_pred = random_search.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"Validation Accuracy: {val_accuracy}")
    print("\nClassification Report on Validation Set:")
    print(classification_report(y_val, y_val_pred))

    # Confusion Matrix for better insight
    cm = confusion_matrix(y_val, y_val_pred)
    sns.heatmap(cm, annot=True, fmt='d')
    plt.title('Confusion Matrix on Validation Set')
    plt.show()

    # Feature importances, most important first
    importances = best_model.feature_importances_
    indices = np.argsort(importances)[::-1]
    plt.figure(figsize=(12, 8))
    plt.title('Feature Importance')
    plt.bar(range(X_train.shape[1]), importances[indices], align='center')
    plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
    plt.tight_layout()
    plt.show()

    # Keep the features whose importance is at least the median importance
    selector = SelectFromModel(best_model, threshold="median")
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_val_selected = selector.transform(X_val)
    X_test_selected = selector.transform(X_test)

    # Fit a fresh copy of the best configuration on the selected features.
    # Cloning leaves the estimator held by `random_search` untouched, so the
    # search object keeps matching the diagnostics reported above.
    final_model = clone(best_model).fit(X_train_selected, y_train)

    # The reduced model is the one that produces the submission, so score it
    # on the hold-out split as well.
    selected_val_accuracy = accuracy_score(y_val, final_model.predict(X_val_selected))
    print(f"Validation Accuracy (selected features): {selected_val_accuracy}")

    # Make predictions on the test set with selected features
    test_predictions = final_model.predict(X_test_selected)

    # Prepare the submission DataFrame
    submission_df = pd.DataFrame({
        'ID': test_df['Id'],  # Re-include 'Id' from the original test dataset
        'satisfaction': test_predictions
    })

    # Convert 'satisfaction' back to original labels ('satisfied' or 'dissatisfied')
    submission_df['satisfaction'] = submission_df['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})

    # Save the submission file
    submission_df.to_csv(output_path, index=False)
    print(f"Submission file saved to: {output_path}")

# Load data. The test frame must come from test.csv: reading train.csv here
# would predict on the training rows and leave a 'satisfaction' column in the
# test features.
train_df = pd.read_csv('/content/drive/MyDrive/Code/DS_1101/Fly High With FDS/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Code/DS_1101/Fly High With FDS/test.csv')

# Preprocess the data
train_df_clean, test_df_clean = preprocess_data(train_df, test_df)

# Separate features and target
X = train_df_clean.drop(columns=['satisfaction'])
y = train_df_clean['satisfaction']

# Run Random Forest Model
output_path = '/content/drive/MyDrive/Code/DS_1101/Fly High With FDS/Black-Mesa-Survivors_11_RandomForest_Extra.csv'
random_forest_model(X, y, test_df_clean, test_df, output_path)

## Stratified K-Fold Cross-Validation

In [None]:
!pip install category_encoders
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
import warnings # Importing warnings library to ignore the warning in future.
warnings.filterwarnings("ignore") # Code to ignore warnings in future.

# Preprocessing function (same as before)
def preprocess_data(train_df, test_df):
    """Turn the raw train/test frames into model-ready arrays.

    Numeric columns are median-imputed and standardised; object columns are
    imputed with their most frequent value and target-encoded. All
    transformers are fitted on the training frame and reused for the test frame.

    Args:
        train_df: raw training frame containing 'Id' and 'satisfaction'.
        test_df: raw test frame containing 'Id'.

    Returns:
        Tuple ``(X, X_test, y)``: the transformed training features, the
        transformed test features, and the target as a Series of 1 (satisfied)
        and 0 (dissatisfied).

    Raises:
        ValueError: if 'satisfaction' holds a non-null label that is not in
            the label map.

    NOTE(review): this definition replaces an earlier `preprocess_data` in the
    notebook that returns a different tuple. The target encoder is fitted on
    the whole training frame before any cross-validation split, so fold scores
    computed from the returned `X` may be optimistic.
    """
    # Separate features and target variable
    X = train_df.drop(['satisfaction', 'Id'], axis=1)
    X_test = test_df.drop('Id', axis=1)

    # The dataset's negative label is 'dissatisfied'; the longer spelling is
    # accepted as well. With only the long spelling in the map, every
    # 'dissatisfied' row became NaN and was then filled with the mode, which
    # collapsed the target to a single class.
    label_map = {'satisfied': 1, 'dissatisfied': 0, 'neutral or dissatisfied': 0}
    raw_target = train_df['satisfaction']
    y = raw_target.map(label_map)

    # A label that is present but not in the map is a data/schema problem:
    # report it instead of silently replacing it.
    unknown_labels = raw_target[raw_target.notna() & y.isna()].unique()
    if len(unknown_labels) > 0:
        raise ValueError(f"Unrecognised satisfaction labels: {list(unknown_labels)}")

    # Genuinely missing labels keep the original best-effort handling
    # (filled with the most frequent class).
    if y.isna().any():
        y = y.fillna(y.mode()[0])
    y = y.astype(int)

    # Identify numerical and categorical features
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    # Create transformers for numerical and categorical features
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('target_encoder', TargetEncoder())  # Using TargetEncoder
    ])

    # Combine transformers using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Fit and transform the training data (the target is needed by the target encoder)
    X = preprocessor.fit_transform(X, y)

    # Transform the test data
    X_test = preprocessor.transform(X_test)

    return X, X_test, y


# Read the raw CSVs and convert them into model-ready arrays
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
X, X_test, y = preprocess_data(train_df, test_df)

# Outer loop: shuffled, stratified folds
n_splits = 5  # Number of folds (adjust as needed)
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Random Forest search space sampled inside every outer fold
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30],          # None lets trees grow to full depth
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced', 'balanced_subsample'],
}

# Per-fold results: accuracy, text report, and test-set class probabilities
cv_scores = []
cv_classification_reports = []
test_predictions = []

for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    # X is positional (array-like), y is a Series, hence the two indexing styles
    X_train, y_train = X[train_index], y.iloc[train_index]
    X_val, y_val = X[val_index], y.iloc[val_index]

    # Inner 5-fold randomized search over 50 sampled settings
    rf = RandomForestClassifier(random_state=42)
    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=50,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=2,
        random_state=42,
    )
    random_search.fit(X_train, y_train)
    best_rf = random_search.best_estimator_

    # Score this fold's best model on its validation part
    y_pred = best_rf.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    classification_rep = classification_report(y_val, y_pred)

    cv_scores.append(accuracy)
    cv_classification_reports.append(classification_rep)
    # Keep the test-set class probabilities so the folds can be averaged later
    test_predictions.append(best_rf.predict_proba(X_test))

    print(f"Fold {fold+1} Accuracy: {accuracy}")
    print(f"Fold {fold+1} Classification Report:\n{classification_rep}\n")

# Summarize CV performance
print("Average cross-validation accuracy:", np.mean(cv_scores))
print("Standard deviation of cross-validation accuracy:", np.std(cv_scores))

# Average the per-fold probabilities and threshold the second probability column at 0.5
averaged_predictions = np.mean(test_predictions, axis=0)
predicted_classes = (averaged_predictions[:, 1] > 0.5).astype(int)
submission_df = pd.DataFrame({'ID': test_df['Id'], 'satisfaction': predicted_classes})

# Translate 1 / 0 back to the text labels and write the file
submission_df['satisfaction'] = submission_df['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})
submission_df.to_csv("averaged_model_submission.csv", index=False)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
