##Preprocessing and Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Dropping the ID column (not useful for modeling) from both train and test
train_df_clean = train_df.drop(columns=['Id'])
test_df_clean = test_df.drop(columns=['Id'])

# Impute missing values in the 'Arrival Delay in Minutes' with median
imputer = SimpleImputer(strategy='median')
train_df_clean['Arrival Delay in Minutes'] = imputer.fit_transform(train_df_clean[['Arrival Delay in Minutes']])
test_df_clean['Arrival Delay in Minutes'] = imputer.transform(test_df_clean[['Arrival Delay in Minutes']])

# Encode categorical variables using LabelEncoder
categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class']

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_df_clean[col] = le.fit_transform(train_df_clean[col])
    test_df_clean[col] = le.transform(test_df_clean[col])
    label_encoders[col] = le

# Encode the target variable 'satisfaction'
train_df_clean['satisfaction'] = train_df_clean['satisfaction'].map({'satisfied': 1, 'dissatisfied': 0})

# Separate features and target variable
X = train_df_clean.drop(columns=['satisfaction'])
y = train_df_clean['satisfaction']

# Split the training data into training and validation sets (80-20 split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train.head(), y_train.head()

KeyError: 'satisfaction'

##Basic Random Forest Classifier Model Evaluation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = rf_model.predict(X_val)

# Evaluate the model performance
val_accuracy = accuracy_score(y_val, y_val_pred)
val_classification_report = classification_report(y_val, y_val_pred)

val_accuracy, val_classification_report

(0.9586380940159973,
 '              precision    recall  f1-score   support\n\n           0       0.94      0.97      0.95     10585\n           1       0.97      0.95      0.96     12794\n\n    accuracy                           0.96     23379\n   macro avg       0.96      0.96      0.96     23379\nweighted avg       0.96      0.96      0.96     23379\n')

First attempt with Basic RFC 5/10/24; score - 0.97 ~ 0.988033

##Prediction and Submission File Prep

In [None]:
# Make predictions on the test dataset
test_predictions = rf_model.predict(test_df_clean)

# Prepare the submission file
submission_df = pd.DataFrame({
    'ID': test_df['Id'],
    'satisfaction': test_predictions
})

# Convert satisfaction values back to original labels ('satisfied', 'dissatisfied')
submission_df['satisfaction'] = submission_df['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})

# Save the submission file
submission_file_path = 'submission_RFC_reversed.csv'
submission_df.to_csv(submission_file_path, index=False)

submission_df.head(), submission_file_path

(       ID  satisfaction
 0   46587  dissatisfied
 1  124920     satisfied
 2   18490     satisfied
 3   78644  dissatisfied
 4   92713     satisfied,
 '/content/drive/MyDrive/Code/DS_1101/Fly High With FDS/submission.csv')

##Full Random Forest Implementation with Extended Preprocessing and Feature Engineering


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

# EDA and Data Visualization
def plot_feature_distribution(df, feature, title):
    plt.figure(figsize=(8, 6))
    sns.countplot(x=feature, data=df)
    plt.title(title)
    plt.xticks(rotation=45)
    plt.show()

# Data Preprocessing
def preprocess_data(train_df, test_df):
    # Drop 'Id' column as it is not useful for modeling
    train_df_clean = train_df.drop(columns=['Id'])
    test_df_clean = test_df.drop(columns=['Id'])

    # Handle missing values (Impute 'Arrival Delay in Minutes' with the median)
    imputer = SimpleImputer(strategy='median')
    train_df_clean['Arrival Delay in Minutes'] = imputer.fit_transform(train_df_clean[['Arrival Delay in Minutes']])
    test_df_clean['Arrival Delay in Minutes'] = imputer.transform(test_df_clean[['Arrival Delay in Minutes']])

    # Feature Engineering: Add new features like total flights or flight categories if possible (domain-specific knowledge)
    # Example: Binning 'Flight Distance' into categories (short, medium, long flights)
    train_df_clean['Flight Distance Category'] = pd.cut(train_df_clean['Flight Distance'],
                                                       bins=[0, 1000, 3000, np.inf],
                                                       labels=['short', 'medium', 'long'])
    test_df_clean['Flight Distance Category'] = pd.cut(test_df_clean['Flight Distance'],
                                                      bins=[0, 1000, 3000, np.inf],
                                                      labels=['short', 'medium', 'long'])

    # Encode categorical variables using LabelEncoder
    categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'Flight Distance Category']

    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        le.fit(pd.concat([train_df_clean[col], test_df_clean[col]], axis=0).unique())
        train_df_clean[col] = le.transform(train_df_clean[col])
        test_df_clean[col] = le.transform(test_df_clean[col])
        label_encoders[col] = le

    # Encode the target variable 'satisfaction'
    train_df_clean['satisfaction'] = train_df_clean['satisfaction'].map({'satisfied': 1, 'dissatisfied': 0})

    # Standardize features like 'Flight Distance', 'Departure Delay' and 'Arrival Delay'
    scaler = StandardScaler()
    numerical_cols = ['Flight Distance', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
    train_df_clean[numerical_cols] = scaler.fit_transform(train_df_clean[numerical_cols])
    test_df_clean[numerical_cols] = scaler.transform(test_df_clean[numerical_cols])

    return train_df_clean, test_df_clean

# Random Forest with Hyperparameter Tuning and Cross-Validation
def random_forest_model(X, y, X_test, test_df, output_path):
    # Split the data into training and validation sets (80-20 split)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Define Random Forest model
    rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)

    # Hyperparameter tuning grid
    param_distributions = {
        'n_estimators': [100, 200, 300, 400],
        'max_depth': [10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
        'max_features': ['auto', 'sqrt', 'log2']
    }

    # Stratified K-Folds Cross-Validation
    skf = StratifiedKFold(n_splits=5)

    # Randomized Search CV for hyperparameter tuning
    random_search = RandomizedSearchCV(
        estimator=rf_model,
        param_distributions=param_distributions,
        n_iter=50,
        cv=skf,
        scoring='accuracy',
        n_jobs=-1,
        verbose=2,
        random_state=42
    )

    # Fit the RandomizedSearchCV model
    random_search.fit(X_train, y_train)

    # Best hyperparameters
    print("Best hyperparameters found by RandomizedSearchCV:")
    print(random_search.best_params_)

    # Evaluate on the validation set
    y_val_pred = random_search.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"Validation Accuracy: {val_accuracy}")
    print("\nClassification Report on Validation Set:")
    print(classification_report(y_val, y_val_pred))

    # Confusion Matrix for better insight
    cm = confusion_matrix(y_val, y_val_pred)
    sns.heatmap(cm, annot=True, fmt='d')
    plt.title('Confusion Matrix on Validation Set')
    plt.show()

    # Feature Importance for potential feature selection
    importances = random_search.best_estimator_.feature_importances_
    indices = np.argsort(importances)[::-1]
    plt.figure(figsize=(12, 8))
    plt.title('Feature Importance')
    plt.bar(range(X_train.shape[1]), importances[indices], align='center')
    plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
    plt.tight_layout()
    plt.show()

    # Select top features using SelectFromModel
    selector = SelectFromModel(random_search.best_estimator_, threshold="median")
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Refit model on selected features
    random_search.best_estimator_.fit(X_train_selected, y_train)

    # Make predictions on the test set with selected features
    test_predictions = random_search.best_estimator_.predict(X_test_selected)

    # Prepare the submission DataFrame
    submission_df = pd.DataFrame({
        'ID': test_df['Id'],  # Re-include 'Id' from the original test dataset
        'satisfaction': test_predictions
    })

    # Convert 'satisfaction' back to original labels ('satisfied' or 'dissatisfied')
    submission_df['satisfaction'] = submission_df['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})

    # Save the submission file
    submission_df.to_csv(output_path, index=False)
    print(f"Submission file saved to: {output_path}")

# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Preprocess the data
train_df_clean, test_df_clean = preprocess_data(train_df, test_df)

# Separate features and target
X = train_df_clean.drop(columns=['satisfaction'])
y = train_df_clean['satisfaction']

# Run Random Forest Model
output_path = 'Black-Mesa-Survivors_11_RandomForest_Extra.csv'
random_forest_model(X, y, test_df_clean, test_df, output_path)


Fitting 5 folds for each of 50 candidates, totalling 250 fits


KeyboardInterrupt: 

##XGBoost with GPU

In [None]:
# Install the GPU version of XGBoost if needed
!pip install xgboost

# Import required libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Data Preprocessing
def preprocess_data(train_df, test_df):
    # Drop 'Id' column as it is not useful for modeling
    train_df_clean = train_df.drop(columns=['Id'])
    test_df_clean = test_df.drop(columns=['Id'])

    # Handle missing values (Impute 'Arrival Delay in Minutes' with the median)
    imputer = SimpleImputer(strategy='median')
    train_df_clean['Arrival Delay in Minutes'] = imputer.fit_transform(train_df_clean[['Arrival Delay in Minutes']])
    test_df_clean['Arrival Delay in Minutes'] = imputer.transform(test_df_clean[['Arrival Delay in Minutes']])

    # Feature Engineering: Add new features like total flights or flight categories if possible (domain-specific knowledge)
    train_df_clean['Flight Distance Category'] = pd.cut(train_df_clean['Flight Distance'],
                                                       bins=[0, 1000, 3000, np.inf],
                                                       labels=['short', 'medium', 'long'])
    test_df_clean['Flight Distance Category'] = pd.cut(test_df_clean['Flight Distance'],
                                                      bins=[0, 1000, 3000, np.inf],
                                                      labels=['short', 'medium', 'long'])

    # Encode categorical variables using LabelEncoder
    categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'Flight Distance Category']

    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        le.fit(pd.concat([train_df_clean[col], test_df_clean[col]], axis=0).unique())
        train_df_clean[col] = le.transform(train_df_clean[col])
        test_df_clean[col] = le.transform(test_df_clean[col])
        label_encoders[col] = le

    # Encode the target variable 'satisfaction'
    train_df_clean['satisfaction'] = train_df_clean['satisfaction'].map({'satisfied': 1, 'dissatisfied': 0})

    # Standardize features like 'Flight Distance', 'Departure Delay' and 'Arrival Delay'
    scaler = StandardScaler()
    numerical_cols = ['Flight Distance', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
    train_df_clean[numerical_cols] = scaler.fit_transform(train_df_clean[numerical_cols])
    test_df_clean[numerical_cols] = scaler.transform(test_df_clean[numerical_cols])

    return train_df_clean, test_df_clean

# XGBoost with Hyperparameter Tuning
def xgboost_model(X, y, X_test, test_df, output_path):
    # Split the data into training and validation sets (80-20 split)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Define XGBoost model
    xgb_model = xgb.XGBClassifier(tree_method='gpu_hist', random_state=42, use_label_encoder=False)

    # Hyperparameter tuning grid
    param_distributions = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.7, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0],
        'gamma': [0, 1, 5]
    }

    # Stratified K-Folds Cross-Validation
    skf = StratifiedKFold(n_splits=5)

    # Randomized Search CV for hyperparameter tuning
    random_search = RandomizedSearchCV(
        estimator=xgb_model,
        param_distributions=param_distributions,
        n_iter=25,
        cv=skf,
        scoring='accuracy',
        n_jobs=-1,
        verbose=2,
        random_state=42
    )

    # Fit the RandomizedSearchCV model
    random_search.fit(X_train, y_train)

    # Best hyperparameters
    print("Best hyperparameters found by RandomizedSearchCV:")
    print(random_search.best_params_)

    # Evaluate on the validation set
    y_val_pred = random_search.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"Validation Accuracy: {val_accuracy}")
    print("\nClassification Report on Validation Set:")
    print(classification_report(y_val, y_val_pred))

    # Make predictions on the test set
    test_predictions = random_search.predict(X_test)

    # Prepare the submission DataFrame
    submission_df = pd.DataFrame({
        'ID': test_df['Id'],  # Re-include 'Id' from the original test dataset
        'satisfaction': test_predictions
    })

    # Convert 'satisfaction' back to original labels ('satisfied' or 'dissatisfied')
    submission_df['satisfaction'] = submission_df['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})

    # Save the submission file
    submission_df.to_csv(output_path, index=False)
    print(f"Submission file saved to: {output_path}")

# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Preprocess the data
train_df_clean, test_df_clean = preprocess_data(train_df, test_df)

# Separate features and target
X = train_df_clean.drop(columns=['satisfaction'])
y = train_df_clean['satisfaction']

# Run XGBoost Model with GPU acceleration
output_path = 'Black-Mesa-Survivors_12_XGBoost_GPU.csv'
xgboost_model(X, y, test_df_clean, test_df, output_path)

Fitting 5 folds for each of 25 candidates, totalling 125 fits



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.



Best hyperparameters found by RandomizedSearchCV:
{'subsample': 0.8, 'n_estimators': 300, 'max_depth': 30, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.7}
Validation Accuracy: 0.9619744214893708

Classification Report on Validation Set:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     10585
           1       0.97      0.96      0.96     12794

    accuracy                           0.96     23379
   macro avg       0.96      0.96      0.96     23379
weighted avg       0.96      0.96      0.96     23379

Submission file saved to: Black-Mesa-Survivors_12_XGBoost_GPU.csv



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




##LightGBM with GPU

In [None]:
# Install the GPU version of LightGBM if necessary
!pip install lightgbm --upgrade

# Import necessary libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report

# Data Preprocessing
def preprocess_data(train_df, test_df):
    # Drop 'Id' column as it is not useful for modeling
    train_df_clean = train_df.drop(columns=['Id'])
    test_df_clean = test_df.drop(columns=['Id'])

    # Handle missing values (Impute 'Arrival Delay in Minutes' with the median)
    imputer = SimpleImputer(strategy='median')
    train_df_clean['Arrival Delay in Minutes'] = imputer.fit_transform(train_df_clean[['Arrival Delay in Minutes']])
    test_df_clean['Arrival Delay in Minutes'] = imputer.transform(test_df_clean[['Arrival Delay in Minutes']])

    # Feature Engineering: Create flight distance categories
    train_df_clean['Flight Distance Category'] = pd.cut(train_df_clean['Flight Distance'],
                                                       bins=[0, 1000, 3000, np.inf],
                                                       labels=['short', 'medium', 'long'])
    test_df_clean['Flight Distance Category'] = pd.cut(test_df_clean['Flight Distance'],
                                                      bins=[0, 1000, 3000, np.inf],
                                                      labels=['short', 'medium', 'long'])

    # Encode categorical variables using LabelEncoder
    categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'Flight Distance Category']

    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        le.fit(pd.concat([train_df_clean[col], test_df_clean[col]], axis=0).unique())
        train_df_clean[col] = le.transform(train_df_clean[col])
        test_df_clean[col] = le.transform(test_df_clean[col])
        label_encoders[col] = le

    # Encode the target variable 'satisfaction'
    train_df_clean['satisfaction'] = train_df_clean['satisfaction'].map({'satisfied': 1, 'dissatisfied': 0})

    # Standardize numeric features
    scaler = StandardScaler()
    numerical_cols = ['Flight Distance', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
    train_df_clean[numerical_cols] = scaler.fit_transform(train_df_clean[numerical_cols])
    test_df_clean[numerical_cols] = scaler.transform(test_df_clean[numerical_cols])

    return train_df_clean, test_df_clean

# LightGBM with GPU support
def lightgbm_gpu_model(X, y, X_test, test_df, output_path):
    # Split the data into training and validation sets (80-20 split)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Define the LightGBM model using GPU acceleration
    lgb_model = lgb.LGBMClassifier(boosting_type='gbdt',
                                   objective='binary',
                                   random_state=42,
                                   device='gpu')  # Specify GPU usage

    # Hyperparameter tuning grid for LightGBM
    param_distributions = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.7, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0],
        'num_leaves': [31, 64, 128]
    }

    # Stratified K-Folds Cross-Validation
    skf = StratifiedKFold(n_splits=5)

    # Randomized Search CV for hyperparameter tuning
    random_search = RandomizedSearchCV(
        estimator=lgb_model,
        param_distributions=param_distributions,
        n_iter=20,  # Adjusted for faster processing
        cv=skf,
        scoring='accuracy',
        n_jobs=-1,
        verbose=2,
        random_state=42
    )

    # Fit the RandomizedSearchCV model
    random_search.fit(X_train, y_train)

    # Best hyperparameters
    print("Best hyperparameters found by RandomizedSearchCV:")
    print(random_search.best_params_)

    # Evaluate on the validation set
    y_val_pred = random_search.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"Validation Accuracy: {val_accuracy}")
    print("\nClassification Report on Validation Set:")
    print(classification_report(y_val, y_val_pred))

    # Make predictions on the test set
    test_predictions = random_search.predict(X_test)

    # Prepare the submission DataFrame
    submission_df = pd.DataFrame({
        'ID': test_df['Id'],  # Re-include 'Id' from the original test dataset
        'satisfaction': test_predictions
    })

    # Convert 'satisfaction' back to original labels ('satisfied' or 'dissatisfied')
    submission_df['satisfaction'] = submission_df['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})

    # Save the submission file
    submission_df.to_csv(output_path, index=False)
    print(f"Submission file saved to: {output_path}")

# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Preprocess the data
train_df_clean, test_df_clean = preprocess_data(train_df, test_df)

# Separate features and target
X = train_df_clean.drop(columns=['satisfaction'])
y = train_df_clean['satisfaction']

# Run LightGBM with GPU acceleration
output_path = 'Black-Mesa-Survivors_12_LightGBM_GPU.csv'
lightgbm_gpu_model(X, y, test_df_clean, test_df, output_path)



Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Fitting 5 folds for each of 20 candidates, totalling 100 fits


ValueError: 
All the 100 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/sklearn.py", line 1284, in fit
    super().fit(
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/sklearn.py", line 955, in fit
    self._Booster = train(
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/engine.py", line 282, in train
    booster = Booster(params=params, train_set=train_set)
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/basic.py", line 3641, in __init__
    _safe_call(
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/basic.py", line 296, in _safe_call
    raise LightGBMError(_LIB.LGBM_GetLastError().decode("utf-8"))
lightgbm.basic.LightGBMError: No OpenCL device found


##Full Implementation of cuML Random Forest Classifier

In [None]:
!nvidia-smi
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Mon Oct  7 15:28:20 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import cudf
import cuml
from cuml.ensemble import RandomForestClassifier as cuRF
from cuml.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# Data Preprocessing Function
def preprocess_data(train_df, test_df):
    # Drop 'Id' column
    train_df_clean = train_df.drop(columns=['Id'])
    test_df_clean = test_df.drop(columns=['Id'])

    # Handle missing values (Impute 'Arrival Delay in Minutes' with the median)
    imputer = SimpleImputer(strategy='median')
    train_df_clean['Arrival Delay in Minutes'] = imputer.fit_transform(train_df_clean[['Arrival Delay in Minutes']])
    test_df_clean['Arrival Delay in Minutes'] = imputer.transform(test_df_clean[['Arrival Delay in Minutes']])

    # Feature Engineering: Create flight distance categories
    train_df_clean['Flight Distance Category'] = pd.cut(train_df_clean['Flight Distance'],
                                                       bins=[0, 1000, 3000, np.inf],
                                                       labels=['short', 'medium', 'long'])
    test_df_clean['Flight Distance Category'] = pd.cut(test_df_clean['Flight Distance'],
                                                      bins=[0, 1000, 3000, np.inf],
                                                      labels=['short', 'medium', 'long'])

    # Encode categorical variables using LabelEncoder
    categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'Flight Distance Category']

    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        le.fit(pd.concat([train_df_clean[col], test_df_clean[col]], axis=0).unique())
        train_df_clean[col] = le.transform(train_df_clean[col])
        test_df_clean[col] = le.transform(test_df_clean[col])
        label_encoders[col] = le

    # Encode the target variable 'satisfaction'
    train_df_clean['satisfaction'] = train_df_clean['satisfaction'].map({'satisfied': 1, 'dissatisfied': 0})

    # Standardize numeric features
    scaler = StandardScaler()
    numerical_cols = ['Flight Distance', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
    train_df_clean[numerical_cols] = scaler.fit_transform(train_df_clean[numerical_cols])
    test_df_clean[numerical_cols] = scaler.transform(test_df_clean[numerical_cols])

    return train_df_clean, test_df_clean

# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Preprocess the data
train_df_clean, test_df_clean = preprocess_data(train_df, test_df)

# Convert pandas dataframes to cuDF dataframes (GPU DataFrames)
X = cudf.DataFrame(train_df_clean.drop(columns=['satisfaction']).astype('float32'))
y = cudf.Series(train_df_clean['satisfaction'])
X_test = cudf.DataFrame(test_df_clean)

# Split the data into training and validation sets (80-20 split) using cuML
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train cuML Random Forest Classifier
cu_rf_model = cuRF(n_estimators=300, max_depth=20, random_state=42)

# Fit the model
cu_rf_model.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = cu_rf_model.predict(X_val)

# Convert cuDF results to numpy for accuracy scoring
y_val_pred = y_val_pred.to_numpy() # Corrected method
y_val = y_val.to_numpy()         # Corrected method

# Evaluate performance on validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy}")
print("\nClassification Report on Validation Set:")
print(classification_report(y_val, y_val_pred))

# Make predictions on the test set
test_predictions = cu_rf_model.predict(X_test)

# Ensure test_predictions is numpy array
test_predictions = test_predictions.to_numpy() # convert to numpy

# Prepare the submission DataFrame
submission_df = pd.DataFrame({
    'ID': test_df['Id'],  # Re-include 'Id' from the original test dataset
    'satisfaction': test_predictions
})

# Convert 'satisfaction' back to original labels ('satisfied' or 'dissatisfied')
submission_df['satisfaction'] = submission_df['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})

# Save the submission file
output_path = 'cuml_rf_submission.csv'
submission_df.to_csv(output_path, index=False)

print(f"Submission file saved to: {output_path}")

  return func(**kwargs)


Validation Accuracy: 0.9558559329283942

Classification Report on Validation Set:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95     10584
           1       0.97      0.95      0.96     12794

    accuracy                           0.96     23378
   macro avg       0.95      0.96      0.96     23378
weighted avg       0.96      0.96      0.96     23378

Submission file saved to: cuml_rf_submission.csv


##Tune-maxxing the Random Forest

In [2]:
!pip install category_encoders
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder

# Define the preprocess_data function
def preprocess_data(train_df, test_df):
    # Separate features and target variable
    X = train_df.drop(['satisfaction', 'Id'], axis=1)
    y = train_df['satisfaction'].map({'satisfied': 1, 'neutral or dissatisfied': 0})
    X_test = test_df.drop('Id', axis=1)

    # Identify numerical and categorical features
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    # Create transformers for numerical and categorical features
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('target_encoder', TargetEncoder())  # Using TargetEncoder
    ])

    # Combine transformers using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Handle missing values in the target variable (y) before fitting
    y = y.fillna(y.mode()[0]) # Fill missing values with the mode

    # Fit and transform the training data
    X = preprocessor.fit_transform(X, y)

    # Transform the test data
    X_test = preprocessor.transform(X_test)

    return X, X_test, y

# Load and preprocess
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
X, X_test, y = preprocess_data(train_df, test_df)  # Use the preprocessed data

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Hyperparameter Tuning with RandomizedSearchCV (More efficient than GridSearchCV)
param_dist = {
    'n_estimators': [100, 200, 300, 500],  # Explore a range
    'max_depth': [None, 10, 20, 30],         # Include None for full depth
    'min_samples_split': [2, 5, 10],        # Experiment with splitting criteria
    'min_samples_leaf': [1, 2, 4],          # Control leaf size
    'max_features': ['sqrt', 'log2', None], # Or a float between 0 and 1
    'bootstrap': [True, False],            # Bagging (with replacement) or Pasting
    'class_weight': [None, 'balanced', 'balanced_subsample'] # Adjust for class imbalance (if any)
}



rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    rf, param_distributions=param_dist, n_iter=50,
    scoring='accuracy', cv=5, n_jobs=-1, verbose=2, random_state=42
)

random_search.fit(X_train, y_train)



# Evaluate and Print Best Parameters
print("Best Hyperparameters:", random_search.best_params_)
best_rf = random_search.best_estimator_


y_pred = best_rf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")
print(classification_report(y_val, y_pred))


# Train best model on full training data (if needed for final prediction on the test set). If not needed, skip and use random search for test predictions.

best_rf.fit(X, y) # Fit on the full training set


# Make predictions on the test set
test_predictions = best_rf.predict(X_test)

# Prepare submission
submission_df = pd.DataFrame({
    'ID': test_df['Id'],
    'satisfaction': test_predictions
})
submission_df['satisfaction'] = submission_df['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})
submission_df.to_csv('random_forest_tuned_submission.csv', index=False)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Hyperparameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10, 'class_weight': 'balanced_subsample', 'bootstrap': False}
Validation Accuracy: 1.0
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00     23379

    accuracy                           1.00     23379
   macro avg       1.00      1.00      1.00     23379
weighted avg       1.00      1.00      1.00     23379



##Stratified K-Fold Cross-Validation

In [1]:
!pip install category_encoders
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
import warnings # Importing warnings library to ignore the warning in future.
warnings.filterwarnings("ignore") # Code to ignore warnings in future.

# Preprocessing function (same as before)
def preprocess_data(train_df, test_df):
    # Separate features and target variable
    X = train_df.drop(['satisfaction', 'Id'], axis=1)
    y = train_df['satisfaction'].map({'satisfied': 1, 'neutral or dissatisfied': 0})
    X_test = test_df.drop('Id', axis=1)

    # Identify numerical and categorical features
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    # Create transformers for numerical and categorical features
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('target_encoder', TargetEncoder())  # Using TargetEncoder
    ])

    # Combine transformers using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Handle missing values in the target variable (y) before fitting
    y = y.fillna(y.mode()[0]) # Fill missing values with the mode

    # Fit and transform the training data
    X = preprocessor.fit_transform(X, y)

    # Transform the test data
    X_test = preprocessor.transform(X_test)

    return X, X_test, y


# Load and preprocess
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
X, X_test, y = preprocess_data(train_df, test_df)

# Stratified K-Fold Cross-Validation
n_splits = 5  # Number of folds (adjust as needed)
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Store results for each fold
cv_scores = []
cv_classification_reports = []
test_predictions = [] # Initialize test predictions with size equal to no. of folds.

for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Hyperparameter Tuning with RandomizedSearchCV inside each fold
    param_dist = {
    'n_estimators': [100, 200, 300, 500],  # Explore a range
    'max_depth': [None, 10, 20, 30],         # Include None for full depth
    'min_samples_split': [2, 5, 10],        # Experiment with splitting criteria
    'min_samples_leaf': [1, 2, 4],          # Control leaf size
    'max_features': ['sqrt', 'log2', None], # Or a float between 0 and 1
    'bootstrap': [True, False],            # Bagging (with replacement) or Pasting
    'class_weight': [None, 'balanced', 'balanced_subsample']} # Adjust for class imbalance (if any)
# Hyperparameter grid for Random Forest

    rf = RandomForestClassifier(random_state=42)
    random_search = RandomizedSearchCV(
        rf, param_distributions=param_dist, n_iter=50,  # Increase n_iter for wider search
        scoring='accuracy', cv=5, n_jobs=-1, verbose=2, random_state=42
    )  # Inner cross-validation within each fold

    random_search.fit(X_train, y_train)

    best_rf = random_search.best_estimator_


    # Evaluate on validation fold
    y_pred = best_rf.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    classification_rep = classification_report(y_val, y_pred)
    cv_scores.append(accuracy)
    cv_classification_reports.append(classification_rep)
    test_predictions.append(best_rf.predict_proba(X_test))

    print(f"Fold {fold+1} Accuracy: {accuracy}")
    print(f"Fold {fold+1} Classification Report:\n{classification_rep}\n")

# Summarize CV performance
print("Average cross-validation accuracy:", np.mean(cv_scores))
print("Standard deviation of cross-validation accuracy:", np.std(cv_scores))


# Create a submission based on the average of multiple models.
averaged_predictions = np.mean(test_predictions, axis = 0)
submission_df = pd.DataFrame({
    'ID': test_df['Id'],
    'satisfaction': (averaged_predictions[:, 1] > 0.5).astype(int) # Access second column (index 1) for probability of class 1.
})

submission_df['satisfaction'] = submission_df['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})

submission_df.to_csv("averaged_model_submission.csv", index=False)

Collecting category_encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.4
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fold 1 Accuracy: 1.0
Fold 1 Classification Report:
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00     23379

    accuracy                           1.00     23379
   macro avg       1.00      1.00      1.00     23379
weighted avg       1.00      1.00      1.00     23379


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fold 2 Accuracy: 1.0
Fold 2 Classification Report:
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00     23379


KeyboardInterrupt: 