<a href="https://colab.research.google.com/github/AthoyeMahmud/DS_1101/blob/main/Team_Black_Mesa_Survivors_Fly_High_With_FDS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Load, Explore and Summerize

In [1]:
import pandas as pd

# Load the train, test, and sample submission datasets
train_path = '/content/drive/MyDrive/Code/DS_1101/Fly High With FDS/train.csv'
test_path = '/content/drive/MyDrive/Code/DS_1101/Fly High With FDS/test.csv'
submission_path = '/content/drive/MyDrive/Code/DS_1101/Fly High With FDS/sample_submission.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission_df = pd.read_csv(submission_path)

# Display basic information about the datasets
train_info = train_df.info()
test_info = test_df.info()

# Display first few rows of the train and test data
train_head = train_df.head()
test_head = test_df.head()

# Display the first few rows of the sample submission file
sample_submission_head = sample_submission_df.head()

train_info, train_head, test_info, test_head, sample_submission_head

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116892 entries, 0 to 116891
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Id                                 116892 non-null  int64  
 1   satisfaction                       116892 non-null  object 
 2   Gender                             116892 non-null  object 
 3   Customer Type                      116892 non-null  object 
 4   Age                                116892 non-null  int64  
 5   Type of Travel                     116892 non-null  object 
 6   Class                              116892 non-null  object 
 7   Flight Distance                    116892 non-null  int64  
 8   Seat comfort                       116892 non-null  int64  
 9   Departure/Arrival time convenient  116892 non-null  int64  
 10  Food and drink                     116892 non-null  int64  
 11  Gate location                      1168

(None,
        Id  satisfaction  Gender   Customer Type  Age   Type of Travel  \
 0   86347     satisfied    Male  Loyal Customer   50  Business travel   
 1  115822     satisfied    Male  Loyal Customer   51  Business travel   
 2   16351  dissatisfied    Male  Loyal Customer   14  Personal Travel   
 3  107284     satisfied  Female  Loyal Customer   52  Business travel   
 4    5788  dissatisfied  Female  Loyal Customer   26  Personal Travel   
 
       Class  Flight Distance  Seat comfort  Departure/Arrival time convenient  \
 0  Business             1548             5                                  5   
 1  Business             4020             5                                  5   
 2       Eco             2328             2                                  5   
 3  Business             3761             2                                  2   
 4       Eco             3348             1                                  5   
 
    ...  Online support  Ease of Online booking  On-b

##Preprocessing and Modeling

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Dropping the ID column (not useful for modeling) from both train and test
train_df_clean = train_df.drop(columns=['Id'])
test_df_clean = test_df.drop(columns=['Id'])

# Impute missing values in the 'Arrival Delay in Minutes' with median
imputer = SimpleImputer(strategy='median')
train_df_clean['Arrival Delay in Minutes'] = imputer.fit_transform(train_df_clean[['Arrival Delay in Minutes']])
test_df_clean['Arrival Delay in Minutes'] = imputer.transform(test_df_clean[['Arrival Delay in Minutes']])

# Encode categorical variables using LabelEncoder
categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class']

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_df_clean[col] = le.fit_transform(train_df_clean[col])
    test_df_clean[col] = le.transform(test_df_clean[col])
    label_encoders[col] = le

# Encode the target variable 'satisfaction'
train_df_clean['satisfaction'] = train_df_clean['satisfaction'].map({'satisfied': 1, 'dissatisfied': 0})

# Separate features and target variable
X = train_df_clean.drop(columns=['satisfaction'])
y = train_df_clean['satisfaction']

# Split the training data into training and validation sets (80-20 split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train.head(), y_train.head()

(       Gender  Customer Type  Age  Type of Travel  Class  Flight Distance  \
 94821       1              1   31               0      0             2702   
 83138       1              0   49               0      0             2301   
 7527        1              0   24               1      1             1215   
 15564       0              0   53               0      1              238   
 42362       1              0   26               0      0             3123   
 
        Seat comfort  Departure/Arrival time convenient  Food and drink  \
 94821             2                                  2               2   
 83138             2                                  2               2   
 7527              4                                  5               4   
 15564             5                                  2               2   
 42362             4                                  4               2   
 
        Gate location  ...  Online support  Ease of Online booking  \
 94821  

##Basic Random Forest Classifier Model Evaluation

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = rf_model.predict(X_val)

# Evaluate the model performance
val_accuracy = accuracy_score(y_val, y_val_pred)
val_classification_report = classification_report(y_val, y_val_pred)

val_accuracy, val_classification_report

(0.9586380940159973,
 '              precision    recall  f1-score   support\n\n           0       0.94      0.97      0.95     10585\n           1       0.97      0.95      0.96     12794\n\n    accuracy                           0.96     23379\n   macro avg       0.96      0.96      0.96     23379\nweighted avg       0.96      0.96      0.96     23379\n')

##Prediction and Submission File Prep

In [5]:
# Make predictions on the test dataset
test_predictions = rf_model.predict(test_df_clean)

# Prepare the submission file
submission_df = pd.DataFrame({
    'ID': test_df['Id'],
    'satisfaction': test_predictions
})

# Convert satisfaction values back to original labels ('satisfied', 'dissatisfied')
submission_df['satisfaction'] = submission_df['satisfaction'].map({1: 'satisfied', 0: 'dissatisfied'})

# Save the submission file
submission_file_path = '/content/drive/MyDrive/Code/DS_1101/Fly High With FDS/sample_submission.csv'
submission_df.to_csv(submission_file_path, index=False)

submission_df.head(), submission_file_path

(       ID  satisfaction
 0   46587  dissatisfied
 1  124920     satisfied
 2   18490     satisfied
 3   78644  dissatisfied
 4   92713     satisfied,
 '/content/drive/MyDrive/Code/DS_1101/Fly High With FDS/sample_submission.csv')