<a href="https://www.kaggle.com/code/marcelocruzeta/machine-learning-experiment?scriptVersionId=248533112" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# Import necessary libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations
from sklearn.experimental import enable_iterative_imputer  # Needed to use IterativeImputer
from sklearn.impute import IterativeImputer  # For iterative imputation of missing values
from sklearn.model_selection import train_test_split  # For splitting the dataset into training and validation sets
from sklearn.ensemble import RandomForestClassifier  # Importing the Random Forest classifier for model training
from sklearn.metrics import classification_report, accuracy_score  # For evaluating model performance metrics
from catboost import CatBoostClassifier  # Importing the CatBoost classifier for gradient boosting on decision trees
import xgboost as xgb  # Importing the XGBoost classifier
from collections import Counter
from tabulate import tabulate  # For displaying the DataFrame with borders

In [2]:
def preprocess(df, imputer=None, fit=True):
    """Encode the two yes/no columns and impute missing numeric features.

    The input frame is copied first, so the caller's DataFrame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Stage_fear', 'Drained_after_socializing' and the five
        numeric columns listed in ``num_cols`` below.
    imputer : object, optional
        An imputer exposing ``fit_transform`` and ``transform``. When omitted,
        a new ``IterativeImputer(random_state=42)`` is created and fitted on
        ``df`` (the original behaviour). Pass the same instance for the
        training and test frames so both are imputed with statistics learned
        from the training data only.
    fit : bool, default True
        If True the imputer is fitted on ``df`` before transforming
        (``fit_transform``). If False an already fitted ``imputer`` is applied
        with ``transform`` only.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the two categorical columns mapped to 1 / 0 / -1
        (Yes / No / missing) and the numeric columns imputed.

    Raises
    ------
    ValueError
        If ``fit`` is False but no ``imputer`` was supplied.
    """
    # Work on a copy so the caller's frame stays intact
    df = df.copy()

    # Missing answers get their own code (-1) instead of being imputed.
    # Any value other than 'Yes' / 'No' / missing maps to NaN.
    yes_no_codes = {'Yes': 1, 'No': 0, 'Unknown': -1}
    for col in ('Stage_fear', 'Drained_after_socializing'):
        df[col] = df[col].fillna('Unknown').map(yes_no_codes)

    # Numeric columns whose gaps are filled by the imputer
    num_cols = ['Time_spent_Alone', 'Social_event_attendance',
                'Going_outside', 'Friends_circle_size',
                'Post_frequency']

    if imputer is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted imputer")
        imputer = IterativeImputer(random_state=42)

    # fit=True learns the imputation model from this frame; fit=False reuses
    # what the supplied imputer learned earlier (e.g. on the training data).
    if fit:
        df[num_cols] = imputer.fit_transform(df[num_cols])
    else:
        df[num_cols] = imputer.transform(df[num_cols])

    return df

# Read the competition's train and test files from the Kaggle input directory
train = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e7/test.csv')

# Training features: every column except the row id and the label
X = preprocess(train.drop(columns=['id', 'Personality']))

# Target encoded as Introvert -> 0, Extrovert -> 1
y = train['Personality'].map({'Introvert': 0, 'Extrovert': 1})

# Test features: every column except the row id.
# NOTE: preprocess() is called without an imputer here, so it fits a new one
# per call; the test frame is imputed by an imputer fitted on the test frame.
X_test = preprocess(test.drop(columns='id'))

# Quick sanity check: first rows of each preprocessed object
for preview in (X, y, X_test):
    print(preview.head())

   Time_spent_Alone  Stage_fear  Social_event_attendance  Going_outside  \
0               0.0           0                      6.0            4.0   
1               1.0           0                      7.0            3.0   
2               6.0           1                      1.0            0.0   
3               3.0           0                      7.0            3.0   
4               1.0           0                      4.0            4.0   

   Drained_after_socializing  Friends_circle_size  Post_frequency  
0                          0                 15.0        5.000000  
1                          0                 10.0        8.000000  
2                         -1                  3.0        0.000000  
3                          0                 11.0        5.000000  
4                          0                 13.0        5.664129  
0    1
1    1
2    0
3    1
4    1
Name: Personality, dtype: int64
   Time_spent_Alone  Stage_fear  Social_event_attendance  Going_outside  \

In [3]:
# Hold out 20% of the training rows for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a random forest with default hyper-parameters on the training split
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Report accuracy and per-class precision/recall on the validation split
y_pred = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

# Predict the test set once (the original cell ran this identical call twice)
test_predictions = model.predict(X_test)

# Decode 0/1 back to the original labels (0 -> Introvert, anything else -> Extrovert)
test_predictions_labels = np.where(test_predictions == 0, 'Introvert', 'Extrovert')

# Pair each test id with its predicted label
results = pd.DataFrame({'id': test['id'], 'Predicted_Personality': test_predictions_labels})

# Persist the predictions; the file is written to the current working directory
results.to_csv('result_r_forest.csv', index=False)


Accuracy: 0.9670715249662618
              precision    recall  f1-score   support

           0       0.95      0.92      0.94       952
           1       0.97      0.98      0.98      2753

    accuracy                           0.97      3705
   macro avg       0.96      0.95      0.96      3705
weighted avg       0.97      0.97      0.97      3705



In [4]:
# CatBoost gradient boosting: 500 trees of depth 6, training log silenced
catboost_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    random_state=42,
    verbose=0,
)
catboost_model.fit(X_train, y_train)

# Validation-split performance
y_pred = catboost_model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

# Test-set predictions, decoded to labels (0 -> Introvert, anything else -> Extrovert)
catboost_predictions = catboost_model.predict(X_test)
catboost_predictions_labels = np.where(
    catboost_predictions == 0, 'Introvert', 'Extrovert'
)

# One row per test id with the predicted label, saved to the working directory
catboost_results = pd.DataFrame(
    {
        'id': test['id'],
        'Predicted_Personality': catboost_predictions_labels,
    }
)
catboost_results.to_csv('result_c_boost.csv', index=False)

Accuracy: 0.9673414304993252
              precision    recall  f1-score   support

           0       0.94      0.93      0.94       952
           1       0.98      0.98      0.98      2753

    accuracy                           0.97      3705
   macro avg       0.96      0.95      0.96      3705
weighted avg       0.97      0.97      0.97      3705



In [5]:
# XGBoost classifier evaluated with log-loss, seeded for reproducibility
xgboost_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgboost_model.fit(X_train, y_train)

# Validation-split performance
y_pred = xgboost_model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

# Test-set predictions, decoded to labels (0 -> Introvert, anything else -> Extrovert)
xgboost_predictions = xgboost_model.predict(X_test)
xgboost_predictions_labels = np.where(
    xgboost_predictions == 0, 'Introvert', 'Extrovert'
)

# One row per test id with the predicted label, saved to the working directory
xgboost_results = pd.DataFrame(
    {
        'id': test['id'],
        'Predicted_Personality': xgboost_predictions_labels,
    }
)
xgboost_results.to_csv('result_xg_boost.csv', index=False)

Accuracy: 0.9665317139001349
              precision    recall  f1-score   support

           0       0.95      0.92      0.93       952
           1       0.97      0.98      0.98      2753

    accuracy                           0.97      3705
   macro avg       0.96      0.95      0.96      3705
weighted avg       0.97      0.97      0.97      3705



In [6]:
def _load_predictions(csv_path, model_tag):
    """Read one saved prediction file, cast 'id' to str and rename the
    'Predicted_Personality' column to ``model_tag``."""
    frame = pd.read_csv(csv_path)
    frame['id'] = frame['id'].astype(str)
    return frame.rename(columns={'Predicted_Personality': model_tag})


# Per-model prediction files written by the earlier cells
rf_results = _load_predictions('/kaggle/working/result_r_forest.csv', 'RF')
catboost_results = _load_predictions('/kaggle/working/result_c_boost.csv', 'CB')
xgboost_results = _load_predictions('/kaggle/working/result_xg_boost.csv', 'XGB')

# Raw test features, with 'id' as str so it matches the prediction frames
test_data = pd.read_csv('/kaggle/input/playground-series-s5e7/test.csv')
test_data['id'] = test_data['id'].astype(str)

# One row per id with the three model predictions side by side
comparison_df = (
    rf_results
    .merge(catboost_results, on='id')
    .merge(xgboost_results, on='id')
)

# Majority voting across the three model columns
def majority_vote(row):
    """Return the label that occurs most often among row['RF'], row['CB'] and row['XGB']."""
    tally = Counter(row[model] for model in ('RF', 'CB', 'XGB'))
    (winner, _votes), = tally.most_common(1)
    return winner

# Ensemble label for every row, decided by majority_vote
comparison_df['Final_Prediction'] = comparison_df.apply(majority_vote, axis=1)

# A row diverges when at least one model disagrees with the ensemble label
model_columns = ['RF', 'CB', 'XGB']
comparison_df['Divergence'] = (
    comparison_df[model_columns]
    .ne(comparison_df['Final_Prediction'], axis=0)
    .any(axis=1)
)

# Keep only the rows where the models disagree
divergent_predictions_df = comparison_df[comparison_df['Divergence']]

# Original feature names and the abbreviations used in the printed table
feature_columns = ['Time_spent_Alone', 'Stage_fear',
                   'Social_event_attendance', 'Going_outside',
                   'Drained_after_socializing', 'Friends_circle_size',
                   'Post_frequency']
feature_abbreviations = ['TSA', 'SF', 'SEA', 'GO', 'DAS', 'FCS', 'PF']

# Raw test features under abbreviated names, plus the str 'id' used as merge key
features_df = (
    test_data[feature_columns]
    .rename(columns=dict(zip(feature_columns, feature_abbreviations)))
    .assign(id=test_data['id'].astype(str))
)

# Attach the raw features to each divergent row
divergent_predictions_df = divergent_predictions_df.merge(features_df, on='id', how='left')

# Print the divergent rows as a bordered grid table
display_columns = ['id', *model_columns, *feature_abbreviations]
print("\nDivergent Predictions (with Feature Columns):")
print(tabulate(divergent_predictions_df[display_columns], headers='keys', tablefmt='grid', showindex=False))


Divergent Predictions (with Feature Columns):
+-------+-----------+-----------+-----------+-------+------+-------+------+-------+-------+------+
|    id | RF        | CB        | XGB       |   TSA | SF   |   SEA |   GO | DAS   |   FCS |   PF |
| 18753 | Introvert | Extrovert | Extrovert |   nan | No   |     9 |    6 | No    |     6 |    3 |
+-------+-----------+-----------+-----------+-------+------+-------+------+-------+-------+------+
| 18876 | Extrovert | Extrovert | Introvert |    10 | No   |   nan |    6 | No    |    10 |    7 |
+-------+-----------+-----------+-----------+-------+------+-------+------+-------+-------+------+
| 18973 | Introvert | Introvert | Extrovert |     9 | Yes  |     2 |    0 | Yes   |   nan |    2 |
+-------+-----------+-----------+-----------+-------+------+-------+------+-------+-------+------+
| 18977 | Extrovert | Introvert | Extrovert |     5 | No   |     9 |    7 | No    |    11 |    3 |
+-------+-----------+-----------+-----------+-------+------+--