In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
# Resolve the compute device once: GPU when CUDA is usable, otherwise CPU
cuda_ok = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ok else "cpu")

# Report the environment so the run can be reproduced later
print("Torch version:", torch.__version__)
print("CUDA available:", cuda_ok)
print("Device:", device)
print(f"Using device: {device}")


Torch version: 2.5.1+cu121
CUDA available: True
Device: cuda
Using device: cuda


**Data Cleaning & Modeling Pipeline Plan**

1. Load and inspect data

2. Handle missing values

3. Feature engineering

4. Encode target

5. Train/test split

6. Baseline model

7. Evaluate and iterate



In [2]:
# Step 1: Load and inspect the data
import pandas as pd

# Read both competition splits from disk
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

# Shapes plus a column-level summary of the training frame
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print("\nTrain info:")
train.info()

# Per-column NaN counts, restricted to the columns that actually have gaps
na_counts = train.isna().sum()
print("\nMissing values in train:")
print(na_counts[na_counts > 0].sort_values(ascending=False))

# Class balance of the target; dropna=False keeps NaN labels visible as a bucket
print("\nTarget value counts:")
print(train['Personality'].value_counts(dropna=False))



Train shape: (18524, 9)
Test shape: (6175, 8)

Train info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB

Missing values in train:
Stage_fear                   1893
Going_outside                1466
Post_frequency               1264
Time_spent_Alone           

**Dataset Summary**
**Shapes:**

    train: 18,524 rows, 9 columns

    test: 6,175 rows, 8 columns (no target)

|Column|Missing|Type|
|--|--|--|
|`Stage_fear`|1,893|object|
|`Going_outside`|1,466|float|
|`Post_frequency`|1,264|float|
|`Time_spent_Alone`|1,190|float|
|`Social_event_attendance`|1,180|float|
|`Drained_after_socializing`|1,149|object|
|`Friends_circle_size`|1,054|float|

**Target distribution:**

 -   Extrovert: 13,699 (~74%)

 -   Introvert: 4,825 (~26%)

 -  **Imbalanced target**, something I'll need to handle during training

**Next Steps** (**Step 2** Plan: Clean the Data)

I'll handle missing values carefully based on our EDA findings:

1. Numerical Columns (float):

    - Impute using correlated features, KNN Imputer.

    - Use linear correlation-based fill when there's a strong relationship

    - These include: Time_spent_Alone, Social_event_attendance, Going_outside, Friends_circle_size, Post_frequency

2. Categorical Columns (object):

    - For Stage_fear and Drained_after_socializing, I observed they correlate with missingness in numeric fields

    - So I can fill them from related categorical/numeric values (like Going_outside, Post_frequency), e.g. with a grouped mode

3. Outlier handling (optional but worth flagging for later — I might revisit this during model tuning).

4. Encode categorical variables:

    - Stage_fear, Drained_after_socializing, and Personality (target)

5. Create a binary `<column>_missing` flag for every column that has missing values (set before imputing)

    - Always a good idea as it gives models a shot at capturing patterns related to why data was missing.

6. Save cleaned dataset for training reuse.


In [3]:
# Columns that contain NaNs; each one gets a companion 0/1 indicator column
missing_cols = [
    'Time_spent_Alone', 'Stage_fear', 'Social_event_attendance',
    'Going_outside', 'Drained_after_socializing',
    'Friends_circle_size', 'Post_frequency',
]

# Record missingness now, while the NaNs are still present in both frames
for frame in (train, test):
    for col in missing_cols:
        frame[f'{col}_missing'] = frame[col].isna().astype(int)


**Step 3: Fill Missing Values**

- Numeric Columns -> KNN Imputer

- Categorical Columns -> Logistic regression fit on the rows where the value is known


**Define Groups of Correlated Features**

In [4]:

# Feature groupings that follow the correlation structure of the data
group_social_behavior = [
    'Time_spent_Alone',
    'Going_outside',
    'Drained_after_socializing',
    'Stage_fear',
]
group_social_networking = [
    'Friends_circle_size',
    'Post_frequency',
]


**KNN Imputer for Numerical Columns**

In [5]:
from sklearn.impute import KNNImputer

# Stack train and test so the imputer sees both; the outer index level
# ('train' / 'test') lets the two frames be separated again afterwards.
# Note: concat aligns on the union of columns, so the test rows carry a
# 'Personality' column that is entirely NaN from this point on.
combined = pd.concat([train, test], keys=['train', 'test'])

# Only the numeric columns go through KNN; the Yes/No columns are handled later
knn_impute_cols = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside',
                   'Friends_circle_size', 'Post_frequency']

knn_imputer = KNNImputer(n_neighbors=5)

# Replace the NaNs in the selected columns with KNN estimates
combined[knn_impute_cols] = knn_imputer.fit_transform(combined[knn_impute_cols])

# Split back. .copy() detaches each frame from `combined`, so the column
# assignments made in later cells act on independent frames instead of on
# slices of `combined` (which is what triggers SettingWithCopyWarning).
train = combined.xs('train').copy()
test = combined.xs('test').copy()


**Predict Categorical Columns (Binary Classification)**

But first, before we forget again: convert the Yes/No answers to 1/0 so these columns can serve as model targets and predictors.

In [6]:
# Encode the Yes/No answers as 1/0; anything else (including NaN) stays/becomes NaN
binary_map = {'Yes': 1, 'No': 0}
for frame in (train, test):
    for col in ['Stage_fear', 'Drained_after_socializing']:
        # Series.map with a dict yields NaN for every value that is not a key
        frame.loc[:, col] = frame[col].map(binary_map)


I'll predict missing values in `Stage_fear` and `Drained_after_socializing` by using logistic regression trained only on the rows with complete values:

In [None]:
from sklearn.linear_model import LogisticRegression

def impute_categorical(train_df, test_df, column, predictors):
    """Fill the missing entries of a binary column, in place, via logistic regression.

    A ``LogisticRegression(max_iter=1000)`` is fit on the rows of ``train_df``
    where ``column`` is known (target cast to int, features taken from
    ``predictors``). Its predictions then overwrite the null entries of
    ``column`` in both ``train_df`` and ``test_df``.

    Parameters
    ----------
    train_df : DataFrame
        Frame used for fitting; its null entries in ``column`` are filled.
    test_df : DataFrame
        Frame whose null entries in ``column`` are filled with the same model.
    column : str
        Name of the column to impute.
    predictors : list of str
        Feature columns passed to the model; they must not contain NaN in the
        rows that are used for fitting or prediction.

    Returns
    -------
    None
        Both frames are modified in place. When neither frame has a missing
        value in ``column`` the function returns without fitting anything.
    """
    # Compute each missingness mask once, before anything is overwritten
    train_missing = train_df[column].isnull()
    test_missing = test_df[column].isnull()

    # Nothing to impute anywhere: skip the model fit entirely
    if not train_missing.any() and not test_missing.any():
        return

    # Fit only on training rows where the target is known
    known = train_df.loc[~train_missing]
    model = LogisticRegression(max_iter=1000)
    model.fit(known[predictors], known[column].astype(int))

    # predict() rejects zero-row input, so only call it for frames that have gaps
    for frame, missing in ((train_df, train_missing), (test_df, test_missing)):
        if missing.any():
            frame.loc[missing, column] = model.predict(frame.loc[missing, predictors])

# Predictor columns for each target, chosen from the correlation analysis
predictors_stage_fear = ['Time_spent_Alone', 'Going_outside', 'Social_event_attendance']
predictors_drained = ['Time_spent_Alone', 'Stage_fear', 'Going_outside']

# Stage_fear goes first: it is itself a predictor for Drained_after_socializing
for target_col, predictor_cols in (
    ('Stage_fear', predictors_stage_fear),
    ('Drained_after_socializing', predictors_drained),
):
    impute_categorical(train, test, target_col, predictor_cols)


Let's take a moment to validate our janitorial efforts.

In [8]:
# 1. Per-column NaN counts for both splits
train_na = train.isna().sum()
test_na = test.isna().sum()
print("Missing values in train:\n", train_na)
print("\nMissing values in test:\n", test_na)

# 2. Distinct values left in the encoded columns and in the target
stage_fear_values = train['Stage_fear'].unique()
drained_values = train['Drained_after_socializing'].unique()
personality_values = train['Personality'].unique()
print("\nStage_fear unique:", stage_fear_values)
print("Drained_after_socializing unique:", drained_values)
print("Personality unique:", personality_values)

# 3. Class balance of the target
print("\nTarget value counts:")
print(train['Personality'].value_counts())

# 4. First rows of the training frame
print("\nTrain sample:")
print(train.head())

# Column names, non-null counts and dtypes
train.info()


Missing values in train:
 id                                   0
Time_spent_Alone                     0
Stage_fear                           0
Social_event_attendance              0
Going_outside                        0
Drained_after_socializing            0
Friends_circle_size                  0
Post_frequency                       0
Personality                          0
Time_spent_Alone_missing             0
Stage_fear_missing                   0
Social_event_attendance_missing      0
Going_outside_missing                0
Drained_after_socializing_missing    0
Friends_circle_size_missing          0
Post_frequency_missing               0
dtype: int64

Missing values in test:
 id                                      0
Time_spent_Alone                        0
Stage_fear                              0
Social_event_attendance                 0
Going_outside                           0
Drained_after_socializing               0
Friends_circle_size                     0
Post_frequency   

- spotless

---

Fix label formatting:

In [None]:
binary_cols = ['Drained_after_socializing', 'Stage_fear']
label_map = {'Introvert': 0, 'Extrovert': 1}

# astype returns fresh frames, so both names end up bound to independent copies
# with the two binary columns stored as integers
train = train.astype({col: int for col in binary_cols})
test = test.astype({col: int for col in binary_cols})

# Encode the target labels as 0/1
train['Personality'] = train['Personality'].map(label_map)
# NOTE(review): if `test` was split out of the joint train+test concat, its
# 'Personality' column holds only NaN and stays NaN after this map —
# confirm whether the column should exist in the test frame at all.
test['Personality'] = test['Personality'].map(label_map)

# Column names, non-null counts and dtypes after the conversion
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18524 entries, 0 to 18523
Data columns (total 16 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   id                                 18524 non-null  int64  
 1   Time_spent_Alone                   18524 non-null  float64
 2   Stage_fear                         18524 non-null  int64  
 3   Social_event_attendance            18524 non-null  float64
 4   Going_outside                      18524 non-null  float64
 5   Drained_after_socializing          18524 non-null  int64  
 6   Friends_circle_size                18524 non-null  float64
 7   Post_frequency                     18524 non-null  float64
 8   Personality                        18524 non-null  int64  
 9   Time_spent_Alone_missing           18524 non-null  int64  
 10  Stage_fear_missing                 18524 non-null  int64  
 11  Social_event_attendance_missing    18524 non-null  int64  


In [None]:
# Persist both cleaned frames next to the raw data, without the row index
for frame, path in (
    (train, "../data/cleaned_train.csv"),
    (test, "../data/cleaned_test.csv"),
):
    frame.to_csv(path, index=False)


---

Split the Training Set

- `X_train`, `X_valid`

- `y_train`, `y_valid`

For model evaluation before final test predictions.

In [10]:
# Features: every column except the target and the row identifier
# (the id is only kept around for joining predictions back afterwards)
X = train.drop(['Personality', 'id'], axis=1)
# Target: the encoded personality label
y = train.loc[:, 'Personality']

In [11]:
# Continuous features that get standardised before modelling
num_cols = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
]

In [12]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous features on a copy so that X itself stays untouched.
# NOTE(review): the scaler is fit on all of X, i.e. before the train/validation
# split, so validation rows contribute to the mean/std used for scaling.
X_scaled = X.copy()
scaler = StandardScaler()
scaler.fit(X[num_cols])
X_scaled[num_cols] = scaler.transform(X[num_cols])


In [13]:
# Interaction terms: a scaled numeric feature multiplied by a 0/1 indicator
X_scaled['Alone_x_Fear'] = X_scaled['Time_spent_Alone'].mul(X_scaled['Stage_fear'])
X_scaled['Social_x_Drained'] = X_scaled['Social_event_attendance'].mul(
    X_scaled['Drained_after_socializing']
)


In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% for validation; stratifying on y keeps the class ratio in both parts
split_parts = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Baseline: a 100-tree random forest with a fixed seed, fit on the training part
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Hard class predictions and the predicted probability of class 1
y_pred = clf.predict(X_val)
val_proba = clf.predict_proba(X_val)
y_proba = val_proba[:, 1]

# Per-class precision/recall/F1 plus ROC AUC on the validation part
val_report = classification_report(y_val, y_pred)
val_auc = roc_auc_score(y_val, y_proba)
print(val_report)
print("AUC:", val_auc)


              precision    recall  f1-score   support

           0       0.94      0.92      0.93       965
           1       0.97      0.98      0.98      2740

    accuracy                           0.96      3705
   macro avg       0.96      0.95      0.95      3705
weighted avg       0.96      0.96      0.96      3705

AUC: 0.9547861276048561


Holy crap! ~0.95 AUC (96% accuracy) on the validation split with an untuned random forest.