# IMPORTING LIBS

In [35]:
# Standard libraries
import pickle

# Data manipulation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn modules
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


# IMPORTING DATASE

In [36]:
# Train Set
try : 
    df = pd.read_csv('train.csv')
except :
    df = pd.read_csv('../data/raw/train.csv')
# Test Set
try : 
    test_df = pd.read_csv('train.csv')
except :
    test_df = pd.read_csv('../data/raw/test.csv')

In [37]:
print(df.head())
test_df.head()

   id  Time_spent_Alone Stage_fear  Social_event_attendance  Going_outside  \
0   0               0.0         No                      6.0            4.0   
1   1               1.0         No                      7.0            3.0   
2   2               6.0        Yes                      1.0            0.0   
3   3               3.0         No                      7.0            3.0   
4   4               1.0         No                      4.0            4.0   

  Drained_after_socializing  Friends_circle_size  Post_frequency Personality  
0                        No                 15.0             5.0   Extrovert  
1                        No                 10.0             8.0   Extrovert  
2                       NaN                  3.0             0.0   Introvert  
3                        No                 11.0             5.0   Extrovert  
4                        No                 13.0             NaN   Extrovert  


Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,18524,3.0,No,7.0,4.0,No,6.0,
1,18525,,Yes,0.0,0.0,Yes,5.0,1.0
2,18526,3.0,No,5.0,6.0,No,15.0,9.0
3,18527,3.0,No,4.0,4.0,No,5.0,6.0
4,18528,9.0,Yes,1.0,2.0,Yes,1.0,1.0


In [38]:
print("Train nan Values")
print(df.isna().sum())
print("Test nan Values")
print(test_df.isna().sum())

Train nan Values
id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64
Test nan Values
id                             0
Time_spent_Alone             425
Stage_fear                   598
Social_event_attendance      397
Going_outside                466
Drained_after_socializing    432
Friends_circle_size          350
Post_frequency               408
dtype: int64


In [39]:
# Drop ID column
df.drop(columns='id', inplace=True)
test_ids = test_df['id']
test_df.drop(columns='id', inplace=True)

In [40]:
# Separate features and target
target_col = 'Personality'
y = df[target_col]
X = df.drop(columns=target_col)

In [41]:
# Identify column types ===
cat_features = X.select_dtypes(include='object').columns.tolist()
num_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [42]:
# Encode categorical features with LabelEncoder ===
label_encoders = {}
for col in cat_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    label_encoders[col] = le

In [43]:
# Encode the target column too
le_personality = LabelEncoder()
df['Personality'] = le_personality.fit_transform(df['Personality'])
label_encoders['Personality'] = le_personality

In [44]:
# Train-test split (for imputer and scaler) ===
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [45]:
# Fit scaler only on training data ===
scaler = StandardScaler()
X_train[num_features] = scaler.fit_transform(X_train[num_features])
X_val[num_features] = scaler.transform(X_val[num_features])
test_df[num_features] = scaler.transform(test_df[num_features])

In [46]:
# Apply KNN Imputer (fit only on train)
imputer = KNNImputer(n_neighbors=5)
X_train_imputed = imputer.fit_transform(X_train)
X_val_imputed = imputer.transform(X_val)
test_imputed = imputer.transform(test_df)

In [47]:
# Convert back to DataFrames
X_train_imputed = pd.DataFrame(X_train_imputed, columns=X.columns, index=X_train.index)
X_val_imputed = pd.DataFrame(X_val_imputed, columns=X.columns, index=X_val.index)
test_imputed = pd.DataFrame(test_imputed, columns=test_df.columns, index=test_df.index)

In [48]:
# Save features list to CSV ===
pd.DataFrame({
    'num_features': pd.Series(num_features),
    'cat_features': pd.Series(cat_features)
}).to_csv('feature_parameters.csv', index=False)

In [49]:
# Save preprocessors (scaler, label_encoders, imputer) ===
with open('../model/data_cleaning/preprocessing_pipeline.pkl', 'wb') as f:
    pickle.dump({
        'scaler': scaler,
        'label_encoders': label_encoders,  # includes target now
        'imputer': imputer,
        'num_features': num_features,
        'cat_features': cat_features
    }, f)

In [50]:
# Re-attach target column to training and validation sets
train_processed = X_train_imputed.copy()
train_processed['Personality'] = y_train

val_processed = X_val_imputed.copy()
val_processed['Personality'] = y_val

# Re-attach ID column to test set
test_processed = test_imputed.copy()
test_processed['id'] = test_ids.values

# Save to CSV
train_processed.to_csv('../data/cleaned/train_processed.csv', index=False)
val_processed.to_csv('../data/cleaned/val_processed.csv', index=False)
test_processed.to_csv('../data/cleaned/test_processed.csv', index=False)


In [51]:
train_processed.head()

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
12652,1.62755,1.0,-1.915285,-0.508178,1.0,-1.419492,-1.384518,Introvert
5914,-0.0431,0.0,0.627765,1.434921,0.0,0.712747,-0.340069,Extrovert
5169,-1.045491,2.0,0.627765,0.949147,0.0,0.949663,0.35623,Extrovert
1501,1.62755,2.0,-1.188699,-1.479727,1.0,-1.182576,-1.036368,Introvert
4788,-0.0431,0.0,0.627765,0.463372,0.0,0.712747,1.748828,Extrovert


In [52]:
test_processed.head()

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,id
0,-0.0431,0.0,0.627765,-0.022403,0.0,-0.47183,-0.409699,18524
1,1.761202,1.0,-1.915285,-1.965502,1.0,-0.708745,-1.384518,18525
2,-0.0431,0.0,-0.098821,0.949147,0.0,1.660409,1.400678,18526
3,-0.0431,0.0,-0.462113,-0.022403,0.0,-0.708745,0.35623,18527
4,1.96168,1.0,-1.551992,-0.993952,1.0,-1.656407,-1.384518,18528
