# Data Preprocessing Stage
- Cleaning data with missing values
- Handling Outliers
- Scaling data
- Feature Encoding


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# import the training data
# NOTE: this is an absolute path on the author's machine; point it at your local copy of train.csv
df = pd.read_csv('/home/xaris/Desktop/Projects/Introvert & Extrovert/Introvert-vs-Extrovert/Datasets/train.csv')
# Preview the first rows of the raw data
df.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [None]:
# Wrap the preprocessing steps in a function so it can be reused in the next stage
def preprocess(df):
    """Return a copy of ``df`` without the ``id`` column.

    The input frame is not modified. Previously the function rebound its
    local ``df`` and returned nothing, so callers always received ``None``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data that contains an ``id`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with every column of ``df`` except ``id``.

    Raises
    ------
    KeyError
        If ``df`` has no ``id`` column.
    """
    # 'id' is only a row identifier and gives no further information, so drop it
    return df.drop('id', axis=1)

In [None]:
# The 'id' column is only a row identifier and adds no information, so remove it
df = df.drop(columns='id')

In [None]:
# Dimensions (rows, columns) of the frame after dropping 'id'
df.shape

(18524, 8)

In [None]:
# Group the column names by their character: continuous features,
# categorical features, and the prediction target
numerical = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
]
categorical = [
    'Stage_fear',
    'Drained_after_socializing',
]
target = ['Personality']

In [None]:
# Count the missing values in each column
df.isnull().sum()

Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64

# Clean missing data

In [7]:
import pandas as pd

class DataPreprocessingPipeline:
    """Preprocess the personality data: drop ``id``, impute, clip outliers, encode target.

    The pipeline works on a private copy of the frame given to the
    constructor, so the caller's DataFrame is never modified. Every step
    updates ``self.df`` and returns it, so steps can be run one at a time
    or all together through ``run_pipeline``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data holding the feature columns and the target column.
    numerical : list of str
        Names of the continuous feature columns.
    categorical : list of str
        Names of the categorical feature columns.
    target : str, default 'Personality'
        Name of the target column; also the grouping key for imputation.
    """

    def __init__(self, df, numerical, categorical, target='Personality'):
        self.df = df.copy()
        self.numerical = numerical
        self.categorical = categorical
        self.target = target

    @staticmethod
    def _first_mode(values):
        """Return the most frequent non-null value of ``values`` (None if there is none)."""
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else None

    def drop_id(self):
        """Remove the ``id`` column if it is present and return the frame."""
        if 'id' in self.df.columns:
            self.df.drop('id', axis=1, inplace=True)
        return self.df

    def impute_numerical(self):
        """Fill missing numerical values with the mean of the row's target group."""
        for col in self.numerical:
            self.df[col] = self.df[col].fillna(
                self.df.groupby(self.target)[col].transform('mean')
            )
        return self.df

    def impute_categorical(self):
        """Fill missing categorical values with the mode of the row's target group.

        ``transform('mode')`` is not an accepted transform name in pandas and
        raises ``ValueError``, so the per-group mode is computed with a
        callable instead. When a group has several modes the first one
        returned by ``Series.mode`` is used; a group with no observed value
        leaves its gaps unfilled.
        """
        for col in self.categorical:
            self.df[col] = self.df[col].fillna(
                self.df.groupby(self.target)[col].transform(self._first_mode)
            )
        return self.df

    def handle_outliers(self):
        """Clip every numerical column to ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``."""
        for column in self.numerical:
            Q1 = self.df[column].quantile(0.25)
            Q3 = self.df[column].quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR
            # clip replaces values outside the fences with the nearest fence
            self.df[column] = self.df[column].clip(lower=lower, upper=upper)
        return self.df

    def encode_target(self):
        """Encode the target column: ``'Extrovert'`` -> 1, ``'Introvert'`` -> 0."""
        self.df[self.target] = self.df[self.target].replace({'Extrovert': 1, 'Introvert': 0})
        return self.df

    def run_pipeline(self):
        """Run all preprocessing steps in order and return the processed frame.

        Order: drop ``id``, impute numerical, impute categorical, clip
        outliers, encode the target. Imputation runs before target encoding
        because both imputers group on the target column.
        """
        self.drop_id()
        self.impute_numerical()
        # Previously skipped, which left NaNs in the categorical columns
        self.impute_categorical()
        self.handle_outliers()
        self.encode_target()
        return self.df

In [None]:
# Imputing numerical missing values with mean value per target group
def imp_num(df, columns):
    """Return a copy of ``df`` with missing values in ``columns`` filled by group mean.

    Each missing value is replaced by the mean of that column over the rows
    sharing the same ``Personality`` value. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``columns`` and a ``Personality`` column.
    columns : list of str
        Names of the numerical columns to impute.

    Returns
    -------
    pandas.DataFrame
        The imputed frame, without the ``id`` column.
    """
    # 'id' gives no further information; errors='ignore' keeps this safe when
    # 'id' has already been dropped earlier in the notebook. drop() also
    # returns a new frame, so the caller's data is left untouched.
    df = df.drop(columns='id', errors='ignore')
    for col in columns:
        df[col] = df[col].fillna(df.groupby('Personality')[col].transform('mean')) # or i could just do this with SimpleImputer
    return df


# Apply this function and keep the imputed frame
df = imp_num(df, numerical)

In [None]:
# Imputing categorical missing values with the most frequent value of each column.
# NOTE: the imputer is fitted on df[categorical] only, so the mode is taken over the
# whole column, not per target group.
from sklearn.impute import SimpleImputer
categorical_imputer = SimpleImputer(strategy='most_frequent')
df[categorical] = categorical_imputer.fit_transform(df[categorical])

# Handle outliers

In [None]:
# Cap the continuous features at 1.5 * IQR beyond the first and third quartiles
for column in numerical:
    Q1, Q3 = df[column].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    # clip limits the values to the range between lower and upper
    df[column] = df[column].clip(lower, upper)

# Encode and standard Scale data

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Encode Target variable: Extrovert -> 1, Introvert -> 0
df['Personality'] = df['Personality'].replace({'Extrovert':1, 'Introvert':0})

# Create the transformers: a standard scaler for continuous data and a one-hot encoder for categorical features
# NOTE(review): numeric_transformer is created here but never added to the ColumnTransformer below,
# so the numerical columns pass through unscaled - confirm whether scaling is meant to happen in this stage.
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

# Preprocessor: one-hot encodes the categorical columns only
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, categorical),    
    ],
    remainder = 'passthrough' # keep any columns not listed
)

In [None]:
# Separate the features (X) from the target (y)
X = df.drop('Personality',axis=1)
y = df['Personality']

# Fit the preprocessor on the features and transform them
X_transformed = preprocessor.fit_transform(X)

In [None]:
# Get encoded categorical feature names
ohe_feature_names = preprocessor.named_transformers_['OneHotEncoder'].get_feature_names_out(categorical)

# Combine them with numeric features
# NOTE(review): assumes the transformed output lists the one-hot columns first, followed by the
# passthrough columns in the same order as `numerical` - confirm against the ColumnTransformer output order.
final_feature_names = list(ohe_feature_names) + numerical

# Convert to DataFrame
X_transformed = pd.DataFrame(X_transformed, columns=final_feature_names)

In [None]:
# Re-attach the target column; y's index is reset before the column-wise concat with X_transformed
df_processed = pd.concat([X_transformed, y.reset_index(drop=True)], axis=1)


In [None]:
# Dimensions (rows, columns) of the processed frame
df_processed.shape

(18524, 10)

In [None]:
# Preview the first rows of the processed frame
df_processed.head()

Unnamed: 0,Stage_fear_No,Stage_fear_Yes,Drained_after_socializing_No,Drained_after_socializing_Yes,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency,Personality
0,1.0,0.0,1.0,0.0,0.0,6.0,4.0,15.0,5.0,1
1,1.0,0.0,1.0,0.0,1.0,7.0,3.0,10.0,8.0,1
2,0.0,1.0,1.0,0.0,6.0,1.0,0.0,3.0,0.0,0
3,1.0,0.0,1.0,0.0,3.0,7.0,3.0,11.0,5.0,1
4,1.0,0.0,1.0,0.0,1.0,4.0,4.0,13.0,6.113682,1
