# Data Preprocessing Stage
- Cleaning data with missing values
- Handling Outliers
- Scaling data
- Feature Encoding
- Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training set. The location lives in a single named constant so it is easy to repoint.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not run on another machine as-is.
TRAIN_CSV_PATH = '/home/xaris/Desktop/Projects/Introvert & Extrovert/Introvert-vs-Extrovert/Datasets/train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)
df.head(5)

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [3]:
# The 'id' column does not give any further information, so drop it
df = df.drop(columns=['id'])

In [4]:
# Sanity check: (rows, columns) of the frame after dropping 'id'
df.shape

(18524, 8)

In [5]:
# Group the column names by their character (numerical / categorical / target)
numerical = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
]
categorical = [
    'Stage_fear',
    'Drained_after_socializing',
]
target = ['Personality']

In [6]:
# Count the missing values in each column
df.isna().sum()

Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64

# Clean missing data

In [7]:
# Imputing numerical missing values with mean value per target group
def imp_num(columns, frame=None, group_col='Personality'):
    """Fill missing values in numeric columns with the mean of each row's group, in place.

    Parameters
    ----------
    columns : iterable of str
        Names of the numeric columns to impute.
    frame : pandas.DataFrame, optional
        Frame to modify in place. When omitted, the notebook-level ``df`` is
        used, so existing ``imp_num(numerical)`` calls behave as before.
    group_col : str, default 'Personality'
        Column whose groups define the means used as fill values.

    Returns
    -------
    None
        The frame is modified in place; nothing is returned, so calling this
        as the last statement of a cell produces no output.

    Notes
    -----
    NOTE(review): with the default ``group_col`` the fill values are derived
    from the target column. That cannot be reproduced on data where the target
    is unknown, and it feeds target information into the features.
    """
    if frame is None:
        frame = df  # fall back to the notebook's global training frame

    for col in columns:
        # transform('mean') returns a Series aligned with `frame`, holding each row's group mean
        group_mean = frame.groupby(group_col)[col].transform('mean')
        frame[col] = frame[col].fillna(group_mean)  # or i could just do this with SimpleImputer
        
# Run the imputation over every numerical column
imp_num(columns=numerical)

In [8]:
# Imputing categorical missing values with the most frequent value (mode) of each column.
# NOTE(review): the imputer is fitted on the whole frame, so this is the overall column mode,
# not a per-target-group mode like the numerical imputation uses.
from sklearn.impute import SimpleImputer
categorical_imputer = SimpleImputer(strategy='most_frequent')
df[categorical] = categorical_imputer.fit_transform(df[categorical])

# Handle outliers

In [9]:
# Cap outliers in the continuous columns at the 1.5 * IQR fences
for column in numerical:
    # Both quartiles in one call; unpacking the result yields (25th, 75th) percentile values
    Q1, Q3 = df[column].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    # clip() pulls anything outside [lower, upper] back onto the nearest bound
    df[column] = df[column].clip(lower=lower, upper=upper)

# Encode and standard-scale data

In [10]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Encode Target variable: Extrovert -> 1, Introvert -> 0.
# The explicit cast pins an integer dtype. Recent pandas releases deprecate the silent
# object -> int downcast that `replace` used to perform, which would otherwise leave the
# labels in an object column. Re-running the cell stays a no-op on already-encoded labels.
df['Personality'] = df['Personality'].replace({'Extrovert':1, 'Introvert':0}).astype('int64')

# Preprocessing: one-hot encode the categorical features and standard-scale the continuous ones
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

# Each entry is (step name, transformer, columns the transformer is applied to)
preprocessing_steps = [
    ("OneHotEncoder", oh_transformer, categorical),
    ("StandardScaler", numeric_transformer, numerical),
]

# Preprocessor
preprocessor = ColumnTransformer(transformers=preprocessing_steps)

In [12]:
# Drop the target column, then fit the preprocessor on the remaining features and transform them.
# X ends up holding the encoded + scaled feature matrix.
X = preprocessor.fit_transform(df.drop(columns=['Personality']))

In [16]:
# Check the dimensions of the transformed matrix: (rows, encoded + scaled feature columns)
X.shape

(18524, 9)

In [18]:
# Preview the transformed feature matrix
X

array([[ 1.        ,  0.        ,  1.        , ..., -0.00996788,
         1.68830295,  0.02078329],
       [ 1.        ,  0.        ,  1.        , ..., -0.5022836 ,
         0.48789312,  1.07656622],
       [ 0.        ,  1.        ,  1.        , ..., -1.97923076,
        -1.19268064, -1.73885492],
       ...,
       [ 0.        ,  1.        ,  0.        , ..., -1.48691504,
        -1.67284457, -1.17177926],
       [ 0.        ,  1.        ,  0.        , ..., -1.97923076,
        -0.71251671, -1.03499964],
       [ 1.        ,  0.        ,  1.        , ...,  0.97466355,
        -0.95259868,  0.72463858]], shape=(18524, 9))

# Feature Engineering