# 2. Preprocessing

In this notebook, we preprocess the data to prepare it for the model.

In [1]:
import sys
# Put the parent directory on the import path; the next cell imports from the
# `src` package, which presumably lives there — confirm against the repo layout.
sys.path.append('../')

In [2]:
from pathlib import Path

import pandas as pd

from src.preprocessing import create_preprocessing_pipeline, train_val_split

## Loading data

In [3]:
# Read the raw loan table and discard the Loan_ID column in the same expression.
df = pd.read_csv('../data/raw/loan-data.csv').drop(columns="Loan_ID")

## Preprocessing

In [4]:
# List the columns that remain after Loan_ID was dropped.
df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [5]:
# Feature groups handed to create_preprocessing_pipeline later in the notebook.
num_features = [
    'Dependents',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]
cat_features = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Credit_History',
]

# Echo both groups so the selection is visible in the cell output.
for feature_group in (num_features, cat_features):
    print(feature_group)

['Dependents', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Credit_History']


**Converting "Dependents" feature to numeric**

The number of dependents will be converted to a numeric value. 
"3+" will be mapped to 3. This way we avoid one-hot encoding while preserving information on the order of the values.

In [6]:
# Show the raw category counts before recoding.
display(df['Dependents'].value_counts())

# Map the open-ended '3+' bucket to 3.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `df.Dependents`: that chained in-place form acts
# on an intermediate Series, raises a FutureWarning on pandas 2.2+, and leaves
# `df` unchanged once Copy-on-Write is active.
# NOTE(review): the other values are presumably still the strings read from the
# CSV ('0', '1', '2') — confirm the pipeline casts this column to a number.
df['Dependents'] = df['Dependents'].replace('3+', 3)

Dependents
0     345
1     102
2     101
3+     51
Name: count, dtype: int64

**Creating the preprocessing pipeline**

In [7]:
# Build the preprocessing pipeline from the two feature lists defined above.
# The helper comes from src.preprocessing; its steps are not shown in this notebook.
preprocessor = create_preprocessing_pipeline(num_features, cat_features)

**Encoding target variable**

In [8]:
# Encode the target as integers (0 = No, 1 = Yes):
# 1 where Loan_Status is 'Y', 0 for any other value.
is_yes = df['Loan_Status'].eq('Y')
df['Loan_Status'] = is_yes.astype('int64')

**Invoking the preprocessing pipeline**

In [9]:
# Fit the pipeline on the whole frame and rebind `df` to the transformed result.
# NOTE(review): this fit runs before the train/validation split further down, so
# whatever the pipeline learns from the data is also learned from the validation
# rows — consider splitting first and fitting on the training part only.
# NOTE(review): because `df` is overwritten, this cell cannot be re-run on its own;
# re-run the notebook from the loading cell instead.
df = preprocessor.fit_transform(df)
# Display the transformed frame.
df

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Credit_History_1.0,Loan_Status
0,-0.827104,0.544331,-1.102837,-0.149985,0.175540,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
1,0.854259,0.170974,0.750578,-0.019602,0.175540,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0
2,-0.827104,-0.499955,-1.102837,-1.335521,0.175540,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1
3,-0.827104,-0.743873,0.891686,-0.149985,0.175540,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1
4,-0.827104,0.582817,-1.102837,0.176671,0.175540,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,-0.827104,-0.554885,-1.102837,-1.192913,0.175540,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
610,1.522278,-0.000290,-1.102837,-2.295908,-2.312879,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1
611,0.854259,1.023487,0.208603,1.384403,0.175540,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1
612,1.316702,0.931750,-1.102837,0.755379,0.175540,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1


**Split into train and validation sets**

In [10]:
# Split the transformed frame; val_size=0.15 is presumably the validation fraction.
# NOTE(review): no seed is passed here — confirm train_val_split fixes one
# internally so the split is reproducible.
train_df, val_df = train_val_split(df, val_size=0.15)

**Checking for nulls**

In [12]:
# Count missing values per column of the full transformed frame
# (the frame the train/validation split was taken from).
df.isna().sum()

Dependents                 0
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Gender_Male                0
Married_Yes                0
Education_Not Graduate     0
Self_Employed_Yes          0
Property_Area_Rural        0
Property_Area_Semiurban    0
Property_Area_Urban        0
Credit_History_1.0         0
Loan_Status                0
dtype: int64

## Save the preprocessed data

In [11]:
# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists first (for example on a fresh checkout).
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the written files.
train_df.to_csv(processed_dir / 'train.csv', index=False)
val_df.to_csv(processed_dir / 'val.csv', index=False)