## Data Preparation Task (Mentoring 1 - ML Process)
---

In [49]:
import pandas as pd

### Load the Dataset

In [51]:
# Function named load_data
def load_data(fname):
    """
    Import a dataset from a local file into a pandas DataFrame, so it can
    later be used to train a Machine Learning model.

    The reader is chosen from the file extension, compared case-insensitively:
    a name ending in .xlsx is read with pd.read_excel and a name ending in
    .csv is read with pd.read_csv (comma separated). The shape of the loaded
    data is printed before it is returned.

    Parameter:
    - fname (string or path-like) = The filename that we want to import and
      transform into a dataframe.

    Return:
    - data (Dataframe) = a pandas Dataframe built from the imported file.

    Raise:
    - ValueError = when the filename ends with neither .xlsx nor .csv.

    Candra Kurniawan | Pacmann AI 2024.

    """
    # Normalise once: str() lets pathlib.Path inputs through, lower() makes
    # 'DATA.CSV' behave like 'data.csv'. The original fname is still what gets
    # handed to pandas.
    normalized_name = str(fname).lower()

    # Excel workbook
    if normalized_name.endswith('.xlsx'):
        data = pd.read_excel(fname)

    # Comma-separated text file
    elif normalized_name.endswith('.csv'):
        data = pd.read_csv(fname, sep=',')

    # Any other extension is rejected. The message now says the format is
    # unsupported (it used to claim the file was not found, which is a
    # different failure and is reported by pandas itself).
    else:
        raise ValueError(f"Format file tidak didukung: {fname} harus berbentuk .xlsx atau .csv")

    print(f"Data Shape: {data.shape}")

    return data

In [52]:
# Location of the raw credit-risk dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run as-is on a machine that has this exact directory layout.
FNAME = '/Users/bytedance/CANDRA_MLPROCESS/data/raw/credit_risk_dataset.csv'

# Load the raw dataset into a DataFrame
data = load_data(FNAME)

Data Shape: (32581, 12)


In [53]:
# Preview the first rows of the loaded data.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was written into the CSV; confirm whether it should be
# dropped, since it is carried into X as a feature further down.
data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


### Split into input and output Data

In [54]:
def split_input_output(data, target_col):
    """
    Separate a dataset into its input features (X) and its target (y).

    Parameter : -> has two parameters
                1. data (pd.DataFrame) = the full dataset
                2. target_col (column label) = the column to use as the target

    Returning : -> X = `data` without the `target_col` column
                -> y = the `target_col` column of `data`

    The shapes of the original data, X and y are printed as a quick check.
    `data` itself is not modified.

    """
    # Everything except the target column is an input feature
    X = data.drop(columns=target_col)
    y = data[target_col]

    # Report the shapes before and after the split
    for label, obj in (("Original data", data), ("X data", X), ("y data", y)):
        print(f"{label} shape: {obj.shape}")

    return X, y


In [55]:
# Column that holds the label to predict
TARGET_COL = 'loan_status'

# Separate the features (X) from the target (y)
X, y = split_input_output(data, TARGET_COL)


Original data shape: (32581, 12)
X data shape: (32581, 11)
y data shape: (32581,)


### Split into Train and Test Data

In [56]:
# Import train-test splitting library dari sklearn (scikit learn)
from sklearn.model_selection import train_test_split

def split_train_test(X, y, test_size, random_state):
    """
    Split the input and output data into two parts with sklearn's
    train_test_split, stratified on y.

    One call produces a single two-way split; call the function again on one
    of the parts to carve out a further (e.g. validation) set.

    Parameter : -> Has four Parameters:
                1. X = the input data
                2. y = the output data; also passed as `stratify`, which is
                       meant to keep an imbalanced target represented in
                       both parts
                3. test_size = forwarded to train_test_split as `test_size`
                4. random_state = forwarded to train_test_split as `random_state`

    Returning : X_train -> data input as (X) for the first part
                X_test -> data input as (X) for the second part
                y_train -> data output as (y) for the first part
                y_test -> data output as (y) for the second part
    """
    # Gather the split settings in one place; stratify is always the target itself
    split_options = {
        "test_size": test_size,
        "random_state": random_state,
        "stratify": y,
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

    return X_train, X_test, y_train, y_test

In [57]:
# Split the dataset in two steps:
#   1. hold out 20% of the rows as "not train" (the rest is the training set)
#   2. cut that hold-out in half into a validation set and a test set
# The same random_state (42) is used for both steps so the result is repeatable.

# First, split the train & not train
X_train, X_not_train, y_train, y_not_train = split_train_test(X, y, 0.2, 42)

# Then, split the valid & test
X_valid, X_test, y_valid, y_test = split_train_test(X_not_train, y_not_train, 0.5, 42)

print(f"X train shape: {X_train.shape}")
print(f"y train shape: {y_train.shape}")
# These two lines report the intermediate hold-out, not the final test set,
# so they are labelled "not train" to avoid two different "test" shapes.
print(f"X not train shape: {X_not_train.shape}")
print(f"y not train shape: {y_not_train.shape}")
print('')
print(f"X valid shape: {X_valid.shape}")
print(f"y valid shape: {y_valid.shape}")
print(f"X test shape: {X_test.shape}")
print(f"y test shape: {y_test.shape}")

X train shape: (26064, 11)
y train shape: (26064,)
X test shape: (6517, 11)
y test shape: (6517,)

X valid shape: (3258, 11)
y valid shape: (3258,)
X test shape: (3259, 11)
y test shape: (3259,)


### Serialize Data

In [59]:
import joblib

def serialize_data(data, path):
    """Serialize (pickle) a Python object to a file with joblib.

    Used in this notebook to export the dataset splits as .pkl files.

    Parameter : -> Has two Parameters:
                1. data = the object to serialize (e.g. a dataset or a model)
                2. path = the target path of the file to write, e.g. 'X_train.pkl'

    Returning : the value returned by joblib.dump(data, path)

    """
    # joblib.dump writes `data` to `path`; its return value is passed straight back
    return joblib.dump(data, path)

In [60]:
# Dump every split into its own .pkl file (relative paths, so they are
# written to the current working directory)

# Training split
serialize_X_train = serialize_data(data=X_train, path='X_train.pkl')
serialize_y_train = serialize_data(data=y_train, path='y_train.pkl')

# Test split
serialize_x_test = serialize_data(data=X_test, path='X_test.pkl')
serialize_y_test = serialize_data(data=y_test, path='y_test.pkl')

# Validation split
serialize_x_valid = serialize_data(data=X_valid, path='X_valid.pkl')
serialize_y_valid = serialize_data(data=y_valid, path='y_valid.pkl')

In [36]:
# Another way to save: export every split into a specific directory
import os

# Base directory path
# NOTE(review): absolute, machine-specific path; the directory must already
# exist on the machine running this cell.
base_path = '/Users/bytedance/CANDRA_MLPROCESS/src/'

# Data objects, keyed by the file stem each one is exported under
data_objects = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
    'X_valid': X_valid,
    'y_valid': y_valid
}

# Filenames for each dataset, derived from the keys above so the list and the
# dict cannot drift apart (a mismatch used to surface as a KeyError in the loop)
filenames = [f"{key}.pkl" for key in data_objects]

# Loop through the data objects and serialize each one
for data_key, data_object in data_objects.items():
    name = f"{data_key}.pkl"              # File name for this object
    path = os.path.join(base_path, name)  # Full path, independent of a trailing slash
    serialize_data(data=data_object, path=path)
    print(f"{name} exported.")            # Print confirmation message

X_train.pkl exported.
y_train.pkl exported.
X_test.pkl exported.
y_test.pkl exported.
X_valid.pkl exported.
y_valid.pkl exported.


### Deserialize Data (Unpickling)

In [61]:
def deserialize_data(path):
    """Deserialize (unpickle) a file written with joblib back into a Python object.

    The loaded object can be whatever was serialized, e.g. a dataset or an
    ML model.

    Parameter : -> Has one Parameter:
                1. path = the path of the .pkl file to read

    Returning : the Python object loaded from `path`

    """
    # Load the object stored at `path`.
    # Unpickling can execute arbitrary code, so only load files you trust.
    return joblib.load(path)

In [62]:
# Load every exported split back from its .pkl file into a variable

# Training split
deserialize_X_train = deserialize_data('X_train.pkl')
deserialize_y_train = deserialize_data('y_train.pkl')

# Test split
deserialize_X_test = deserialize_data('X_test.pkl')
deserialize_y_test = deserialize_data('y_test.pkl')

# Validation split
deserialize_X_valid = deserialize_data('X_valid.pkl')
deserialize_y_valid = deserialize_data('y_valid.pkl')

In [63]:
# Sanity check after unpickling: preview the first rows of the training
# features that were loaded back from 'X_train.pkl'
deserialize_X_train.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
15884,25,241875,MORTGAGE,4.0,EDUCATION,A,16000,7.05,0.07,N,4
15138,21,18000,RENT,5.0,PERSONAL,B,1500,12.18,0.08,N,4
7474,25,53000,MORTGAGE,10.0,MEDICAL,B,16000,12.53,0.3,N,2
18212,28,16800,OWN,,MEDICAL,C,5000,13.98,0.3,N,8
6493,25,50000,MORTGAGE,2.0,VENTURE,A,10000,7.9,0.2,N,2
