In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib


In [2]:
def load_data(fname):
    """
    Read a CSV file into a DataFrame and report its dimensions.

    Parameters:
    -----------
    fname : str
        Location of the CSV file to read.

    Returns:
    --------
    pandas.DataFrame
        The parsed contents of the file.
    """
    frame = pd.read_csv(fname)
    # Echo (rows, columns) so the notebook output documents what was loaded.
    dimensions = frame.shape
    print("Data Shape:", dimensions)
    return frame


def split_input_output(data, target_col):
    """
    Separate a dataset into its feature columns and its target column.

    The input DataFrame is left untouched; both results are new objects.
    The shapes of the input and of both results are printed.

    Parameters:
    -----------
    data : pandas.DataFrame
        Complete dataset holding the features together with the target.
    target_col : str
        Name of the column that holds the target variable.

    Returns:
    --------
    X : pandas.DataFrame
        Every column of `data` except `target_col`.
    Y : pandas.DataFrame
        A one-column DataFrame containing only `target_col`.
    """
    target_only = [target_col]
    features = data.drop(columns=target_only)
    # List-based selection keeps the target two-dimensional (DataFrame, not Series).
    target = data[target_only]

    shape_report = (
        ('ORIGINAL Data Shape:', data),
        ('X Data Shape:', features),
        ('Y Data Shape:', target),
    )
    for caption, frame in shape_report:
        print(caption, frame.shape)

    return features, target


def split_train_test(X, y, test_size, random_state=None):
    """
    Split a dataset into a training part and a testing part, stratified on the target.

    The shapes of the four resulting pieces are printed.

    Parameters:
    -----------
    X : pandas.DataFrame
        Feature data (independent variables) to be divided.

    y : pandas.Series or pandas.DataFrame
        Target variable (dependent variable); also used as the stratification key.

    test_size : float
        Proportion of the data assigned to the test part, between 0 and 1.

    random_state : int or None, optional (default=None)
        Seed that makes the split reproducible.

    Returns:
    --------
    X_train : pandas.DataFrame
        Features for training.

    X_test : pandas.DataFrame
        Features for testing.

    y_train : pandas.Series or pandas.DataFrame
        Target values for training.

    y_test : pandas.Series or pandas.DataFrame
        Target values for testing.
    """
    split_options = {
        "test_size": test_size,
        "random_state": random_state,
        # Stratify on the target itself so both parts are sampled per class.
        "stratify": y,
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

    shape_report = (
        ("X train shape:", X_train),
        ("X test shape:", X_test),
        ("y train shape:", y_train),
        ("y test shape:", y_test),
    )
    for caption, part in shape_report:
        print(caption, part.shape)

    return X_train, X_test, y_train, y_test


def serialize_data(data, path):
    """
    Persist a Python object to disk with joblib.

    Parameters:
    ----------
    data : Any
        Object to store (for example a model or a dataset).
    path : str
        Destination file path for the serialized object.

    Returns:
    -------
    None
        Nothing is returned; the only effect is the file written at `path`.
    """
    # joblib.dump returns the list of files it wrote; that value is intentionally discarded.
    _ = joblib.dump(data, path)



def deserialize_data(path):
    """
    Restore a Python object previously saved with joblib.

    Parameters:
    ----------
    path : str
        File path of the serialized object.

    Returns:
    -------
    data : Any
        The object reconstructed from the file.

    Notes:
    -----
    joblib.load is pickle-based, so loading a file can execute arbitrary code.
    Only call this on files from a trusted source.
    """
    return joblib.load(path)



In [3]:
# Raw credit-risk dataset; the path is relative to the notebook's working directory.
FNAME = "data/raw/credit_risk_dataset.csv"
data = load_data(fname=FNAME)
# Preview the first five rows as the cell's rich output.
data.head(n=5)

Data Shape: (32581, 12)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [4]:
# Column to predict; every other column is kept as an input feature.
TARGET_COL = "loan_status"
X, y = split_input_output(data=data, target_col=TARGET_COL)

ORIGINAL Data Shape: (32581, 12)
X Data Shape: (32581, 11)
Y Data Shape: (32581, 1)


In [5]:
# Stage 1: hold out 20% of the rows; the remaining 80% becomes the training set.
test_size = 0.2
random_state = 42
X_train, X_non_train, y_train, y_non_train = split_train_test(
    X, y, test_size=test_size, random_state=random_state
)

# Stage 2: cut the held-out rows in half to obtain the validation and test sets.
# The helper labels its printed shapes "train"/"test"; for this call they refer to valid/test.
test_size = 0.5
random_state = 42
X_valid, X_test, y_valid, y_test = split_train_test(
    X_non_train, y_non_train, test_size=test_size, random_state=random_state
)

X train shape: (26064, 11)
X test shape: (6517, 11)
y train shape: (26064, 1)
y test shape: (6517, 1)
X train shape: (3258, 11)
X test shape: (3259, 11)
y train shape: (3258, 1)
y test shape: (3259, 1)


In [6]:
# Persist every split to the current working directory so later steps can reload them.
# NOTE(review): file-name casing is mixed ("X_train.pkl" vs "x_test.pkl" / "x_valid.pkl").
# On a case-sensitive filesystem a loader must use these exact names; confirm with
# downstream consumers before normalising them.
serialize_data(data=X_train, path="X_train.pkl")
serialize_data(data=y_train, path="y_train.pkl")
serialize_data(data=X_test, path="x_test.pkl")
serialize_data(data=y_test, path="y_test.pkl")
serialize_data(data=X_valid, path="x_valid.pkl")
serialize_data(data=y_valid, path="y_valid.pkl")