# PRACTICE: DATA PROCESSING FOR PREDICTING LOAN STATUS PROBLEM #

## STEP 1: IMPORT LIBRARIES ##

In [77]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

## STEP 2: DATASET INTRODUCTION

### REAL WORLD PROBLEM: Predicting loan status

### Problem Description

Based on information from students such as gender, education, loan amount to predict his/her loan status

### Objective

Build a Machine Learning model to predict a student's loan status

## STEP 3: LOAD AND READ DATA

### STEP 3.1. Load dataset

In [78]:
data = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')
# Print first 5 rows of the dataset
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### STEP 3.2. Check dataset size

In [79]:
print("Information of the dataset:")
print('-' * 50)
print(f"Number of examples: {data.shape[0]}")
print(f"Number of features: {data.shape[1]}")
print(f"Total number of cells: {data.shape[0] * data.shape[1]}")
print(f"Dataset shape: {data.shape}")

Information of the dataset:
--------------------------------------------------
Number of examples: 614
Number of features: 13
Total number of cells: 7982
Dataset shape: (614, 13)


## STEP 4: DATA EXPLORATION

### STEP 4.1. Data overview

In [80]:
# Data Overview 
print("DETAILED DATA INFORMATION")
print("=" * 50)

# Show DataFrame info
data.info()

# Explanation of results
print("\nEXPLANATION:")
print("• Non-Null Count: Number of non-missing values")
print("• Dtype: Data type (object = text, int64 = integer, float64 = float)")
print("• Memory usage: Memory used by the DataFrame")


DETAILED DATA INFORMATION
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB

EXPLANATION:
• Non-Null Count: Number of non-missing values
• Dtype: Data type (objec

### STEP 4.2. Descriptive statistics for numeric data

In [81]:
# Descriptive statistics for numeric columns 
print("DESCRIPTIVE STATISTICS FOR NUMERIC COLUMNS")
print("=" * 50)

# Compute descriptive statistics
numeric_stats = data.describe()
print(numeric_stats)

# Observations
print("\nObservations of some numeric features:")
print("• ApplicantIncome: range from 150 to 81000, mean ~5400, high variance ~6100")
print("• CoapplicantIncome: range from 0 to ~42000, mean ~1600")

DESCRIPTIVE STATISTICS FOR NUMERIC COLUMNS
       ApplicantIncome  CoapplicantIncome  ...  Loan_Amount_Term  Credit_History
count       614.000000         614.000000  ...         600.00000      564.000000
mean       5403.459283        1621.245798  ...         342.00000        0.842199
std        6109.041673        2926.248369  ...          65.12041        0.364878
min         150.000000           0.000000  ...          12.00000        0.000000
25%        2877.500000           0.000000  ...         360.00000        1.000000
50%        3812.500000        1188.500000  ...         360.00000        1.000000
75%        5795.000000        2297.250000  ...         360.00000        1.000000
max       81000.000000       41667.000000  ...         480.00000        1.000000

[8 rows x 5 columns]

Observations of some numeric features:
• ApplicantIncome: range from 150 to 81000, mean ~5400, high variance ~6100
• CoapplicantIncome: range from 0 to ~42000, mean ~1600


### STEP 4.3. Exploring categorical data

In [82]:
# Explore categorical data
print("CATEGORICAL DATA ANALYSIS")
print("=" * 50)

categorical_columns = data.select_dtypes(include='object').columns

for col in categorical_columns:
    print(f"\nColumn '{col}':")

    # Unique values
    unique_values = data[col].unique()
    # Show first 5 unique values
    print(f"  Unique values: {unique_values[:5]}")
    print(f"  Number of unique values: {len(unique_values)}")

    # Value counts and percentages
    print("  Distribution:")
    value_counts = data[col].value_counts()
    # Show first 5 distributions of a feature
    i = 0 
    for value, count in value_counts.items():
        i += 1
        percentage = (count / len(data)) * 100
        print(f"    • {value}: {count} ({percentage:.1f}%)")
        if i == 5: break

print("\nKEY OBSERVATIONS:")
print("• All categorical data needs to be converted to numeric")
print("• Output to be predicted: Y (yes) or N (no)")

CATEGORICAL DATA ANALYSIS

Column 'Loan_ID':
  Unique values: ['LP001002' 'LP001003' 'LP001005' 'LP001006' 'LP001008']
  Number of unique values: 614
  Distribution:
    • LP001002: 1 (0.2%)
    • LP001003: 1 (0.2%)
    • LP001005: 1 (0.2%)
    • LP001006: 1 (0.2%)
    • LP001008: 1 (0.2%)

Column 'Gender':
  Unique values: ['Male' 'Female' nan]
  Number of unique values: 3
  Distribution:
    • Male: 489 (79.6%)
    • Female: 112 (18.2%)

Column 'Married':
  Unique values: ['No' 'Yes' nan]
  Number of unique values: 3
  Distribution:
    • Yes: 398 (64.8%)
    • No: 213 (34.7%)

Column 'Dependents':
  Unique values: ['0' '1' '2' '3+' nan]
  Number of unique values: 5
  Distribution:
    • 0: 345 (56.2%)
    • 1: 102 (16.6%)
    • 2: 101 (16.4%)
    • 3+: 51 (8.3%)

Column 'Education':
  Unique values: ['Graduate' 'Not Graduate']
  Number of unique values: 2
  Distribution:
    • Graduate: 480 (78.2%)
    • Not Graduate: 134 (21.8%)

Column 'Self_Employed':
  Unique values: ['No' 'Yes'

## STEP 5: DATA PREPROCESSING

### OVERVIEW OBJECTIVE

Tranform raw data into ready-ML data
1. Split data into X (feature) and y (target)
2. Encode categorical data
3. Standardize numberic data

### Why we need to preprocess data before applying ML/DL model?

Machine Learning algorithms only understand numerical data. Howerver, raw data contains:
-  **Text data**: "Male", "Female", "Graduate", "Not Graduate"  
-  **Mixed scales**: ApplicantIncome (150-81000) vs CoapplicantIncome (0-42000)
-  **Different formats**: Need to unify format

After preprocessing:
-  **All numeric**: Ready-ML data
-  **Same scale**: No more range of values problem
-  **ML-ready**: Data can be fed into ML model directly

**Clean data**

In [83]:
# -----------------------------
# 1) Exlore the percentage of NaN values in all columns 
# -----------------------------
num_rows = data.shape[0]
print("Percentage of NaN values:")

for col in data.columns:
    percentage = data[col].isna().sum() / num_rows * 100
    print(f"Column {col}:\n {percentage:.2f}%")

Percentage of NaN values:
Column Loan_ID:
 0.00%
Column Gender:
 2.12%
Column Married:
 0.49%
Column Dependents:
 2.44%
Column Education:
 0.00%
Column Self_Employed:
 5.21%
Column ApplicantIncome:
 0.00%
Column CoapplicantIncome:
 0.00%
Column LoanAmount:
 3.58%
Column Loan_Amount_Term:
 2.28%
Column Credit_History:
 8.14%
Column Property_Area:
 0.00%
Column Loan_Status:
 0.00%


As the number of missing values is quite small compare to the number of examples, we will try to fill these missing values by there mean (with numerical data) and mode (with categorical data)

In [84]:
# -----------------------------
# 2) Handle missing values 
# -----------------------------
# Fill numeric columns with mean
num_cols = data.select_dtypes(include=[np.number]).columns            # Select numeric columns
for col in num_cols:
    data[col] = data[col].fillna(data[col].mean())                    # Fill missing values with median

# Categorical → điền mode / Fill categorical columns with mode
cat_cols = data.select_dtypes(exclude=[np.number]).columns            # Select object columns
for col in cat_cols:
    mode_val = data[col].mode().iloc[0]                               # Get most frequent value
    data[col] = data[col].fillna(mode_val)                            # Fill missing values with mode

print("=== AFTER HANDLING MISSING VALUES ===")
data.info()  # Show information of features of data after filling missing values

=== AFTER HANDLING MISSING VALUES ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             614 non-null    object 
 2   Married            614 non-null    object 
 3   Dependents         614 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      614 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         614 non-null    float64
 9   Loan_Amount_Term   614 non-null    float64
 10  Credit_History     614 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [85]:
# -----------------------------
# 3) Remove duplicates 
# -----------------------------
before = data.shape[0]               # Rows before removing duplicates
data = data.drop_duplicates()        # Remove duplicate rows
after = data.shape[0]                # Rows after removing duplicates
print(f"Removed {before - after} duplicate rows.\n")

# -----------------------------
# Final cleaned data
# -----------------------------
print("As we can see, our data has no duplicate values")
print("=== FINAL CLEANED DATA ===")
print(data)

Removed 0 duplicate rows.

As we can see, our data has no duplicate values
=== FINAL CLEANED DATA ===
      Loan_ID  Gender Married  ... Credit_History Property_Area Loan_Status
0    LP001002    Male      No  ...            1.0         Urban           Y
1    LP001003    Male     Yes  ...            1.0         Rural           N
2    LP001005    Male     Yes  ...            1.0         Urban           Y
3    LP001006    Male     Yes  ...            1.0         Urban           Y
4    LP001008    Male      No  ...            1.0         Urban           Y
..        ...     ...     ...  ...            ...           ...         ...
609  LP002978  Female      No  ...            1.0         Rural           Y
610  LP002979    Male     Yes  ...            1.0         Rural           Y
611  LP002983    Male     Yes  ...            1.0         Urban           Y
612  LP002984    Male     Yes  ...            1.0         Urban           Y
613  LP002990  Female      No  ...            0.0     Semiurba

### STEP 5.1. Split data into X and y

**X:** Features used for prediction

**y:** Target to be predicted

In this section, we will utilized the method '.values', this method converts a DataFrame into a NumPy array as NumPy arrays are faster to process than DataFrame

In [86]:
# Create feature matrix X
print("=== SPLIT DATA INTO X AND Y ===")
print("=" * 50)

# Select feature columns and convert to NumPy array
# Drop 'Loan_ID' as it has no predictive power
X = data.drop(columns=['Loan_Status', 'Loan_ID']).values

print("=== Feature matrix info ===")
print(f"   Shape: {X.shape}")
print(f"   Data type: {X.dtype}")
print(f"   First 5 rows:")
# Display first 5 rows neatly
for i in range(5):
    print(f"   Student {i}: {X[i]}")

print("\nEXPLANATION:")
print("   • Each row = Information of a student")
print("   • Each column = 1 feature")
print("   • Columns 0, 1, 3, 4, 10 contain text → need encoding")

# Show directly for comparison
X[0:5]

=== SPLIT DATA INTO X AND Y ===
=== Feature matrix info ===
   Shape: (614, 11)
   Data type: object
   First 5 rows:
   Student 0: ['Male' 'No' '0' 'Graduate' 'No' 5849 0.0 146.41216216216216 360.0 1.0
 'Urban']
   Student 1: ['Male' 'Yes' '1' 'Graduate' 'No' 4583 1508.0 128.0 360.0 1.0 'Rural']
   Student 2: ['Male' 'Yes' '0' 'Graduate' 'Yes' 3000 0.0 66.0 360.0 1.0 'Urban']
   Student 3: ['Male' 'Yes' '0' 'Not Graduate' 'No' 2583 2358.0 120.0 360.0 1.0 'Urban']
   Student 4: ['Male' 'No' '0' 'Graduate' 'No' 6000 0.0 141.0 360.0 1.0 'Urban']

EXPLANATION:
   • Each row = Information of a student
   • Each column = 1 feature
   • Columns 0, 1, 3, 4, 10 contain text → need encoding


array([['Male', 'No', '0', 'Graduate', 'No', 5849, 0.0,
        146.41216216216216, 360.0, 1.0, 'Urban'],
       ['Male', 'Yes', '1', 'Graduate', 'No', 4583, 1508.0, 128.0, 360.0,
        1.0, 'Rural'],
       ['Male', 'Yes', '0', 'Graduate', 'Yes', 3000, 0.0, 66.0, 360.0,
        1.0, 'Urban'],
       ['Male', 'Yes', '0', 'Not Graduate', 'No', 2583, 2358.0, 120.0,
        360.0, 1.0, 'Urban'],
       ['Male', 'No', '0', 'Graduate', 'No', 6000, 0.0, 141.0, 360.0,
        1.0, 'Urban']], dtype=object)

### STEP 5.2. Label Encoding

As ML algorithms cannot understand categorical data, we need a way to encode each text value to a unique integer

In this section, we will use LabelEncoder from sklearn library to help to us do this task

In [87]:
print("=== LABEL ENCODING FOR CATEGORICAL DATA ===")
print("=" * 60)

X_df = data.drop(columns=['Loan_Status', 'Loan_ID'])
df = X_df.select_dtypes(include='object')

for col in df.columns:
    col_idx = X_df.columns.get_loc(col)
    # Initialize LabelEncoder
    scaler = preprocessing.LabelEncoder()
    # Fit - learn unique values
    le_scale = scaler.fit(X[:, col_idx])
    # Transform - convert to numbers
    X[:, col_idx] = le_scale.transform(X[:, col_idx])

# ===== FINAL RESULT =====
print(f"\n=== RESULT AFTER ENCODING ===")
print(f"   Shape of X: {X.shape}")
print(f"   First 5 rows:")
for i in range(5):
    print(f"   Student {i}: {X[i]}")

print(f"\nDONE: All text data have been converted to numbers!")

# Show final result
X[0:5]


=== LABEL ENCODING FOR CATEGORICAL DATA ===

=== RESULT AFTER ENCODING ===
   Shape of X: (614, 11)
   First 5 rows:
   Student 0: [1 0 0 0 0 5849 0.0 146.41216216216216 360.0 1.0 2]
   Student 1: [1 1 1 0 0 4583 1508.0 128.0 360.0 1.0 0]
   Student 2: [1 1 0 0 1 3000 0.0 66.0 360.0 1.0 2]
   Student 3: [1 1 0 1 0 2583 2358.0 120.0 360.0 1.0 2]
   Student 4: [1 0 0 0 0 6000 0.0 141.0 360.0 1.0 2]

DONE: All text data have been converted to numbers!


array([[1, 0, 0, 0, 0, 5849, 0.0, 146.41216216216216, 360.0, 1.0, 2],
       [1, 1, 1, 0, 0, 4583, 1508.0, 128.0, 360.0, 1.0, 0],
       [1, 1, 0, 0, 1, 3000, 0.0, 66.0, 360.0, 1.0, 2],
       [1, 1, 0, 1, 0, 2583, 2358.0, 120.0, 360.0, 1.0, 2],
       [1, 0, 0, 0, 0, 6000, 0.0, 141.0, 360.0, 1.0, 2]], dtype=object)

### STEP 5.3. Standardization - Numeric data scaling

Problem with different scales:

ML algoriths can bias features with larger values

**Solution:** Z-Score Normalization

Standardization converts data into normal distribution:
- **Mean (μ) = 0**: Average = 0
- **Standard Deviation (σ) = 1**: Standard deviation = 1

Benefits of standardization:
1. Balance: All features have equal importance
2. Speed: Algorithm converges faster
3. Accuracy: Better prediction result

In [88]:
# Standardize some numberic features: 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term'
print("=== CHUẨN HÓA DỮ LIỆU SỐ / STANDARDIZATION ===")
print("=" * 60)

# Show data before standardization
# As we standardize many features, to keep it short, we only show 'ApplicantIncome' information
print("BEFORE STANDARDIZATION:")
print("Some information of feature 'ApplicantIncome'")
ApplicantIncome_original = X[:, 5].copy()  # Lưu bản gốc để so sánh / Keep original for comparison
print(f"   5 giá trị đầu / First 5 values: {ApplicantIncome_original[0:5]}")
print(f"   Giá trị min / Min: {ApplicantIncome_original.min():.2f}")
print(f"   Giá trị max / Max: {ApplicantIncome_original.max():.2f}")
print(f"   Trung bình / Mean: {ApplicantIncome_original.mean():.2f}")
print(f"   Độ lệch chuẩn / Std: {ApplicantIncome_original.std():.2f}")

scale_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
for col in scale_features:
    # Step 1: Compute mean and std
    col_idx = X_df.columns.get_loc(col)
    mean = X[:, col_idx].mean()
    std = X[:, col_idx].std()

    print(f"\nCALCULATE PARAMETERS:")
    print(f"   μ (mean) = {mean:.4f}")
    print(f"   σ (std)  = {std:.4f}")

    # Step 2: Apply formula z = (x - μ)/σ
    print(f"\nAPPLY STANDARDIZATION:")
    print(f"   Formula: z = (x - {mean:.2f}) / {std:.2f}")

    X[:, col_idx] = (X[:, col_idx] - mean) / std

# Show data after standardization
# As we standardize many features, to keep it short, we only show 'ApplicantIncome' information
print(f"\nAFTER STANDARDIZATION:")
print("Some information of feature 'ApplicantIncome'")
print(f"   First 5 values: {X[0:5, 5]}")
print(f"   Min: {X[:, 5].min():.2f}")
print(f"   Max: {X[:, 5].max():.2f}")
print(f"   New mean: {X[:, 5].mean():.6f} ≈ 0")
print(f"   New std: {X[:, 5].std():.6f} ≈ 1")

# Compare before and after
print(f"\nSO SÁNH TRƯỚC VÀ SAU / BEFORE VS AFTER:")
print("Index | Before | After")
print("-" * 45)
for i in range(5):
    print(f"     {i}       |       {ApplicantIncome_original[i]:8.2f}       |    {X[i, 5]:8.2f}")

# Show final feature matrix X
print(f"\nMA TRẬN X SAU TIỀN XỬ LÝ HOÀN CHỈNH / FINAL X MATRIX AFTER PREPROCESSING:")
X[0:5]


=== CHUẨN HÓA DỮ LIỆU SỐ / STANDARDIZATION ===
BEFORE STANDARDIZATION:
Some information of feature 'ApplicantIncome'
   5 giá trị đầu / First 5 values: [5849 4583 3000 2583 6000]
   Giá trị min / Min: 150.00
   Giá trị max / Max: 81000.00
   Trung bình / Mean: 5403.46
   Độ lệch chuẩn / Std: 6104.06

CALCULATE PARAMETERS:
   μ (mean) = 5403.4593
   σ (std)  = 6104.0649

APPLY STANDARDIZATION:
   Formula: z = (x - 5403.46) / 6104.06

CALCULATE PARAMETERS:
   μ (mean) = 1621.2458
   σ (std)  = 2923.8645

APPLY STANDARDIZATION:
   Formula: z = (x - 1621.25) / 2923.86

CALCULATE PARAMETERS:
   μ (mean) = 146.4122
   σ (std)  = 83.9690

APPLY STANDARDIZATION:
   Formula: z = (x - 146.41) / 83.97

CALCULATE PARAMETERS:
   μ (mean) = 342.0000
   σ (std)  = 64.3200

APPLY STANDARDIZATION:
   Formula: z = (x - 342.00) / 64.32

AFTER STANDARDIZATION:
Some information of feature 'ApplicantIncome'
   First 5 values: [0.0729908228506884 -0.1344119537834513 -0.3937473372051284
 -0.4620624697931507 0

array([[1, 0, 0, 0, 0, 0.0729908228506884, -0.5544873301529847,
        3.3847857674402117e-16, 0.27985054320228187, 1.0, 2],
       [1, 1, 1, 0, 0, -0.1344119537834513, -0.03873154846446161,
        -0.21927331495275829, 0.27985054320228187, 1.0, 0],
       [1, 1, 0, 0, 1, -0.3937473372051284, -0.5544873301529847,
        -0.9576409986248733, 0.27985054320228187, 1.0, 2],
       [1, 1, 0, 1, 0, -0.4620624697931507, 0.25197960169153616,
        -0.31454656445883766, 0.27985054320228187, 1.0, 2],
       [1, 0, 0, 0, 0, 0.09772843680942071, -0.5544873301529847,
        -0.06445428450537935, 0.27985054320228187, 1.0, 2]], dtype=object)

### STEP 5.4. Handling Target Variable

**Create vector y from the 'Loan_Status' column**

In [89]:
# Create target vector y
print("=== CREATE TARGET VECTOR Y ===")
print("=" * 40)

# Take 'Loan_Status' column as target
y = data['Loan_Status']

print("TARGET INFORMATION:")
print(f"   Shape: {y.shape}")
print(f"   Data type: {y.dtype}")
print(f"   First 5 values: {list(y[0:5])}")

print(f"\nCLASS DISTRIBUTION:")
class_counts = y.value_counts()
for drug, count in class_counts.items():
    percentage = (count/len(y)) * 100
    print(f"   {drug}: {count} ({percentage:.1f}%)")

print(f"\nCOMMENTS:")
print(f"   • There are {len(class_counts)} different loan status")
print(f"   • This is a 2-class classification problem")

# Show values directly
y[0:5]


=== CREATE TARGET VECTOR Y ===
TARGET INFORMATION:
   Shape: (614,)
   Data type: object
   First 5 values: ['Y', 'N', 'Y', 'Y', 'Y']

CLASS DISTRIBUTION:
   Y: 422 (68.7%)
   N: 192 (31.3%)

COMMENTS:
   • There are 2 different loan status
   • This is a 2-class classification problem


0    Y
1    N
2    Y
3    Y
4    Y
Name: Loan_Status, dtype: object

**Encoding the target variable**

Most ML algorithms require target variable as numberic, not text. Therefore, we need to encoding the target variable from text to integer

In [90]:
# Encoding target variable y
print("=== ENCODING TARGET VARIABLE Y / LABEL ENCODING TARGET Y ===")
print("=" * 50)

# Fit with all status
le_status = scaler.fit(y.unique())

print("LABEL ENCODER INFORMATION:")
print(f"   Classes learned: {le_status.classes_}")

# Hiển thị mapping / Show mapping
print(f"\nMAPPING TABLE:")
for i, status in enumerate(le_status.classes_):
    print(f"   {status} → {i}")

# Perform encoding
print(f"\nBEFORE ENCODING:")
print(f"   First 5 values: {list(y[0:5])}")

y_encoded = le_status.transform(y)

print(f"\nAFTER ENCODING:")
print(f"   First 5 values: {y_encoded[0:5]}")
print(f"   Data type: {y_encoded.dtype}")
print(f"   Shape: {y_encoded.shape}")

# Update y
y = y_encoded

print(f"\nCOMPLETED: Target y has been converted to numeric!")

# Check distribution after encoding
print(f"\nCLASS DISTRIBUTION AFTER ENCODING:")
unique, counts = np.unique(y, return_counts=True)
for val, count in zip(unique, counts):
    status_name = le_status.classes_[val]
    percentage = (count/len(y)) * 100
    print(f"   {val} ({status_name}): {count} ({percentage:.1f}%)")


=== ENCODING TARGET VARIABLE Y / LABEL ENCODING TARGET Y ===
LABEL ENCODER INFORMATION:
   Classes learned: ['N' 'Y']

MAPPING TABLE:
   N → 0
   Y → 1

BEFORE ENCODING:
   First 5 values: ['Y', 'N', 'Y', 'Y', 'Y']

AFTER ENCODING:
   First 5 values: [1 0 1 1 1]
   Data type: int64
   Shape: (614,)

COMPLETED: Target y has been converted to numeric!

CLASS DISTRIBUTION AFTER ENCODING:
   0 (N): 192 (31.3%)
   1 (Y): 422 (68.7%)


In [91]:
# Show final results
print("=== FINAL DATA AFTER PREPROCESSING ===")
print("=" * 50)

print("Feature matrix X:")
print(f"   Shape: {X.shape}")
print(f"   First 5 rows:")
for i in range(5):
    print(f"     Student {i}: {X[i]}")

print(f"\nVector y (Target) / Target vector y:")
print(f"   Shape: {y.shape}")
print(f"   First 5 values: {y[0:5]}")

print(f"\nDATA IS ML-READY!")
print(f"    All values are numeric")
print(f"    Data has been standardized")
print(f"    No missing values remaining")
print(f"    Format is ready for ML algorithms")

# Display y directly
y[0:5]


=== FINAL DATA AFTER PREPROCESSING ===
Feature matrix X:
   Shape: (614, 11)
   First 5 rows:
     Student 0: [1 0 0 0 0 0.0729908228506884 -0.5544873301529847 3.3847857674402117e-16
 0.27985054320228187 1.0 2]
     Student 1: [1 1 1 0 0 -0.1344119537834513 -0.03873154846446161 -0.21927331495275829
 0.27985054320228187 1.0 0]
     Student 2: [1 1 0 0 1 -0.3937473372051284 -0.5544873301529847 -0.9576409986248733
 0.27985054320228187 1.0 2]
     Student 3: [1 1 0 1 0 -0.4620624697931507 0.25197960169153616 -0.31454656445883766
 0.27985054320228187 1.0 2]
     Student 4: [1 0 0 0 0 0.09772843680942071 -0.5544873301529847 -0.06445428450537935
 0.27985054320228187 1.0 2]

Vector y (Target) / Target vector y:
   Shape: (614,)
   First 5 values: [1 0 1 1 1]

DATA IS ML-READY!
    All values are numeric
    Data has been standardized
    No missing values remaining
    Format is ready for ML algorithms


array([1, 0, 1, 1, 1])