In [35]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
import os

In [36]:
# Grab the directory of the current file
# Should be ./Project/src/Program
try:
    current_dir = Path(__file__).parent.absolute()
except NameError:
    # If using a Jupyter notebook
    try:
        current_dir = Path.cwd()
    except:
        # If continuing to fail set path manually
        current_dir = Path("D:/School/CS 434/Project/src/Program")

# Go to the Data directory 
# Should be ./Project/src/Data
data_dir = current_dir.parent / "Data"

# Get the Datasets path
dataset_path = data_dir / "Dataset.csv"

# Verify the file exists
if dataset_path.exists():
    print(f"Dataset found at: {dataset_path}")
    df = pd.read_csv(dataset_path)
else:
    print(f"Dataset not found at expected path: {dataset_path}")

Dataset found at: d:\School\CS 434\Project\src\Data\Dataset.csv


In [37]:
# Check for any duplicate rows and remove them
df = df.drop_duplicates()

# Convert numeric comlumns to appropriate types
numeric_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Categorical columns
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']

# Binary columns
binary_columns = ['Gender', 'Married', 'Self_Employed', 'Loan_Status']

# Dependents - convert to numeric where possible
def clean_dependents(value):
    if value == '3+':
        return 3
    try:
        return int(value)
    except ValueError:
        return np.nan

In [45]:
# Create first cleansed dataset - using median/mode imputation
df_imputed = df.copy()

# Impute missing numeric values with media
for col in numeric_columns:
    median_value = df_imputed[col].median()
    df_imputed[col].fillna(median_value, inplace=True)

# Impute missing categorical values with mode
for col in categorical_columns:
    mode_value = df_imputed[col].mode()[0]
    df_imputed[col].fillna(mode_value, inplace=True)

# Binary encoding where applicable
for col in binary_columns:
    df_imputed[col] = df_imputed[col].map({'Male': 1, 'Female': 0,
                                           'Yes': 1, 'No': 0,
                                           'Graduate': 1, 'Not Graduate': 0,
                                           'Y': 1, 'N': 0}).astype('int')
    
# For non-binary categorical columns, use one-hot encoding
# Propert_Area
df_imputed = pd.get_dummies(df_imputed, columns=['Property_Area'], drop_first=True)

# Dependents
df_imputed['Dependents'] = df_imputed['Dependents'].apply(clean_dependents)
df_imputed['Dependents'].fillna(df_imputed['Dependents'].median(), inplace=True)
df_imputed['Dependents'] = df_imputed['Dependents'].astype('int')

# Credit_History is already binary, just need to convert to int
df_imputed['Credit_History'] = df_imputed['Credit_History'].astype('int')

# Save the cleansed dataset to CSV file
df_imputed.to_csv(os.path.join(data_dir,r'loan_data_imputed.csv'), index=False)
print("Cleansed dataset saved to 'loan_data_imputed.csv'")

Cleansed dataset saved to 'loan_data_imputed.csv'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imputed[col].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imputed[col].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [46]:
# Create second cleansed dataset - removing rows with missing values
df_removed = df.dropna()

# Encode categorical variables to binary/numeric
# Binary encoding where applicable
for col in binary_columns:
    df_removed[col] = df_removed[col].map({'Male': 1, 'Female': 0,
                                           'Yes': 1, 'No': 0,
                                           'Graduate': 1, 'Not Graduate': 0,
                                           'Y': 1, 'N': 0}).astype('int')
    
# For non-binary categorical columns, use one-hot encoding
# Propert_Area
df_removed = pd.get_dummies(df_removed, columns=['Property_Area'], drop_first=True)

# Dependents
df_removed['Dependents'] = df_removed['Dependents'].apply(clean_dependents)

# Credit History is already binary, just need to convert to int
df_removed['Credit_History'] = df_removed['Credit_History'].astype('int')

# Save the cleansed dataset to CSV file
df_removed.to_csv(os.path.join(data_dir,r'loan_data_removed.csv'), index=False)
print("Cleansed dataset with removed rows saved to 'loan_data_removed.csv'")

Cleansed dataset with removed rows saved to 'loan_data_removed.csv'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_removed[col] = df_removed[col].map({'Male': 1, 'Female': 0,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_removed[col] = df_removed[col].map({'Male': 1, 'Female': 0,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_removed[col] = df_removed[col].map({'Male': 1, 'Female': 0,
A value is tryin

In [47]:
# Print summary of the cleaning
print(f"Original dataset shape: {df.shape}")
print(f"Imputed dataset shape: {df_imputed.shape}")
print(f"Removed dataset shape: {df_removed.shape}")

Original dataset shape: (367, 13)
Imputed dataset shape: (367, 14)
Removed dataset shape: (289, 14)
