In [133]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the dataset; the first CSV column is used as the row index.
df_credit = pd.read_csv("german_credit_data_with_target.csv", index_col=0)

# Show dtypes and non-null counts to see which columns contain missing values.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
df_credit.info()


<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 85.9+ KB
None


In [134]:
# Only 'Saving accounts' and 'Checking account' contain NaNs, but every
# categorical column has to become numeric before the imputation model can
# use it, so each one gets its own dedicated LabelEncoder instance.
categorical_columns = ['Sex', 'Housing', 'Purpose', 'Risk', 'Saving accounts', 'Checking account']
LabelEncoders = {column: LabelEncoder() for column in categorical_columns}
df_credit

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,40,male,3,own,little,little,3857,30,car,good
997,38,male,2,own,little,,804,12,radio/TV,good
998,23,male,2,free,little,little,1845,45,radio/TV,bad


In [135]:
# Turn NaN into the explicit string category 'missing' in the two incomplete columns
for account_column in ('Saving accounts', 'Checking account'):
    df_credit[account_column] = df_credit[account_column].fillna('missing')

# Fit every encoder on its column (whose values now include 'missing' where
# applicable) and overwrite the column with the resulting integer codes
for column in categorical_columns:
    df_credit[column] = LabelEncoders[column].fit_transform(df_credit[column])
df_credit

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,1,2,1,1,0,1169,6,5,1
1,22,0,2,1,0,2,5951,48,5,0
2,49,1,1,1,0,1,2096,12,3,1
3,45,1,2,0,0,0,7882,42,4,1
4,53,1,2,0,0,0,4870,24,1,0
...,...,...,...,...,...,...,...,...,...,...
995,31,0,1,1,0,1,1736,12,4,1
996,40,1,3,1,0,0,3857,30,1,1
997,38,1,2,1,0,1,804,12,5,1
998,23,1,2,0,0,0,1845,45,5,0


In [136]:
def predictive_imputation(df, target_column, encoder, missing_label='missing'):
    """Replace placeholder-coded entries of one encoded column with model predictions.

    Rows whose ``target_column`` equals the integer code that ``encoder``
    assigns to ``missing_label`` are treated as missing. A
    ``RandomForestClassifier`` is trained on the remaining rows, using every
    other column of ``df`` as a feature, and its predictions overwrite the
    placeholder codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place and also returned.
    target_column : str
        Name of the column to impute.
    encoder : fitted encoder
        Object with a ``transform`` method (e.g. a fitted ``LabelEncoder``)
        that was used to encode ``target_column``. If it exposes ``classes_``,
        that attribute is used to detect that the placeholder was never seen.
    missing_label : str, default 'missing'
        Label that marked a missing value before the column was encoded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. It is returned unchanged when the placeholder
        is unknown to the encoder or does not occur in ``target_column``.

    Raises
    ------
    ValueError
        If every row of ``target_column`` is missing, so there is nothing to
        train the classifier on.
    """
    # A column that never had gaps was fitted without the placeholder, and
    # transform() raises on an unseen label. Checking classes_ first turns
    # that case into a clean "nothing to impute" instead of a crash.
    known_labels = getattr(encoder, 'classes_', None)
    if known_labels is not None and missing_label not in list(known_labels):
        return df

    missing_indicator = encoder.transform([missing_label])[0]

    # Compute the row mask once and reuse it for the split and the write-back.
    is_missing = df[target_column] == missing_indicator
    if not is_missing.any():
        return df
    if is_missing.all():
        raise ValueError(
            f"Cannot impute '{target_column}': every row is missing, "
            "so there are no observed values to train on."
        )

    # NOTE(review): every other column of df is used as a feature. If df also
    # holds the label of a downstream model (e.g. 'Risk'), that label
    # influences the imputed values - confirm this is intended.
    observed_rows = df.loc[~is_missing]
    features_observed = observed_rows.drop(columns=[target_column])
    labels_observed = observed_rows[target_column]

    # Fixed seed keeps the imputed values reproducible between runs.
    model = RandomForestClassifier(random_state=42)
    model.fit(features_observed, labels_observed)

    # Predict a class code for each missing row and write it back in place.
    features_missing = df.loc[is_missing].drop(columns=[target_column])
    df.loc[is_missing, target_column] = model.predict(features_missing)

    return df

In [137]:
# Impute 'Saving accounts' first and 'Checking account' second, each with the
# encoder that produced its integer codes
for account_column in ('Saving accounts', 'Checking account'):
    df_credit = predictive_imputation(df_credit, account_column, LabelEncoders[account_column])

In [138]:
# Map the integer codes of every categorical column back to the original labels
for column in categorical_columns:
    df_credit[column] = LabelEncoders[column].inverse_transform(df_credit[column].astype(int))

# Check the non-null counts to confirm the imputation.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
df_credit.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   1000 non-null   object
 5   Checking account  1000 non-null   object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 85.9+ KB
None


In [139]:
# Store every categorical column with the pandas 'category' dtype.
# 'Saving accounts' and 'Checking account' are members of categorical_columns,
# so a single pass over that list converts them too; no separate conversion
# is needed.
for column in categorical_columns:
    df_credit[column] = df_credit[column].astype('category')



In [140]:
# Persist the imputed frame, row index included, to a new CSV file
df_credit.to_csv(
    "GCD_with_no_empty_values.csv",
    index=True,
)

In [141]:
# Read the saved file back (first column as the index) to inspect the result
Cdf_credit = pd.read_csv(
    "GCD_with_no_empty_values.csv",
    index_col=0,
)
Cdf_credit

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,little,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,little,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,little,1736,12,furniture/equipment,good
996,40,male,3,own,little,little,3857,30,car,good
997,38,male,2,own,little,moderate,804,12,radio/TV,good
998,23,male,2,free,little,little,1845,45,radio/TV,bad
