In [5]:
import pandas as pd
import numpy as ny
# Data loading
df = pd.read_csv(r"C:\Users\youse\my_project\credit_customers (1).csv")
print(df.shape)
pd.set_option("display.max_rows", 5)
df.head()
# basic stats
df.info()
df.describe(include="all")


(1000, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   checking_status         909 non-null    object
 1   duration                1000 non-null   int64 
 2   credit_history          961 non-null    object
 3   purpose                 957 non-null    object
 4   credit_amount           1000 non-null   int64 
 5   savings_status          987 non-null    object
 6   employment              987 non-null    object
 7   installment_commitment  1000 non-null   int64 
 8   personal_status         1000 non-null   object
 9   other_parties           1000 non-null   object
 10  residence_since         1000 non-null   int64 
 11  property_magnitude      942 non-null    object
 12  age                     1000 non-null   int64 
 13  other_payment_plans     1000 non-null   object
 14  housing                 931 non-null    object

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
count,909,1000.0,961,957,1000.00,987,987,1000.0,1000,1000,...,942,1000.0,1000,931,1000.0,1000,1000.0,1000,1000,1000
unique,4,,5,10,,5,5,,4,3,...,4,,3,3,,4,,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75%,,24.0,,,3972.25,,,4.0,,,...,,42.0,,,2.0,,1.0,,,
max,,72.0,,,18424.00,,,4.0,,,...,,75.0,,,4.0,,2.0,,,


In [None]:
# Missing values check
# check missing values (simple version)
def check_missing(df):
    for col in df.columns:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            percent = (missing_count / len(df)) * 100
            print(f"{col}: {missing_count} missing ({percent:.1f}%)")


check_missing(df)





checking_status: 91 missing (9.1%)
credit_history: 39 missing (3.9%)
purpose: 43 missing (4.3%)
savings_status: 13 missing (1.3%)
employment: 13 missing (1.3%)
property_magnitude: 58 missing (5.8%)
housing: 69 missing (6.9%)


In [None]:
# check duplicates
def check_duplicates(df):
    dup_count = df.duplicated().sum()
    if dup_count > 0:
        print(f"There are {dup_count} duplicate rows.")
    else:
        print("No duplicates found.")

check_duplicates(df)


No duplicates found.


In [16]:
# IQR method
def check_outliers(df):
    for col in df.select_dtypes(include=[ny.number]).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        outliers = ((df[col] < lower) | (df[col] > upper)).sum()
        print(f"{col}: {outliers} outliers")


check_outliers(df)


duration: 70 outliers
credit_amount: 72 outliers
installment_commitment: 0 outliers
residence_since: 0 outliers
age: 23 outliers
existing_credits: 6 outliers
num_dependents: 155 outliers


In [20]:
#  imbalance
# check for all  columns
def check_balance_all(df):
    for col in df.select_dtypes(include=["object"]).columns:
        print(f"\nColumn: {col}")
        print(df[col].value_counts())
        print("Percentages:")
        print(df[col].value_counts(normalize=True) * 100)

# 
check_balance_all(df)





Column: checking_status
checking_status
no checking    361
<0             250
0<=X<200       240
>=200           58
Name: count, dtype: int64
Percentages:
checking_status
no checking    39.713971
<0             27.502750
0<=X<200       26.402640
>=200           6.380638
Name: proportion, dtype: float64

Column: credit_history
credit_history
existing paid                     511
critical/other existing credit    280
delayed previously                 83
all paid                           48
no credits/all paid                39
Name: count, dtype: int64
Percentages:
credit_history
existing paid                     53.173777
critical/other existing credit    29.136316
delayed previously                 8.636837
all paid                           4.994797
no credits/all paid                4.058273
Name: proportion, dtype: float64

Column: purpose
purpose
radio/tv               273
new car                220
furniture/equipment    171
                      ... 
domestic appliance      12

In [None]:
from sklearn.preprocessing import StandardScaler


def transform_data(df):
    data = df.copy()

    # fill missing
    for col in data.columns:
        if data[col].dtype == "object":   # categorical
            data[col].fillna(data[col].mode()[0], inplace=True)
        else:   # numeric
            data[col].fillna(data[col].mean(), inplace=True)


    data = pd.get_dummies(data, drop_first=True)


    scaler = StandardScaler()
    num_cols = data.select_dtypes(include=[ny.number]).columns
    data[num_cols] = scaler.fit_transform(data[num_cols])

    return data


df_transformed = transform_data(df)
df_transformed.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)


Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,checking_status_<0,checking_status_>=200,checking_status_no checking,credit_history_critical/other existing credit,credit_history_delayed previously,credit_history_existing paid,credit_history_no credits/all paid,purpose_domestic appliance,purpose_education,purpose_furniture/equipment,purpose_new car,purpose_other,purpose_radio/tv,purpose_repairs,purpose_retraining,purpose_used car,savings_status_500<=X<1000,savings_status_<100,savings_status_>=1000,savings_status_no known savings,employment_4<=X<7,employment_<1,employment_>=7,employment_unemployed,personal_status_male div/sep,personal_status_male mar/wid,personal_status_male single,other_parties_guarantor,other_parties_none,property_magnitude_life insurance,property_magnitude_no known property,property_magnitude_real estate,other_payment_plans_none,other_payment_plans_stores,housing_own,housing_rent,job_skilled,job_unemp/unskilled non res,job_unskilled resident,own_telephone_yes,foreign_worker_yes,class_good
0,-1.236478,-0.745131,0.918477,1.046987,2.766456,1.027079,-0.42829,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,False,True,False,True,False,False,True,True,False,True,False,True,False,False,True,True,True
1,2.248194,0.949817,-0.870183,-0.765977,-1.191404,-0.704926,-0.42829,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,True,True,False,True,False,True,False,False,False,True,False
2,-0.738668,-0.416562,-0.870183,0.140505,1.183312,-0.704926,2.334869,False,False,True,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,False,True,False,False,True,True,False,True,False,False,False,True,False,True,True
3,1.750384,1.634247,-0.870183,1.046987,0.831502,-0.704926,2.334869,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,True,False,True,False,False,True,False,False,False,True,False,False,False,True,True
4,0.256953,0.566664,0.024147,1.046987,1.535122,1.027079,2.334869,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,True,False,True,False,False,False,True,False,False,False,True,False
