In [1]:
import pandas as pd

# Load the raw dataset from its relative CSV path.
csv_path = "../data/data.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows to check that the columns parsed as expected.
df.head(n=5)

Unnamed: 0,Age,Gender,Education,Income,Debt,Credit_Score,Loan_Amount,Loan_Term,Num_Credit_Cards,Payment_History,Employment_Status,Residence_Type,Marital_Status,Creditworthiness
0,56,Female,Master,149406,34089,581,49200,60,4,Bad,Unemployed,Rented,Single,1
1,69,Female,High School,78896,8626,648,20147,24,7,Good,Employed,Mortgaged,Married,1
2,46,Female,Master,119339,46281,329,41307,12,8,Bad,Unemployed,Owned,Single,1
3,32,Male,High School,131067,29403,816,19019,60,8,Bad,Employed,Owned,Single,1
4,60,Male,PhD,38001,30032,673,16317,36,4,Average,Employed,Rented,Married,0


In [5]:
# Report the frame's dimensions as a (rows, columns) tuple.
df.shape

(12000, 14)

In [7]:

# Print the per-column non-null counts, dtypes and memory usage.
df.info()
# Last expression of the cell: summary statistics (count, mean, std, min,
# quartiles, max), shown as the cell's displayed value.
df.describe()

<class 'pandas.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Age                12000 non-null  int64
 1   Gender             12000 non-null  str  
 2   Education          12000 non-null  str  
 3   Income             12000 non-null  int64
 4   Debt               12000 non-null  int64
 5   Credit_Score       12000 non-null  int64
 6   Loan_Amount        12000 non-null  int64
 7   Loan_Term          12000 non-null  int64
 8   Num_Credit_Cards   12000 non-null  int64
 9   Payment_History    12000 non-null  str  
 10  Employment_Status  12000 non-null  str  
 11  Residence_Type     12000 non-null  str  
 12  Marital_Status     12000 non-null  str  
 13  Creditworthiness   12000 non-null  int64
dtypes: int64(8), str(6)
memory usage: 1.3 MB


Unnamed: 0,Age,Income,Debt,Credit_Score,Loan_Amount,Loan_Term,Num_Credit_Cards,Creditworthiness
count,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0
mean,43.48925,85222.009417,25354.67325,573.967583,25113.95125,35.943,4.985833,0.702583
std,14.908982,37640.043409,14087.390846,157.816052,14247.092051,16.965162,2.593458,0.45714
min,18.0,20026.0,1001.0,300.0,501.0,12.0,1.0,0.0
25%,31.0,52400.75,13161.0,437.0,12725.0,24.0,3.0,0.0
50%,43.0,85756.0,25382.5,574.0,25220.5,36.0,5.0,1.0
75%,56.0,117754.75,37497.5,709.0,37348.0,48.0,7.0,1.0
max,69.0,149984.0,49995.0,849.0,49996.0,60.0,9.0,1.0


In [None]:
# Class balance of the target: normalize=True yields proportions instead of raw counts.
df["Creditworthiness"].value_counts(normalize=True)

Creditworthiness
1    0.702583
0    0.297417
Name: proportion, dtype: float64

In [9]:
# Text-valued columns whose distinct categories we want to inspect.
text_columns = (
    "Gender",
    "Education",
    "Payment_History",
    "Employment_Status",
    "Residence_Type",
    "Marital_Status",
)

# Print each column name followed by the array of its unique values.
for col in text_columns:
    distinct_values = df[col].unique()
    print(f"{col} -> {distinct_values}")

Gender -> <StringArray>
['Female', 'Male']
Length: 2, dtype: str
Education -> <StringArray>
['Master', 'High School', 'PhD', 'Bachelor']
Length: 4, dtype: str
Payment_History -> <StringArray>
['Bad', 'Good', 'Average']
Length: 3, dtype: str
Employment_Status -> <StringArray>
['Unemployed', 'Employed', 'Self-Employed']
Length: 3, dtype: str
Residence_Type -> <StringArray>
['Rented', 'Mortgaged', 'Owned']
Length: 3, dtype: str
Marital_Status -> <StringArray>
['Single', 'Married', 'Divorced']
Length: 3, dtype: str


The dataset contains 12,000 rows and 14 columns, with no missing values.
The target variable Creditworthiness is imbalanced: about 70% of observations belong to class 1 and about 30% to class 0.
Besides the binary target, the data includes 7 numerical features and 6 categorical features (gender, education, payment history, employment status, residence type, marital status); the categorical features will require appropriate encoding before modeling. (df.info() reports 8 int64 columns because that count includes the target.)

In [10]:
# Name of the label column; every other column is treated as a model input.
target_col = "Creditworthiness"
feature_cols = df.columns.drop(target_col).tolist()

# Separate the label vector from the feature matrix.
y = df[target_col]
X = df.loc[:, feature_cols]

# Show the first rows of both pieces as a (features, labels) pair.
(X.head(), y.head())

(   Age  Gender    Education  Income   Debt  Credit_Score  Loan_Amount  \
 0   56  Female       Master  149406  34089           581        49200   
 1   69  Female  High School   78896   8626           648        20147   
 2   46  Female       Master  119339  46281           329        41307   
 3   32    Male  High School  131067  29403           816        19019   
 4   60    Male          PhD   38001  30032           673        16317   
 
    Loan_Term  Num_Credit_Cards Payment_History Employment_Status  \
 0         60                 4             Bad        Unemployed   
 1         24                 7            Good          Employed   
 2         12                 8             Bad        Unemployed   
 3         60                 8             Bad          Employed   
 4         36                 4         Average          Employed   
 
   Residence_Type Marital_Status  
 0         Rented         Single  
 1      Mortgaged        Married  
 2          Owned         Single 

In [11]:
# Numeric input columns (the target column is deliberately not listed).
numeric_features = [
    "Age",
    "Income",
    "Debt",
    "Credit_Score",
    "Loan_Amount",
    "Loan_Term",
    "Num_Credit_Cards",
]

# Text-valued input columns.
categorical_features = [
    "Gender",
    "Education",
    "Payment_History",
    "Employment_Status",
    "Residence_Type",
    "Marital_Status",
]


In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The labels are passed as the
# stratification key and the seed is fixed so the split is repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Shapes of the two feature partitions.
(X_train.shape, X_test.shape)

((9600, 13), (2400, 13))

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Per-group transformers: standardize the numeric columns and one-hot encode
# the categorical ones (handle_unknown="ignore" is set on the encoder).
numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_features,
)
preprocess = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Logistic regression with balanced class weights and an iteration cap of 1000.
logistic_model = LogisticRegression(class_weight="balanced", max_iter=1000)

# Full estimator: preprocessing followed by the classifier.
clf = Pipeline(steps=[("preprocess", preprocess), ("model", logistic_model)])


In [20]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Train the full pipeline (preprocessing + classifier) on the training split.
clf.fit(X_train, y_train)

# Scores for the held-out split: column index 1 of predict_proba (label 1 when
# the fitted classes are ordered [0, 1]) and the hard class predictions.
y_proba = clf.predict_proba(X_test)[:, 1]
y_pred = clf.predict(X_test)

# Headline metrics first, then the per-class precision/recall breakdown.
test_accuracy = accuracy_score(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_proba)
print(f"Accuracy: {test_accuracy}")
print(f"ROC-AUC: {test_auc}")
print(classification_report(y_test, y_pred))


Accuracy: 0.51875
ROC-AUC: 0.519172556329768
              precision    recall  f1-score   support

           0       0.31      0.50      0.38       714
           1       0.71      0.53      0.61      1686

    accuracy                           0.52      2400
   macro avg       0.51      0.51      0.49      2400
weighted avg       0.59      0.52      0.54      2400



Baseline conclusions
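Without class weighting, logistic regression with standard preprocessing (scaling + one-hot encoding) achieves around 0.70 accuracy, but ROC-AUC is close to 0.5; since about 70% of observations belong to class 1, that accuracy is no better than always predicting the majority class. (That unweighted run is not shown here; the cell above reports only the class_weight="balanced" run.)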
After adding class_weight="balanced", the model becomes more sensitive to the minority class: it starts detecting non-creditworthy customers (class 0) with recall around 0.5, at the cost of overall accuracy dropping to about 0.52.
This makes the model's behavior more realistic for credit risk use cases, but ROC-AUC stays at about 0.52, i.e. barely above chance, so overall discrimination power remains low and more expressive models and/or better feature engineering are needed.