In [4]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
import numpy as np

# Load datasets
application_record = pd.read_csv('application_record.csv')
credit_record = pd.read_csv('credit_record.csv')

# Step 1: Data Cleaning
# Drop duplicates
application_record = application_record.drop_duplicates(subset='ID')
credit_record = credit_record.drop_duplicates(subset=['ID', 'MONTHS_BALANCE'])

# Identify numeric columns
numeric_cols = application_record.select_dtypes(include=[np.number]).columns

# Handle missing values
# Fill missing values for categorical columns with 'Unknown'
application_record.fillna('Unknown', inplace=True)

# Fill missing values for numeric columns with their median
application_record[numeric_cols] = application_record[numeric_cols].fillna(application_record[numeric_cols].median())

# Step 2: Encode Categorical Variables
categorical_cols = [
    'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE'
]

# Use OneHotEncoder for categorical columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cols = pd.DataFrame(encoder.fit_transform(application_record[categorical_cols]))

# Add back encoded columns with proper names
encoded_cols.columns = encoder.get_feature_names_out(categorical_cols)
application_record = pd.concat([application_record, encoded_cols], axis=1)

# Drop original categorical columns
application_record.drop(columns=categorical_cols, inplace=True)

# Step 3: Preprocessing Numeric Data
numeric_cols = ['AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'CNT_CHILDREN', 'CNT_FAM_MEMBERS']

# Normalize/Standardize the numeric columns
scaler = StandardScaler()
application_record[numeric_cols] = scaler.fit_transform(application_record[numeric_cols])

# Display the cleaned and preprocessed data
print("\nCleaned and Preprocessed Application Record:")
print(application_record.head())



Cleaned and Preprocessed Application Record:
          ID  CNT_CHILDREN  AMT_INCOME_TOTAL  DAYS_BIRTH  DAYS_EMPLOYED  \
0  5008804.0     -0.589595          2.179820    0.954125      -0.469181   
1  5008805.0     -0.589595          2.179820    0.954125      -0.469181   
2  5008806.0     -0.589595         -0.681497   -1.308474      -0.444622   
3  5008808.0     -0.589595          0.749162   -0.743601      -0.458436   
4  5008809.0     -0.589595          0.749162   -0.743601      -0.458436   

   FLAG_MOBIL  FLAG_WORK_PHONE  FLAG_PHONE  FLAG_EMAIL  CNT_FAM_MEMBERS  ...  \
0         1.0              1.0         0.0         0.0        -0.216747  ...   
1         1.0              1.0         0.0         0.0        -0.216747  ...   
2         1.0              0.0         0.0         0.0        -0.216747  ...   
3         1.0              0.0         1.0         1.0        -1.331336  ...   
4         1.0              0.0         1.0         1.0        -1.331336  ...   

   OCCUPATION_TYPE_Low

In [20]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Step 1: Load the datasets
application_record = pd.read_csv('application_record.csv')
credit_record = pd.read_csv('credit_record.csv')

# Step 2: Define the new label creation function
def classify_clients(statuses):
    """
    Classifies clients based on their worst payment status.
    '4' or '5': Bad
    Others: Good
    """
    if any(status == '3' or status == '4' or status == '5' for status in statuses):
        return 'Bad'
    return 'Good'

# Step 3: Label Creation
# Group by 'ID' and apply the classify_clients function
grouped_status = credit_record.groupby('ID')['STATUS'].apply(list)
labels = grouped_status.apply(classify_clients).reset_index()
labels.columns = ['ID', 'Label']

# Merge the labels into credit_record
credit_record = credit_record.merge(labels, on='ID', how='left')

# Save the updated credit_record to a CSV file
credit_record.to_csv('updated_credit_record.csv', index=False)

# Optional: Save to Google Drive
from google.colab import drive
drive.mount('/content/drive')
credit_record.to_csv('/content/drive/My Drive/updated_credit_record.csv', index=False)

# Merge the labels with application_record
application_record = application_record.merge(labels, on='ID', how='inner')

# Step 4: Encode Categorical Variables
categorical_cols = [
    'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE'
]

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cols = pd.DataFrame(encoder.fit_transform(application_record[categorical_cols]))
encoded_cols.columns = encoder.get_feature_names_out(categorical_cols)
application_record = pd.concat([application_record, encoded_cols], axis=1)
application_record.drop(columns=categorical_cols, inplace=True)

# Step 5: Preprocess Numeric Data
numeric_cols = ['AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'CNT_CHILDREN', 'CNT_FAM_MEMBERS']
scaler = StandardScaler()
application_record[numeric_cols] = scaler.fit_transform(application_record[numeric_cols])

# Step 6: Train-Test Split
X = application_record.drop(columns=['ID', 'Label'])
y = application_record['Label'].map({'Good': 1, 'Bad': 0})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 7: Handle Class Imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Step 8: Train and Evaluate Models
print("\nRandom Forest Classifier")
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_resampled, y_train_resampled)
y_pred_rf = rf.predict(X_test)
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

print("\nSupport Vector Machine")
svm = SVC(kernel='rbf', random_state=42)
svm.fit(X_train_resampled, y_train_resampled)
y_pred_svm = svm.predict(X_test)
print(confusion_matrix(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

print("\nLogistic Regression")
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train_resampled, y_train_resampled)
y_pred_logreg = logreg.predict(X_test)
print(confusion_matrix(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Random Forest Classifier
[[   25    59]
 [   43 10811]]
              precision    recall  f1-score   support

           0       0.37      0.30      0.33        84
           1       0.99      1.00      1.00     10854

    accuracy                           0.99     10938
   macro avg       0.68      0.65      0.66     10938
weighted avg       0.99      0.99      0.99     10938


Support Vector Machine
[[   32    52]
 [  754 10100]]
              precision    recall  f1-score   support

           0       0.04      0.38      0.07        84
           1       0.99      0.93      0.96     10854

    accuracy                           0.93     10938
   macro avg       0.52      0.66      0.52     10938
weighted avg       0.99      0.93      0.95     10938


Logistic Regression
[[  49   35]
 [4160 6694]]
              precision    recall  f1-score   support

  

Importance of Label Encoding:

Label Encoding is a preprocessing step used to convert categorical data into numerical values that can be understood by machine learning models. It assigns a unique integer to each category, enabling models to process categorical information.

Here are the key reasons why label encoding is important:

1. Compatibility with Machine Learning Models

Most machine learning algorithms (e.g., Random Forest, SVM, Logistic Regression) cannot handle categorical data directly and require numerical input.
Label encoding ensures categorical data is converted into numerical format, making it compatible with these algorithms.

2. Efficiency

Label encoding is computationally efficient because it simply maps each unique category to a number.
Compared to one-hot encoding, label encoding uses less memory since it does not create multiple binary columns.
Use Case: When the categorical variable has many unique classes, such as product IDs or user IDs.

3. Preserves Ordinality

If the categories have a natural order (e.g., education levels: 'High School' < 'Bachelors' < 'Masters'), label encoding preserves this order.
This is crucial for models that can interpret ordinal relationships (e.g., tree-based models).

4. Flexibility for Tree-Based Models

Algorithms like Decision Trees and Random Forests can effectively handle label-encoded data without issues of numerical misinterpretation because they split data based on thresholds, not the numerical magnitude of labels.
For example, a label-encoded ["Cat", "Dog", "Fish"] as [0, 1, 2] won't imply that 'Dog' (1) is midway between 'Cat' (0) and 'Fish' (2).

5. Facilitates Feature Engineering

Label encoding provides a simple numerical representation of categorical variables, enabling further transformations or feature engineering (e.g., scaling or interaction terms).

Potential Issues:

While label encoding is efficient, it has limitations:

Risk of Implicit Ordinality:

For non-ordinal data (e.g., ["Cat", "Dog", "Fish"]), models may misinterpret the numerical order as meaningful.
In such cases, One-Hot Encoding is a better alternative.
High Cardinality:

If a categorical feature has too many unique values (e.g., ["User1", "User2", ..., "User1000"]), the encoded integers can lead to sparsity or inefficiency.



Model Evaluation Report:

1. Random Forest Classifier
The model performs extremely well in identifying Good clients, achieving perfect recall and precision for Class 1.
However, the model struggles with identifying Bad clients, as seen by the low recall (32%) and precision (48%) for Class 0.
This suggests the model is biased towards the majority class (Good), likely due to class imbalance.

2. Support Vector Machine (SVM)
The SVM model achieves high recall (94%) and F1-score (97%) for Good clients but performs poorly for Bad clients.
Precision for Class 0 is very low (4%), indicating many false positives.
While overall accuracy is 93%, the poor handling of the minority class (Bad) limits its utility.

3. Logistic Regression
Logistic Regression struggles to classify Bad clients, with extremely low precision (1%) for Class 0.
It performs better on Good clients, achieving a recall of 66% and an F1-score of 0.79.
Overall accuracy is 65%, significantly lower than Random Forest or SVM.




Findings
Class Imbalance Issue:

All models struggle to identify Bad clients due to class imbalance (far more Good than Bad clients).
This imbalance heavily influences the performance of SVM and Logistic Regression.
Best Model:

Random Forest Classifier performs the best overall, with:
Perfect recall (1.00) for Good clients.
Slightly better handling of Bad clients compared to SVM and Logistic Regression.
Areas for Improvement:

Class Imbalance:
Further address imbalance with techniques like:
Oversampling minority class (Bad) using SMOTE.
Undersampling majority class (Good).
Experiment with weighted loss functions.
Feature Engineering:
Explore additional features or transformations to improve separability of the classes.
Conclusion
The Random Forest Classifier is currently the best-performing model with 99% accuracy and a perfect F1-score for Good clients. However, the overall performance can be further enhanced by addressing the class imbalance issue, as even Random Forest shows bias toward the majority class.