In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score,f1_score,precision_score, recall_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler,SMOTE
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

In [2]:

def get_effective_features():
    df = pd.read_csv('../data/application_data.csv')
    null_counts = df.isnull().sum()

    columns_with_more_than_500_null = null_counts[null_counts > 200].index

    # Dropping columns with more than 500 null values
    df = df.drop(columns_with_more_than_500_null, axis=1)

    # if the column is object type, fill it with mode else fill it with median
    for i in df.columns:
        if df[i].dtypes == 'object':
            df[i].fillna(df[i].mode()[0], inplace=True)
        else:
            df[i].fillna(df[i].median(), inplace=True)


    categorical = df.select_dtypes('object').columns
    label_encoder = preprocessing.LabelEncoder()
    for name in df[categorical].columns:
        df[name]= label_encoder.fit_transform(df[name])

    correlation_matrix = df.corr()
    correlation_with_target = correlation_matrix['TARGET'].abs().sort_values(ascending=False)

    top_25_correlated_columns = correlation_with_target[:25].index


    df_normalized = df[top_25_correlated_columns].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

    return df_normalized


In [3]:
df1 = get_effective_features()

In [4]:
X = df1.drop(['TARGET'],axis = 1)
target = df1['TARGET']
X_train, X_test, Y_train, Y_test = train_test_split(X, target, test_size= 0.1, random_state = 42)

In [5]:
# print('before Oversampling:',Counter(Y_train))
# oversample = RandomOverSampler(sampling_strategy='minority')
# X_train1, Y_train1 = oversample.fit_resample(X_train, Y_train)
# print('After Oversampling:',Counter(Y_train1))

In [6]:
# from imblearn.over_sampling import SMOTE
# from collections import Counter

# print('Before Oversampling:', Counter(Y_train))

# # Create a SMOTE object
# smote = SMOTE(sampling_strategy='minority')

# # Apply SMOTE to the training data
# X_train1, Y_train1 = smote.fit_resample(X_train, Y_train)

# print('After Oversampling:', Counter(Y_train1))


In [7]:
# Use SMOTEEN
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE

print('Before Oversampling:', Counter(Y_train))

smote_enn = SMOTEENN(smote=SMOTE(sampling_strategy='minority'))
X_train1, Y_train1 = smote_enn.fit_resample(X_train, Y_train)

Before Oversampling: Counter({0.0: 254453, 1.0: 22306})


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

model_rf = RandomForestClassifier()
model_rf.fit(X_train1, Y_train1)




In [13]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

proba = model_rf.predict_proba(X_test)[:, 1]

threshold = 0.5

Y_pred = (proba > threshold).astype(int)

accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

print("Accuracy: {:.2f}%, Precision: {:.2f}%, Recall: {:.2f}%,f1: {:.2f}%".format(
    accuracy * 100, precision * 100, recall * 100,f1 *100))


# In confusion matrix
cm = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:")
print(cm)


Accuracy: 79.85%, Precision: 15.42%, Recall: 32.55%,f1: 20.92%
Confusion Matrix:
[[23734  4499]
 [ 1699   820]]


In [10]:
# In ra 10 thuộc tính quan trọng nhất của mô hình Random Forest
feature_importances = model_rf.feature_importances_

# Sắp xếp theo độ quan trọng giảm dần
sorted_idx = feature_importances.argsort()[::-1]

# Chọn ra 10 thuộc tính quan trọng nhất
top_features = X.columns[sorted_idx][:10]

# In ra kết quả
print("Top 10 Important Features:")
for i, feature in enumerate(top_features, 1):
    print(f"{i}. {feature}: {feature_importances[sorted_idx[i-1]]:.4f}")


Top 10 Important Features:
1. HOUR_APPR_PROCESS_START: 0.1089
2. AMT_CREDIT: 0.1050
3. REGION_POPULATION_RELATIVE: 0.1047
4. DAYS_LAST_PHONE_CHANGE: 0.0993
5. DAYS_BIRTH: 0.0944
6. DAYS_ID_PUBLISH: 0.0793
7. DAYS_REGISTRATION: 0.0774
8. DAYS_EMPLOYED: 0.0688
9. ORGANIZATION_TYPE: 0.0598
10. NAME_EDUCATION_TYPE: 0.0405
