In [1]:
import pandas as pd 

### Importing The Data 

In [2]:
train_data = pd.read_csv("application_train.csv")
test_data = pd.read_csv("application_test.csv")
bureau = pd.read_csv("bureau.csv")
bureau_balance = pd.read_csv("bureau_balance.csv")
credit_card_balance = pd.read_csv("credit_card_balance.csv")
installments_payments = pd.read_csv("installments_payments.csv")
pos_cash = pd.read_csv("POS_CASH_balance.csv")
previous_application = pd.read_csv("previous_application.csv")
sample_submission = pd.read_csv("sample_submission.csv")

### Data Preprocessing and Data Cleaning

In [3]:
dataframes = [train_data, test_data, bureau, bureau_balance, credit_card_balance, installments_payments, pos_cash, previous_application]  # List of your DataFrames

for idx, df in enumerate(dataframes, start=1):
    print(f"Checking missing values in DataFrame {idx}:")
    missing_values = df.isnull().sum()
    total_cells = df.size
    total_missing = missing_values.sum()
    percentage_missing = (total_missing / total_cells) * 100
    
    print("\nMissing values in the DataFrame:")
    print(missing_values)
    
    print("\nPercentage of missing values: {:.2f}%".format(percentage_missing))
    print("="*40)

Checking missing values in DataFrame 1:

Missing values in the DataFrame:
SK_ID_CURR                        0
TARGET                            0
NAME_CONTRACT_TYPE                0
CODE_GENDER                       0
FLAG_OWN_CAR                      0
                              ...  
AMT_REQ_CREDIT_BUREAU_DAY     41519
AMT_REQ_CREDIT_BUREAU_WEEK    41519
AMT_REQ_CREDIT_BUREAU_MON     41519
AMT_REQ_CREDIT_BUREAU_QRT     41519
AMT_REQ_CREDIT_BUREAU_YEAR    41519
Length: 122, dtype: int64

Percentage of missing values: 24.40%
Checking missing values in DataFrame 2:

Missing values in the DataFrame:
SK_ID_CURR                       0
NAME_CONTRACT_TYPE               0
CODE_GENDER                      0
FLAG_OWN_CAR                     0
FLAG_OWN_REALTY                  0
                              ... 
AMT_REQ_CREDIT_BUREAU_DAY     6049
AMT_REQ_CREDIT_BUREAU_WEEK    6049
AMT_REQ_CREDIT_BUREAU_MON     6049
AMT_REQ_CREDIT_BUREAU_QRT     6049
AMT_REQ_CREDIT_BUREAU_YEAR    6049
Length:

In [4]:
for idx, df in enumerate(dataframes, start=1):
    print(f"Handling missing values in DataFrame {idx}:")
    

    total_cells = df.size
    total_missing = df.isnull().sum().sum()
    percentage_missing = (total_missing / total_cells) * 100
    
    print("\nPercentage of missing values: {:.2f}%".format(percentage_missing))
    
    for column in df.columns:
        if df[column].dtype == 'float64':  
            mean_value = df[column].mean()
            df[column].fillna(mean_value, inplace=True)
    
    print("Missing values handled.\n" + "="*40)

Handling missing values in DataFrame 1:

Percentage of missing values: 24.40%
Missing values handled.
Handling missing values in DataFrame 2:

Percentage of missing values: 23.81%
Missing values handled.
Handling missing values in DataFrame 3:

Percentage of missing values: 13.50%
Missing values handled.
Handling missing values in DataFrame 4:

Percentage of missing values: 0.00%
Missing values handled.
Handling missing values in DataFrame 5:

Percentage of missing values: 6.65%
Missing values handled.
Handling missing values in DataFrame 6:

Percentage of missing values: 0.01%
Missing values handled.
Handling missing values in DataFrame 7:

Percentage of missing values: 0.07%
Missing values handled.
Handling missing values in DataFrame 8:

Percentage of missing values: 17.98%
Missing values handled.


In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for column in train_data.select_dtypes(include=['object']).columns:
    train_data[column] = le.fit_transform(train_data[column])
    
print(train_data.head())

   SK_ID_CURR  TARGET  NAME_CONTRACT_TYPE  CODE_GENDER  FLAG_OWN_CAR  \
0      100002       1                   0            1             0   
1      100003       0                   0            0             0   
2      100004       0                   1            1             1   
3      100006       0                   0            0             0   
4      100007       0                   0            1             0   

   FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0                1             0          202500.0    406597.5      24700.5   
1                0             0          270000.0   1293502.5      35698.5   
2                1             0           67500.0    135000.0       6750.0   
3                1             0          135000.0    312682.5      29686.5   
4                1             0          121500.0    513000.0      21865.5   

   ...  FLAG_DOCUMENT_18  FLAG_DOCUMENT_19  FLAG_DOCUMENT_20  \
0  ...                 0    

In [6]:
for column, dtype in train_data.dtypes.iteritems():
    print(f"{column}: {dtype}")

SK_ID_CURR: int64
TARGET: int64
NAME_CONTRACT_TYPE: int32
CODE_GENDER: int32
FLAG_OWN_CAR: int32
FLAG_OWN_REALTY: int32
CNT_CHILDREN: int64
AMT_INCOME_TOTAL: float64
AMT_CREDIT: float64
AMT_ANNUITY: float64
AMT_GOODS_PRICE: float64
NAME_TYPE_SUITE: int32
NAME_INCOME_TYPE: int32
NAME_EDUCATION_TYPE: int32
NAME_FAMILY_STATUS: int32
NAME_HOUSING_TYPE: int32
REGION_POPULATION_RELATIVE: float64
DAYS_BIRTH: int64
DAYS_EMPLOYED: int64
DAYS_REGISTRATION: float64
DAYS_ID_PUBLISH: int64
OWN_CAR_AGE: float64
FLAG_MOBIL: int64
FLAG_EMP_PHONE: int64
FLAG_WORK_PHONE: int64
FLAG_CONT_MOBILE: int64
FLAG_PHONE: int64
FLAG_EMAIL: int64
OCCUPATION_TYPE: int32
CNT_FAM_MEMBERS: float64
REGION_RATING_CLIENT: int64
REGION_RATING_CLIENT_W_CITY: int64
WEEKDAY_APPR_PROCESS_START: int32
HOUR_APPR_PROCESS_START: int64
REG_REGION_NOT_LIVE_REGION: int64
REG_REGION_NOT_WORK_REGION: int64
LIVE_REGION_NOT_WORK_REGION: int64
REG_CITY_NOT_LIVE_CITY: int64
REG_CITY_NOT_WORK_CITY: int64
LIVE_CITY_NOT_WORK_CITY: int64
ORGA

In [7]:

target_proportion = train_data['TARGET'].value_counts(normalize=True)

print(target_proportion)

0    0.919271
1    0.080729
Name: TARGET, dtype: float64


In [8]:
train_data

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,0,1,0,1,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.000000,0.000,0.000000,0.000000,0.000000,1.000000
1,100003,0,0,0,0,0,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000
2,100004,0,1,1,1,1,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000
3,100006,0,0,0,0,1,0,135000.0,312682.5,29686.5,...,0,0,0,0,0.006402,0.007,0.034362,0.267395,0.265474,1.899974
4,100007,0,0,1,0,1,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,0,1,0,0,0,157500.0,254700.0,27558.0,...,0,0,0,0,0.006402,0.007,0.034362,0.267395,0.265474,1.899974
307507,456252,0,0,0,0,1,0,72000.0,269550.0,12001.5,...,0,0,0,0,0.006402,0.007,0.034362,0.267395,0.265474,1.899974
307508,456253,0,0,0,0,1,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.000000,0.000,0.000000,1.000000,0.000000,1.000000
307509,456254,1,0,0,0,1,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000


In [9]:
from imblearn.over_sampling import RandomOverSampler

X = train_data.drop(columns=['TARGET'])  
y = train_data['TARGET']  

oversampler = RandomOverSampler(random_state=42)

X_resampled, y_resampled = oversampler.fit_resample(X, y)


In [10]:
y_resampled

0         1
1         0
2         0
3         0
4         0
         ..
565367    1
565368    1
565369    1
565370    1
565371    1
Name: TARGET, Length: 565372, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_resampled,y_resampled,test_size=0.2, random_state=42)

In [14]:
X_test

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
76387,188579,0,0,1,1,0,50850.0,112068.0,11047.5,99000.0,...,0,0,0,0,0.000000,0.000,0.000000,3.000000,0.000000,4.000000
264986,406852,0,0,0,1,1,67500.0,640458.0,30942.0,517500.0,...,0,0,0,0,0.000000,0.000,0.000000,0.000000,0.000000,1.000000
68556,179510,0,1,1,1,2,135000.0,99504.0,10845.0,90000.0,...,0,0,0,0,0.000000,0.000,0.000000,1.000000,0.000000,4.000000
544960,230918,0,0,0,0,1,103500.0,1029681.0,33345.0,859500.0,...,0,0,0,0,0.000000,0.000,0.000000,0.000000,2.000000,2.000000
514526,245977,0,1,1,1,0,270000.0,1528200.0,53248.5,1350000.0,...,0,0,0,0,0.006402,0.007,0.034362,0.267395,0.265474,1.899974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323261,331466,0,0,0,1,0,252000.0,1185120.0,46251.0,900000.0,...,0,0,0,0,0.000000,0.000,0.000000,1.000000,1.000000,2.000000
552172,350245,1,0,0,1,0,81000.0,180000.0,9000.0,180000.0,...,0,0,0,0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000
37599,143552,0,0,0,1,1,180000.0,545040.0,43191.0,450000.0,...,0,0,0,0,0.000000,0.000,0.000000,0.000000,0.000000,1.000000
235197,372438,0,1,0,1,1,540000.0,2078802.0,105732.0,1984500.0,...,0,0,0,0,0.000000,0.000,0.000000,1.000000,0.000000,0.000000


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter=1000)  
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.68      0.68      0.68     56536
           1       0.68      0.68      0.68     56539

    accuracy                           0.68    113075
   macro avg       0.68      0.68      0.68    113075
weighted avg       0.68      0.68      0.68    113075



In [18]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56536
           1       1.00      1.00      1.00     56539

    accuracy                           1.00    113075
   macro avg       1.00      1.00      1.00    113075
weighted avg       1.00      1.00      1.00    113075

