In [None]:
import zipfile
import os
import numpy as np
import pandas as pd
import math

import warnings
warnings.filterwarnings("ignore")

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
pio.templates.default = 'plotly_dark'



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  OrdinalEncoder,OneHotEncoder ,  MinMaxScaler
from feature_engine.encoding import RareLabelEncoder
from sklearn.compose import ColumnTransformer

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA


from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [None]:
def extract_zip(file_path, extract_to='.'):
    """Unpack every member of a zip archive into a directory.

    Args:
        file_path: Path of the zip archive to read.
        extract_to: Directory that receives the extracted members
            (defaults to the current working directory).

    Returns:
        None. A message is printed either way: a notice when the archive is
        missing, or a confirmation once extraction has finished.
    """
    if os.path.exists(file_path):
        with zipfile.ZipFile(file_path, mode='r') as archive:
            archive.extractall(path=extract_to)
            print(f"Extracted all files to {extract_to}")
    else:
        print(f"The file {file_path} does not exist.")

In [None]:
# Competition archive name, resolved relative to the current working directory.
zip_file = 'playground-series-s4e10.zip'  
 

# Unpack into the current directory (the default target of extract_zip).
extract_zip(zip_file)

In [None]:
# Read the train and test splits from CSV files in the current working directory.
df_train = pd.read_csv('train.csv')
df_test =  pd.read_csv('test.csv')

In [None]:
def drop_column(train_df, test_df, column_name):
    """Remove one column from both frames, but only if both frames have it.

    Args:
        train_df: Training DataFrame.
        test_df: Test DataFrame.
        column_name: Name of the column to remove.

    Returns:
        A ``(train_df, test_df)`` tuple. When the column exists in both
        frames, new frames without it are returned; otherwise the inputs are
        returned untouched. A status message is printed in either case.
    """
    in_both = all(column_name in frame.columns for frame in (train_df, test_df))
    if not in_both:
        print(f"Column '{column_name}' not found in one or both dataframes.")
        return train_df, test_df

    trimmed_train = train_df.drop(columns=[column_name])
    trimmed_test = test_df.drop(columns=[column_name])
    print(f"Column '{column_name}' has been dropped from both dataframes.")
    return trimmed_train, trimmed_test


column_to_drop = 'id'  

# Note the naming switch: the raw frames are df_train/df_test, while the
# frames returned by drop_column are bound to train_df/test_df from here on.
train_df, test_df = drop_column(df_train, df_test, column_to_drop)

In [None]:
columns_to_convert = ['person_home_ownership', 'loan_intent', 'loan_grade','cb_person_default_on_file']  # Replace with your column names

# Cast the listed columns to pandas' category dtype in both frames.
# NOTE(review): each frame is cast independently, so the two category sets
# are whatever values each frame happens to contain — they may differ.
for column in columns_to_convert:
    train_df[column] = train_df[column].astype('category')
    test_df[column] = test_df[column].astype('category')
# Only train_df's loan_status is cast here; it becomes a boolean column.
train_df['loan_status'] = train_df['loan_status'].astype('bool')

In [None]:
def check_unique_values_in_train(train_df, test_df, column_name):
    """Report the values of a column that occur in train but never in test.

    Args:
        train_df: Training DataFrame.
        test_df: Test DataFrame.
        column_name: Column to compare across the two frames.

    Returns:
        The set of values found only in ``train_df[column_name]`` (possibly
        empty), or ``None`` when the column is absent from either frame.
        The outcome is also printed.
    """
    if column_name not in train_df.columns or column_name not in test_df.columns:
        print(f"Column '{column_name}' not found in one or both dataframes.")
        return None

    seen_in_train = set(train_df[column_name].unique())
    seen_in_test = set(test_df[column_name].unique())
    missing_values = seen_in_train - seen_in_test

    if not missing_values:
        print(f"All unique values in '{column_name}' from train are present in test.")
    else:
        print(f"Values in '{column_name}' from train but not in test: {missing_values}")

    return missing_values




In [None]:
# Run the train-vs-test value check on every category-dtype column of train_df.
columns_to_check  = train_df.select_dtypes(['category']).columns.tolist()
for column in columns_to_check:
        print(f"\nChecking column: {column}")
        check_unique_values_in_train(train_df, test_df, column)

In [None]:
col1 = 'loan_amnt'  
col2 = 'person_income' 

# Loan amount as a percentage of income: the ratio is rounded to 4 decimals
# and then scaled by 100. The result is written to 'loan_percent_income' in
# both frames, replacing any column of that name that is already present.
train_df['loan_percent_income'] = round(train_df[col1] / train_df[col2], 4)*100
test_df['loan_percent_income'] = round(test_df[col1] / test_df[col2], 4)*100

In [None]:
# float64/int64 columns of train_df; the boolean loan_status is not selected.
# numerical_cols is reused by the correlation heatmap cell.
numerical_cols = train_df.select_dtypes(include=['float64', 'int64']).columns



# Pairwise scatter plots of the numeric columns, coloured by the target.
fig = px.scatter_matrix(
    train_df,
    dimensions=numerical_cols,
    title="Pair Plot of Numerical Columns",
    color='loan_status',  
    color_discrete_map={True: '#0076a9', False: '#c1ceda'},  
    height=1000,width=1000
)

fig.update_layout(
    showlegend=True
)

fig.show()

In [None]:
# Pairwise correlation of the numeric columns (DataFrame.corr default method),
# rounded to 3 decimals so the cell annotations stay readable.
correlation_matrix = train_df[numerical_cols].corr().round(3)

fig = px.imshow(
    correlation_matrix,
    color_continuous_scale='Blues',
    title='Correlation Matrix Heatmap',
    labels=dict(x='Features', y='Features', color='Correlation Coefficient'),text_auto=True
)

fig.update_layout(
    width=800, 
    height=800,  
)
# Hide the colour bar; text_auto already prints each coefficient in its cell.
fig.update_coloraxes(showscale=False)

fig.show()

Heatmap Observations

1. Most of the features exhibited very low correlations, with values close to zero, showing weak or no linear relationship between them.
2. Two features, **loan_int_rate** and **loan_percent_income**, showed a negative but very weak correlation with **person_income** and **person_emp_length** respectively, indicating a slight inverse relationship.
3. Two features, **person_age** and **cb_person_cred_hist_length**, showed a very strong positive correlation with each other, implying a significant linear relationship.
4. Also, **loan_amnt** and **loan_percent_income** were positively correlated with each other.


In [None]:

def plot_binary_counts_and_percentages(df, column_name):
    """Show side-by-side bar charts of counts and percentages for a column.

    Args:
        df: DataFrame holding the column.
        column_name: Column whose value frequencies are plotted. Both x-axes
            are pinned to ticks 0 and 1, so a two-valued column is expected.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Colours are applied in value_counts() order (most frequent value first).
    bar_colors = ['#c1ceda', '#0076a9']

    value_counts = df[column_name].value_counts()
    value_shares = value_counts / value_counts.sum() * 100

    fig = make_subplots(rows=1, cols=2, subplot_titles=("Count Plot", "Percentage Plot"))

    # Left panel: raw counts; right panel: the same values as percentages.
    for panel_col, series in enumerate((value_counts, value_shares), start=1):
        fig.add_trace(
            go.Bar(x=series.index, y=series.values, marker_color=bar_colors),
            row=1, col=panel_col,
        )

    def _binary_axis():
        # Fresh dict per axis: ticks at 0 and 1 with a little padding either side.
        return dict(tickvals=[0, 1], ticktext=["0", "1"], range=[-0.5, 1.5])

    pretty_name = column_name.replace('_', ' ')
    fig.update_layout(
        title_text=f"Distribution of {pretty_name}",
        xaxis1_title=pretty_name,
        xaxis2_title=pretty_name,
        yaxis_title="Counts",
        yaxis2_title="Percentage (%)",
        xaxis=_binary_axis(),
        xaxis2=_binary_axis(),
        showlegend=False
    )

    fig.show()






# Class balance of the target column in the training frame.
plot_binary_counts_and_percentages(train_df, 'loan_status')


In [None]:
def plot_distributions_plotly(train_df, test_df,rows=3, cols=3):
    """Overlay train and test histograms for every shared numeric column.

    Args:
        train_df: Training DataFrame; its float64/int64 columns are plotted.
        test_df: Test DataFrame; only columns also present here are plotted.
        rows: Unused. Kept for backward compatibility — the number of subplot
            rows is derived from the number of plotted columns and ``cols``.
        cols: Number of subplot columns in the grid.

    Returns:
        None. The figure is displayed with ``fig.show()``; if there is no
        shared numeric column a message is printed instead.
    """
    train_color = '#004c6d'
    test_color = '#d0d8e0'
    numeric_columns = train_df.select_dtypes(include=['float64', 'int64']).columns

    valid_columns = [name for name in numeric_columns if name in test_df.columns]
    if not valid_columns:
        # make_subplots cannot build a grid with zero rows.
        print("No numeric columns shared by both dataframes; nothing to plot.")
        return

    total_rows = math.ceil(len(valid_columns) / cols)

    # Plain concatenation instead of an f-string: reusing the enclosing quote
    # character inside an f-string expression is a SyntaxError before
    # Python 3.12.
    subplot_titles = ['Distribution of ' + name.replace('_', ' ') for name in valid_columns]

    fig = make_subplots(rows=total_rows, cols=cols, shared_xaxes=False,
                        subplot_titles=subplot_titles)

    for i, column in enumerate(valid_columns):
        # Fill the grid row by row (1-based subplot coordinates).
        row = (i // cols) + 1
        col = (i % cols) + 1

        # Only the first pair of traces contributes legend entries, so the
        # legend shows a single 'Train' and a single 'Test' item.
        train_trace = go.Histogram(x=train_df[column], name='Train' if i == 0 else '',
                                   marker_color=train_color, opacity=0.8, nbinsx=100,
                                   showlegend=(i == 0))

        test_trace = go.Histogram(x=test_df[column], name='Test' if i == 0 else '',
                                  marker_color=test_color, opacity=0.9, nbinsx=100,
                                  showlegend=(i == 0))

        fig.add_trace(train_trace, row=row, col=col)
        fig.add_trace(test_trace, row=row, col=col)

        fig.update_xaxes(title_text=column.replace('_', ' '), row=row, col=col)

    fig.update_layout(height=1000, width=1200, title_text="Distribution of Train and Test Columns",
                      barmode='overlay', showlegend=True)

    fig.show()




In [None]:
# Compare train and test distributions using the default 3-column grid.
plot_distributions_plotly(train_df, test_df)

1. Train and test data show similar distributions across all the numerical columns.
2. The following two columns contain values above 100 in the training data; the affected rows need to be removed:
   - **person age**
   - **person emp length**
   - In both columns the offending value is 123.
3. A few people in both the train and test data have a loan amount that is a large percentage of their income.
4. It would be sensible to create a categorical column from cb_person_cred_hist_length with the levels short, medium and long.

In [None]:
# Open-ended outer edges so every value falls in a bin. pd.cut's default
# right-closed intervals give: <=4 SHORT, (4, 10] MEDIUM, >10 LONG.
bins = [-float('inf'), 4, 10, float('inf')]  
labels = ['SHORT', 'MEDIUM', 'LONG']  

train_df['cred_hist_length_cat'] = pd.cut(train_df['cb_person_cred_hist_length'], bins=bins, labels=labels)
test_df['cred_hist_length_cat'] = pd.cut(test_df['cb_person_cred_hist_length'], bins=bins, labels=labels)

In [None]:
# Inspect training rows with an age above 100.
train_df[train_df['person_age']>100]

In [None]:
# Inspect training rows with an employment length above 100.
train_df[train_df['person_emp_length']>100]

In [None]:
# col1/col2 are rebound here; an earlier cell used them for the loan-ratio columns.
col1 = 'person_age'  
col2 = 'person_emp_length'  
threshold = 100  


# True for rows where either column exceeds the threshold.
mask = (train_df[col1] > threshold) | (train_df[col2] > threshold)


# Drop the flagged rows from the training frame only (test_df is untouched)
# and renumber the index from 0.
train_df = train_df[~mask].reset_index(drop=True)

In [None]:
# keep=False marks every member of a duplicate group, not just the repeats.
train_df[train_df.duplicated(keep=False)]

In [None]:
# Keep the first occurrence of each duplicated row and renumber the index from 0.
train_df = train_df.drop_duplicates().reset_index(drop=True)

In [None]:
def plot_categorical_percentage_all(train_df, test_df, image_path="3.jpeg"):
    """Plot train-vs-test percentage bars for every categorical column.

    Args:
        train_df: Training DataFrame; its category-dtype columns are plotted.
        test_df: Test DataFrame; must contain the same columns.
        image_path: Where to export a static image of the figure. Defaults to
            ``"3.jpeg"`` (the file name this function has always written);
            pass ``None`` to skip the export, which requires plotly's static
            image export backend to be installed.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    categorical_columns = train_df.select_dtypes(include=['category']).columns.tolist()

    num_cols = 2
    num_rows = math.ceil(len(categorical_columns) / num_cols)

    subplot_titles = [name.replace('_', ' ') for name in categorical_columns]

    fig = make_subplots(rows=num_rows, cols=num_cols, subplot_titles=subplot_titles, shared_yaxes=True)

    for i, column in enumerate(categorical_columns):
        # Fill the grid row by row (1-based subplot coordinates).
        row = (i // num_cols) + 1
        col = (i % num_cols) + 1

        # One grouped bar series per split; only the first subplot's traces
        # contribute legend entries so 'Train'/'Test' appear once each.
        for frame, split_name, color in ((train_df, 'Train', '#004c6d'),
                                         (test_df, 'Test', '#d0d8e0')):
            percent = (frame[column].value_counts(normalize=True) * 100).reset_index()
            # Overwrite the column labels so the code does not depend on how
            # the installed pandas version names value_counts() output.
            percent.columns = [column, 'Percentage']

            fig.add_trace(
                go.Bar(x=percent[column], y=percent['Percentage'], name=split_name,
                       marker=dict(color=color), showlegend=(i == 0)),
                row=row, col=col
            )

    fig.update_layout(height=800, width=1000, title_text="Categorical Column Distribution (Train vs Test)",
                      barmode='group',showlegend=True)

    # Display first so the figure is still shown if the image export fails.
    fig.show()

    if image_path is not None:
        fig.write_image(image_path, width=2000, height=500)



In [None]:
# The upper edge is open-ended (like the credit-history bins) so that ages
# above 100 fall into 'ELDER' instead of becoming NaN. Only train_df has had
# such rows removed; test_df is binned as-is, and a NaN category may be
# rejected by a downstream encoder that uses a fixed category list.
bins = [0, 28, 50, float('inf')] 
labels = ['YOUNG', 'MIDDLE', 'ELDER']  

# pd.cut's default right-closed intervals: (0, 28] YOUNG, (28, 50] MIDDLE, >50 ELDER.
train_df['age_category'] = pd.cut(train_df['person_age'], bins=bins, labels=labels)
test_df['age_category'] = pd.cut(test_df['person_age'], bins=bins, labels=labels)

In [None]:
# Compare category shares between the two frames, including the binned
# columns (cred_hist_length_cat, age_category) created in earlier cells.
plot_categorical_percentage_all(train_df, test_df)

In [None]:
def create_pipeline(train_df, test_df, target_column, ordinal_columns):
    """Split the training data, fit the preprocessor and transform all splits.

    The labelled data is split 70/30 (stratified on the target, fixed seed).
    The preprocessor is fitted on the 70% part only and then applied to the
    30% hold-out and to ``test_df``.

    Args:
        train_df: Labelled DataFrame containing ``target_column``.
        test_df: Unlabelled DataFrame with the same feature columns.
        target_column: Name of the target column in ``train_df``.
        ordinal_columns: Names of the ordinal columns, in the order of the
            hard-coded category lists below (credit-history length first,
            age category second).

    Returns:
        A 6-tuple ``(X_train_df, X_test_df, y_train, y_test, X_final_test_df,
        train_index)``: three transformed feature frames with a fresh 0..n-1
        index, the two target series with their index reset, and the original
        index labels of the rows that went into the training split.
    """
    ordinal_columns = list(ordinal_columns)

    X = train_df.drop(columns=[target_column])
    y = train_df[target_column]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
    categorical_cols = X_train.select_dtypes(include=['object','category']).columns.tolist()

    # Ordinal columns get their own encoder, so keep them out of the one-hot branch.
    categorical_cols = [col for col in categorical_cols if col not in ordinal_columns]

    # Positional: the i-th list is the category order of ordinal_columns[i].
    ordinal_categories = [
        ['SHORT', 'MEDIUM', 'LONG'],
        ['YOUNG', 'MIDDLE', 'ELDER'],
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('numerical data', MinMaxScaler(), numerical_cols),
            ('ordinal data', OrdinalEncoder(categories=ordinal_categories), ordinal_columns),
            ('categorical data', Pipeline(steps=[
                ('rare label encoder', RareLabelEncoder(tol=0.1, n_categories=2)),
                ('onehot encoder', OneHotEncoder(drop='first'))
            ]), categorical_cols)
        ],
        # Always return a dense array: the results are wrapped in DataFrames
        # with explicit column names below, which does not work for a SciPy
        # sparse matrix.
        sparse_threshold=0,
    )

    display(preprocessor)
    X_train_transformed = preprocessor.fit_transform(X_train)

    # Output columns follow the transformer order: numeric, ordinal, one-hot.
    one_hot_cols = preprocessor.named_transformers_['categorical data']['onehot encoder'].get_feature_names_out(categorical_cols)
    col_names = numerical_cols + ordinal_columns + one_hot_cols.tolist()
    X_train_df = pd.DataFrame(X_train_transformed, columns=col_names)

    X_test_transformed = preprocessor.transform(X_test)
    X_test_df = pd.DataFrame(X_test_transformed, columns=col_names)

    X_final_test_transformed = preprocessor.transform(test_df)
    X_final_test_df = pd.DataFrame(X_final_test_transformed, columns=col_names)

    return X_train_df, X_test_df, y_train.reset_index(drop=True), y_test.reset_index(drop=True), X_final_test_df , X_train.index


In [None]:
# The ordinal column order must match the category lists hard-coded in
# create_pipeline (credit-history length first, age category second).
X_train, X_test, y_train, y_test, X_final_test ,train_index = create_pipeline(train_df, test_df, 'loan_status', ['cred_hist_length_cat','age_category'])

In [None]:
# Candidate boosted-tree models, all seeded with 42 and otherwise left at
# library defaults. The same estimator instances are re-fitted each time the
# dict is passed to train_and_evaluate_models.
# NOTE(review): use_label_encoder looks like a legacy XGBoost option —
# confirm it is still accepted by the installed version.
classifiers = {
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss',random_state=42),
    "CatBoost": CatBoostClassifier(silent=True,random_state=42),
    "LightGBM": LGBMClassifier(verbose=-1,random_state=42),
}

In [None]:


def train_and_evaluate_models(classifiers, X_train, y_train, X_test, y_test,):
    """Fit each classifier and tabulate accuracy and ROC AUC on both splits.

    Args:
        classifiers: Mapping of display name -> unfitted estimator exposing
            ``fit``, ``predict`` and ``predict_proba``. Estimators are fitted
            in place.
        X_train, y_train: Training features and labels.
        X_test, y_test: Hold-out features and labels.

    Returns:
        DataFrame with one row per classifier and the columns "Classifier",
        "Train Accuracy", "Test Accuracy", "Train AUC" and "Test AUC",
        sorted by "Test AUC" in descending order with a fresh index.
    """
    score_rows = []

    for name, clf in classifiers.items():
        print(f"Training model: {name}...")  

        clf.fit(X_train, y_train)

        # Hard labels for accuracy.
        labels_train = clf.predict(X_train)
        labels_test = clf.predict(X_test)

        # Column 1 of predict_proba is the positive-class probability, used for AUC.
        proba_train = clf.predict_proba(X_train)[:, 1]
        proba_test = clf.predict_proba(X_test)[:, 1]

        score_rows.append({
            "Classifier": name,
            "Train Accuracy": accuracy_score(y_train, labels_train),
            "Test Accuracy": accuracy_score(y_test, labels_test),
            "Train AUC": roc_auc_score(y_train, proba_train),
            "Test AUC": roc_auc_score(y_test, proba_test),
        })

    ranked = pd.DataFrame(score_rows).sort_values(by="Test AUC", ascending=False)
    return ranked.reset_index(drop=True)





In [None]:
# Baseline scores for every candidate model on the initial 70/30 split.
results_baseline =train_and_evaluate_models(classifiers, X_train, y_train, X_test, y_test)
results_baseline

In [None]:
def train_and_predict_lgbm(X_train, y_train, X_test, y_test=None):
    """Fit a default LightGBM classifier and return its hold-out predictions.

    Args:
        X_train, y_train: Training features and labels.
        X_test: Hold-out features to predict.
        y_test: Hold-out labels used for the printed test accuracy and AUC.
            Optional for backward compatibility: when omitted, the
            notebook-level variable ``y_test`` is used if it exists (the
            function previously read that global implicitly); if no labels
            can be found, only the training accuracy is reported.

    Returns:
        Array of predicted class labels for ``X_test``.
    """
    if y_test is None:
        # Legacy behaviour: earlier callers relied on a global named y_test.
        y_test = globals().get("y_test")

    model = LGBMClassifier(random_state=42, verbose=-1)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    train_acc = accuracy_score(y_train, model.predict(X_train))
    print(f"Training Accuracy: {train_acc:.6f}")

    if y_test is None:
        print("No y_test available; skipping test accuracy and AUC.")
        return y_pred

    test_acc = accuracy_score(y_test, y_pred)

    # Column 1 of predict_proba is the positive-class probability.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)

    print(f"Testing Accuracy: {test_acc:.6f}")
    print(f"AUC Score: {auc:.6f}")

    return y_pred

In [None]:
# Hold-out class predictions from a default LightGBM model; reused by the
# PCA prediction plots in later cells.
y_pred = train_and_predict_lgbm(X_train, y_train, X_test)

In [None]:
def apply_pca(X_train, X_test, y_train, y_test,X_final_test, n_components=2):
    """Project all three feature sets onto principal components fitted on train.

    Args:
        X_train: Training features; the PCA is fitted on these only.
        X_test: Hold-out features.
        y_train: Training labels, attached as a ``target`` column. Assignment
            aligns on index, so a Series should carry a 0..n-1 index.
        y_test: Hold-out labels, attached the same way.
        X_final_test: Unlabelled features to project.
        n_components: Passed straight to ``PCA``.

    Returns:
        ``(train_df, test_df, final_test_df)``. Component columns are named
        ``PCA1``, ``PCA2``, ... (one per fitted component); the first two
        frames also have a ``target`` column.
    """
    pca = PCA(n_components=n_components,random_state=42)

    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    X_final_test_pca = pca.transform(X_final_test)

    # Name the columns from the actual projection width so that values of
    # n_components other than 2 no longer fail with a column-count mismatch.
    component_names = [f'PCA{number}' for number in range(1, X_train_pca.shape[1] + 1)]

    train_df = pd.DataFrame(X_train_pca, columns=component_names)
    train_df['target'] = y_train

    test_df = pd.DataFrame(X_test_pca, columns=component_names)
    test_df['target'] = y_test

    final_test_df = pd.DataFrame(X_final_test_pca, columns=component_names)

    return train_df, test_df , final_test_df

In [None]:
# Two components so the projections can be drawn as 2-D scatter plots.
train_df_pca , test_df_pca , final_test_df_pca = apply_pca(X_train, X_test, y_train, y_test,X_final_test, n_components=2)

In [None]:
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd

def plot_pca(train_df, test_df, final_test_df):
    """Draw stacked PCA scatter plots for the train, test and final-test sets.

    Args:
        train_df: Three-column frame (two components plus the target).
        test_df: Three-column frame (two components plus the target).
        final_test_df: Two-column frame (the two components).

    Returns:
        None. The figure is displayed with ``fig.show()``.

    Note:
        The column labels of all three frames are overwritten in place.
    """
    # Normalise the column names on the callers' frames.
    train_df.columns = ['PCA1', 'PCA2', 'target']
    test_df.columns = ['PCA1', 'PCA2', 'target']
    final_test_df.columns = ['PCA1', 'PCA2']

    fig = make_subplots(rows=3, cols=1,
                        subplot_titles=(
                            'PCA Projection of Training Data',
                            'PCA Projection of Test Data',
                            'PCA Projection of Final Test Data'
                        ))

    def _target_colouring():
        # Fresh keyword dict per panel: colour points by the target value.
        return dict(color='target', color_discrete_map={0: '#0076a9', 1: '#c1ceda'})

    # Panels top to bottom; the unlabelled final-test set has no colouring.
    panels = (
        (train_df, _target_colouring()),
        (test_df, _target_colouring()),
        (final_test_df, {}),
    )
    for row_number, (frame, colouring) in enumerate(panels, start=1):
        scatter = px.scatter(frame, x='PCA1', y='PCA2', **colouring)
        for trace in scatter.data:
            fig.add_trace(trace, row=row_number, col=1)

    fig.update_layout(
        height=900, 
        width=1000, 
        title_text='PCA Projection of Datasets',
        showlegend=True  
    )

    # Link the axes of all panels so they share one range.
    fig.update_xaxes(title_text='PCA1', matches='x')  
    fig.update_yaxes(title_text='PCA2', matches='y')  

    # Only the first two traces keep a legend entry; later ones would repeat them.
    for position, trace in enumerate(fig.data):
        if position >= 2:
            trace.showlegend = False

    fig.show()




In [None]:
# Note: plot_pca rewrites the column labels of the three frames in place.
plot_pca(train_df_pca , test_df_pca , final_test_df_pca)

In [None]:


def assign_clusters(main_df, pca_df, pca_column,train_index):
    """Label rows with a cluster derived from fixed ranges of one PCA column.

    Ranges on ``pca_df[pca_column]``:
        '1': value <= -0.5
        '2': 0 <= value <= 0.5
        '3': 0.5 < value <= 1
        '4': 1.4 < value <= 2
        '5': value > 2
    Anything else (including the gaps (-0.5, 0) and (1, 1.4]) is 'Outlier'.

    Args:
        main_df: Frame that receives a ``cluster`` column (modified in place).
        pca_df: Frame holding the PCA column; also gains a ``cluster`` column
            in place.
        pca_column: Name of the column in ``pca_df`` to threshold.
        train_index: Index labels of ``main_df`` that correspond, in order,
            to the rows of ``pca_df``.

    Returns:
        ``main_df``. Rows outside ``train_index`` have no cluster value.
    """
    scores = pca_df[pca_column]

    conditions = [
        (scores <= -0.5),
        (scores >= 0) & (scores <= 0.5),
        (scores > 0.5) & (scores <= 1),
        (scores > 1.4) & (scores <= 2),
        (scores > 2),
    ]

    # String labels so the choices share a dtype with the 'Outlier' default:
    # integer choices combined with a string default make np.select raise
    # TypeError on recent NumPy versions, while older versions silently
    # converted the integers to these same strings.
    cluster_labels = ['1', '2', '3', '4', '5']

    pca_df['cluster'] = np.select(conditions, cluster_labels, default='Outlier')

    # Copy positionally (.values) onto the matching rows of main_df.
    main_df.loc[train_index,'cluster'] = pca_df['cluster'].values

    return main_df


In [None]:
# Work on a copy so train_df itself does not gain the 'cluster' column.
main_df = train_df.copy()
# train_index holds the train_df row labels that went into the training split,
# which is the row order of train_df_pca.
clustered_df = assign_clusters(main_df, train_df_pca, 'PCA1',train_index)

In [None]:


def plot_by_clusters(clustered_df, feature1, feature2, clusters):
    """Plot, per cluster, the value counts of two categorical features.

    One subplot row per cluster: ``feature1`` counts on the left and
    ``feature2`` counts on the right.

    Args:
        clustered_df: DataFrame with a ``cluster`` column plus both features.
        feature1: Left-hand categorical column; bars are coloured by the
            SHORT/MEDIUM/LONG palette (black for any other value).
        feature2: Right-hand categorical column; bars are coloured by the
            YOUNG/MIDDLE/ELDER palette (black for any other value).
        clusters: Cluster labels to plot, in row order.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    feature1_colors = {
        'SHORT': '#122f5c',
        'MEDIUM': '#c33f75',
        'LONG': '#ffa600'
    }

    feature2_colors = {
        'YOUNG': '#122f5c',
        'MIDDLE': '#c33f75',
        'ELDER': '#ffa600'
    }

    num_cols = 2
    num_rows = len(clusters)

    # Each cluster title is repeated once per subplot column.
    subplot_titles = [f'Cluster {cluster}' for cluster in clusters for _ in range(num_cols)]

    # plotly rejects a vertical spacing above 1 / (rows - 1), so cap the
    # preferred 0.1 when many clusters are requested.
    max_spacing = 1 / (num_rows - 1) if num_rows > 1 else 1
    fig = make_subplots(rows=num_rows, cols=num_cols,
                        subplot_titles=subplot_titles,
                        vertical_spacing=min(0.1, max_spacing))

    panels = ((feature1, feature1_colors), (feature2, feature2_colors))
    for i, cluster in enumerate(clusters):
        cluster_data = clustered_df[clustered_df['cluster'] == cluster]

        for col_number, (feature, palette) in enumerate(panels, start=1):
            counts = cluster_data[feature].value_counts().sort_index().reset_index()
            counts.columns = [feature, 'Count']

            fig.add_trace(
                go.Bar(
                    x=counts[feature],
                    y=counts['Count'],
                    name=feature,
                    marker=dict(color=[palette.get(cat, '#000000') for cat in counts[feature]])
                ),
                row=i + 1, col=col_number
            )

    fig.update_layout(
        title='Distribution of Important Categorical Features Across Clusters',
        xaxis_title='',
        yaxis_title='Count',
        height=800,
        width=1000,
        showlegend=False
    )

    # Label the x-axes on the bottom row only, whatever the number of
    # clusters (this used to be tied to exactly six rows). Done after
    # update_layout so the blank title set there cannot erase the label
    # when there is a single row.
    if num_rows:
        fig.update_xaxes(title_text=feature1.replace('_', ' '), row=num_rows, col=1)
        fig.update_xaxes(title_text=feature2.replace('_', ' '), row=num_rows, col=2)

    fig.show()





# Cluster labels are compared as strings against the 'cluster' column.
clusters = ['1', '2', '3', '4','5', 'Outlier']
plot_by_clusters(clustered_df, 'cred_hist_length_cat', 'age_category', clusters)


In [None]:


import plotly.express as px
from plotly.subplots import make_subplots

def plot_dimensionality_reduction_with_pred(test_df_pca, y_pred):
    """Plot the test-set PCA projection coloured by true and predicted labels.

    Args:
        test_df_pca: Frame with ``PCA1``, ``PCA2`` and ``target`` columns. A
            ``y_pred`` column is added to it in place.
        y_pred: Predicted labels, one per row of ``test_df_pca``.

    Returns:
        The same ``test_df_pca`` object, now carrying the ``y_pred`` column.
    """
    test_df_pca['y_pred'] = y_pred

    fig = make_subplots(rows=2, cols=1,
                        subplot_titles=(
                            'PCA Projection - True Test Labels',
                            'PCA Projection - Predicted Labels'
                        ))

    # Top panel: ground truth; bottom panel: model predictions.
    for row_number, label_column in enumerate(('target', 'y_pred'), start=1):
        scatter = px.scatter(
            test_df_pca,
            x='PCA1',
            y='PCA2',
            color=label_column,
            color_discrete_map={0: '#0076a9', 1: '#c1ceda'}
        )
        for trace in scatter.data:
            fig.add_trace(trace, row=row_number, col=1)

    fig.update_layout(
        height=900, 
        width=1000, 
        title_text='PCA Projection with True and Predicted Labels',
        showlegend=True  
    )

    # Link the axes of both panels so they share one range.
    fig.update_xaxes(title_text='PCA1', matches='x')  
    fig.update_yaxes(title_text='PCA2', matches='y')  

    # Only the first two traces keep a legend entry; later ones would repeat them.
    for position, trace in enumerate(fig.data):
        if position >= 2:
            trace.showlegend = False

    fig.show()

    return test_df_pca





In [None]:
# The returned frame is the same object, now with a 'y_pred' column added.
test_df_pca = plot_dimensionality_reduction_with_pred(test_df_pca, y_pred)


In [None]:


def plot_classification_comparison(test_df):
    """Scatter the PCA projection coloured by confusion-matrix outcome.

    Args:
        test_df: Frame with ``PCA1``, ``PCA2``, ``target`` and ``y_pred``
            columns. A ``classification`` column is added to it in place,
            holding 'True Positive', 'True Negative', 'False Positive',
            'False Negative', or 'Unknown' when no rule matches.

    Returns:
        None. The figure is displayed with ``show()``.
    """
    truth = test_df['target']
    guess = test_df['y_pred']

    # Outcome name -> boolean mask of the rows it applies to.
    outcome_rules = {
        'True Positive': (truth == 1) & (guess == 1),
        'True Negative': (truth == 0) & (guess == 0),
        'False Positive': (truth == 0) & (guess == 1),
        'False Negative': (truth == 1) & (guess == 0),
    }
    test_df['classification'] = np.select(
        list(outcome_rules.values()), list(outcome_rules.keys()), default='Unknown'
    )

    classification_colors = {
        'True Positive': '#0076a9',
        'True Negative': '#c1ceda',
        'False Positive': '#ff6666',
        'False Negative': '#ffcc00'
    }

    fig_comparison = px.scatter(
        test_df,
        x='PCA1',
        y='PCA2',
        color='classification',
        title='PCA Projection (True/False Positives/Negatives)',
        color_discrete_map=classification_colors,
        height=600
    )

    fig_comparison.show()


In [None]:
# Besides plotting, this adds the 'classification' column to test_df_pca,
# which the following cells filter on.
plot_classification_comparison(test_df_pca)

In [None]:

# Hold-out rows the model misclassified (false negative or false positive)
# whose first principal component is below -0.5.
condition = (test_df_pca['PCA1'] < -0.5) & (
    (test_df_pca['classification'] == 'False Negative') | 
    (test_df_pca['classification'] == 'False Positive')
)



In [None]:
filtered_df = test_df_pca[condition]

# Reproducible random half of the flagged rows.
sampled_df = filtered_df.sample(frac=0.5, random_state=42) 

In [None]:
def move_data_by_indexes(X_train, X_test, y_train, y_test, indexes):
    """Move the rows with the given index labels from the test set to the train set.

    Args:
        X_train, y_train: Current training features and labels.
        X_test, y_test: Current test features and labels; ``indexes`` are
            labels of these two objects.
        indexes: Index labels of the test rows to move.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as new objects, each with a
        fresh 0..n-1 index. The moved rows are appended to the end of the
        training data. The inputs are not modified.
    """
    moved_X, moved_y = X_test.loc[indexes], y_test.loc[indexes]

    remaining_X = X_test.drop(indexes).reset_index(drop=True)
    remaining_y = y_test.drop(indexes).reset_index(drop=True)

    # ignore_index=True already yields a fresh 0..n-1 index.
    grown_X = pd.concat([X_train, moved_X], ignore_index=True)
    grown_y = pd.concat([y_train, moved_y], ignore_index=True)

    return grown_X, remaining_X, grown_y, remaining_y



In [None]:
# NOTE(review): the moved rows were picked using hold-out labels and the
# model's hold-out predictions, so scores computed on the remaining X_test
# are no longer comparable with the baseline — confirm this is intended.
# This cell also rebinds the split variables, so re-running it on its own
# is not idempotent.
X_train, X_test, y_train, y_test = move_data_by_indexes(X_train, X_test, y_train, y_test, sampled_df.index)

In [None]:
# Re-fit the same estimator instances on the enlarged training set and score
# them on the reduced hold-out set.
results_baseline_2 = train_and_evaluate_models(classifiers, X_train, y_train, X_test, y_test)
results_baseline_2

In [None]:


# Final model: same configuration as train_and_predict_lgbm, fitted on the
# current X_train/y_train. The steps are repeated inline because the fitted
# model is needed afterwards to score X_final_test.
model = LGBMClassifier(verbose=-1,random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

train_acc = accuracy_score(y_train, model.predict(X_train))
test_acc = accuracy_score(y_test, y_pred)

# Column 1 of predict_proba is the positive-class probability.
y_pred_proba = model.predict_proba(X_test)[:, 1] 
auc = roc_auc_score(y_test, y_pred_proba)

print(f"Training Accuracy: {train_acc:.6f}")
print(f"Testing Accuracy: {test_acc:.6f}")
print(f"AUC Score: {auc:.6f}")

# Positive-class probabilities for the unlabelled set (not hard labels).
final_test_pred = model.predict_proba(X_final_test)[:, 1]


In [None]:
def save_predictions_to_csv(y_test_auc, output_csv_name,input_csv='sample_submission.csv'):
    """Write predictions into a copy of the sample-submission file.

    Args:
        y_test_auc: Predicted values, one per row of the template file.
        output_csv_name: Path of the CSV file to write.
        input_csv: Template CSV to read; its ``loan_status`` column is
            replaced (or added) with ``y_test_auc``.

    Returns:
        None. The file is written without the index and a confirmation is
        printed.
    """
    template = pd.read_csv(input_csv)
    template['loan_status'] = y_test_auc

    template.to_csv(output_csv_name, index=False)
    print(f"Predictions saved to {output_csv_name}")

In [None]:
#save_predictions_to_csv(final_test_pred, 'submission.csv',input_csv='sample_submission.csv')