In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
import kagglehub

# Fetch the "mishra5001/credit-card" dataset through kagglehub.
# The value returned by dataset_download() is kept in `path`; the loading
# cells below use it as the directory prefix for the CSV files.
path = kagglehub.dataset_download("mishra5001/credit-card")

# Show where the files ended up so the location can be inspected by hand.
print("Path to dataset files:", path)

In [None]:
# Read application_data.csv from the downloaded dataset directory.
app_df = pd.read_csv(path+"/application_data.csv")
# Preview the first rows of the loaded table.
app_df.head()

In [None]:
# Print a structural summary of app_df (columns, dtypes, memory usage).
app_df.info()

In [None]:
# Read previous_application.csv from the downloaded dataset directory.
prev_app_df = pd.read_csv(path+"/previous_application.csv")
# Preview the first rows of the loaded table.
prev_app_df.head()

In [None]:
# Inner-join the two tables on SK_ID_CURR: only IDs present in both survive,
# and an ID that occurs several times in prev_app_df yields several rows.
merged_df = app_df.merge(prev_app_df, on='SK_ID_CURR', how='inner')

# Convert float/object columns to integers, but only when that is lossless.
# The previous string heuristic (strip one '.', then isdigit) also accepted
# values such as 0.5 and then truncated them; it was also evaluated value by
# value in Python. The checks below are vectorised and exact.
for col in merged_df.columns:
    if merged_df[col].dtype not in ['float64', 'object']:
        continue

    column = merged_df[col]

    # NaN cannot be stored in a NumPy integer column, so a column with gaps
    # keeps its dtype (the old cast failed silently in this case as well).
    if column.isnull().any():
        continue

    # Non-numeric entries become NaN here, which marks the column as text.
    numeric = pd.to_numeric(column, errors='coerce')
    if numeric.isnull().any():
        continue

    # Every value must be a whole number ...
    is_whole = (numeric % 1 == 0).all()
    # ... and small enough that float64 represents it exactly (|x| <= 2**53),
    # which also rules out +/-inf and int64 overflow.
    is_exact = (numeric.abs() <= 2 ** 53).all()

    if is_whole and is_exact:
        # Fixed-width int64 so the result does not depend on the platform's
        # default C integer size.
        merged_df[col] = numeric.astype('int64')

merged_df.info()
merged_df.head()

In [None]:
# Show the dtype of every column of merged_df after the conversion loop above.
merged_df.dtypes

In [None]:
# Two-column link table: one row per merged record, pairing SK_ID_CURR with
# SK_ID_PREV (exposed under the name SK_ID_RECEIVER). Built on a copy so that
# merged_df itself is left untouched.
transactions = (
    merged_df[['SK_ID_CURR', 'SK_ID_PREV']]
    .copy()
    .rename(columns={'SK_ID_PREV': 'SK_ID_RECEIVER'})
)

In [None]:
class UnionFind:
    """Disjoint-set forest over a fixed collection of hashable elements.

    Each element starts out as the root of its own one-element set.
    ``find`` returns the root (representative) of an element's set and
    flattens the path it walked; ``union`` merges two sets using
    union-by-rank.

    Attributes:
        parent: dict mapping every element to its parent (a root maps to itself).
        rank: dict mapping every element to its rank, used only to decide
            which root is attached under which during ``union``.
    """

    def __init__(self, elements):
        """Register every item of ``elements`` as its own singleton set.

        ``elements`` is iterated twice, so it must be re-iterable
        (list, array, ...), not a one-shot generator.
        """
        self.parent = dict((item, item) for item in elements)
        self.rank = dict.fromkeys(elements, 0)

    def find(self, element):
        """Return the root of the set containing ``element``.

        Raises:
            KeyError: if ``element`` was not passed to the constructor.
        """
        # First pass: climb to the root.
        root = element
        while self.parent[root] != root:
            root = self.parent[root]
        representative = self.parent[root]

        # Second pass: path compression - hang every visited node directly
        # under the root.
        node = element
        while node != root:
            above = self.parent[node]
            self.parent[node] = representative
            node = above

        return representative

    def union(self, element1, element2):
        """Merge the sets containing ``element1`` and ``element2`` (no-op if already joined)."""
        root1, root2 = self.find(element1), self.find(element2)
        if root1 == root2:
            return

        rank1, rank2 = self.rank[root1], self.rank[root2]
        if rank1 < rank2:
            # The lower-ranked tree goes under the higher-ranked one.
            self.parent[root1] = root2
            return

        self.parent[root2] = root1
        if rank1 == rank2:
            # Equal ranks: the surviving root's tree may have grown one level.
            self.rank[root1] += 1

In [None]:
# Every ID that occurs in either column of the link table becomes one node.
unique_accounts = pd.concat([transactions['SK_ID_CURR'], transactions['SK_ID_RECEIVER']]).unique()
uf = UnionFind(unique_accounts)

# Join the two endpoints of every link. The columns are walked in lock-step
# rather than with DataFrame.iterrows(), which builds a new Series for every
# row; the pairs are visited in the same order, so the resulting forest is
# the same.
for sender_id, receiver_id in zip(transactions['SK_ID_CURR'], transactions['SK_ID_RECEIVER']):
    uf.union(sender_id, receiver_id)

In [None]:
# Look up, for every known ID, the root of the set it ended up in; that root
# value serves as the community label.
account_community_map = dict(zip(unique_accounts, map(uf.find, unique_accounts)))

# Attach the label of each row's SK_ID_CURR to the merged table.
merged_df['community_id'] = merged_df['SK_ID_CURR'].map(account_community_map)

In [None]:
# Write the merged table (including community_id) to the working directory;
# index=False keeps the row index out of the file.
merged_df.to_csv('merged_credit_card_data.csv', index=False)
print("merged_credit_card_data.csv has been saved.")

In [None]:
# Show one line per feature for the first few records. Only a small slice is
# transposed: transposing the whole mixed-dtype frame would copy every row
# into an object-dtype frame with one column per record.
merged_df.head().T

In [None]:
# Preview the first rows of merged_df.
merged_df.head()

In [None]:
# Number of missing values in each column of merged_df.
merged_df.isnull().sum()

In [None]:
# Per-column count of missing values, indexed by column name.
null_counts = merged_df.isnull().sum()

# Turn the counts into a two-column table: feature name and number of nulls.
null_summary = null_counts.reset_index().rename(
    columns={'index': 'Feature', 0: 'Null_Count'}
)

# Share of rows that are missing, as a percentage rounded to two decimals.
null_pct = null_summary['Null_Count'] / len(merged_df) * 100
null_summary['Null_%'] = null_pct.round(2)

# Worst-populated features first.
null_summary = null_summary.sort_values(by='Null_Count', ascending=False)

print(null_summary)

In [None]:
def drop_high_null_features(merged_df, threshold=0.4):
    """
    Return a new DataFrame without the sparsely populated columns.

    A column is removed when its fraction of null values is strictly greater
    than 'threshold'; a column sitting exactly at the threshold is kept.
    The number of removed columns is printed. The input frame is not modified.

    Parameters:
        merged_df (pd.DataFrame): Input DataFrame
        threshold (float): Null fraction above which a column is dropped (default = 0.4)

    Returns:
        pd.DataFrame: DataFrame with the high-null columns removed
    """
    # Mean of the boolean null mask = share of missing entries per column.
    null_share = merged_df.isna().mean()

    # Names of the columns whose null share exceeds the limit.
    sparse_columns = null_share[null_share.gt(threshold)].index

    print(f"🧹 Dropping {len(sparse_columns)} columns with more than {threshold*100}% null values")

    return merged_df.drop(columns=sparse_columns)