# Exploratory Data Analysis (EDA)

This notebook explores the loan datasets ingested from Kaggle and from the Microsoft Loan Credit Risk dataset.

In [None]:
import pandas as pd
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt

# Open the SQLite database that the rest of the notebook queries.
conn = sqlite3.connect('../data/loans.db')

# Enumerate the tables via sqlite_master; Connection.execute() creates a
# cursor, runs the statement on it and hands that cursor back.
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table';")
cursor.fetchall()

## Load Kaggle Training Dataset

In [None]:
# The table name below must match one of the names listed by the
# sqlite_master query; adjust it if the ingested table is named differently.
query = 'SELECT * FROM kaggle_cs_training'
df = pd.read_sql(sql=query, con=conn)

# Preview the first rows.
df.head()

## Basic Dataset Information

In [None]:
# Print a concise summary of df: column names, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

## Missing Value Analysis

In [None]:
# Count missing values per column (sum the null mask down the rows).
df.isnull().sum(axis=0)

## Univariate Analysis

In [None]:
# Histogram with a KDE overlay for the revolving-utilization column.
sns.histplot(data=df, x='RevolvingUtilizationOfUnsecuredLines', kde=True)
plt.title('Revolving Utilization Distribution')
plt.show()

In [None]:
# Histogram with a KDE overlay for the age column.
sns.histplot(data=df, x='age', kde=True)
plt.title('Age Distribution')
plt.show()

## Correlation Heatmap

In [None]:
# Correlate numeric columns only: pandas >= 2.0 raises in DataFrame.corr()
# when a non-numeric column is present (older versions silently dropped such
# columns), so select the numeric columns explicitly to work on both.
numeric_df = df.select_dtypes(include='number')

plt.figure(figsize=(10, 6))
sns.heatmap(numeric_df.corr(), cmap='coolwarm', annot=False)
plt.title('Correlation Heatmap')
plt.show()

## Risk Factor Analysis (Default vs Non-default)

In [None]:
# Box plot of revolving utilization, one box per value of SeriousDlqin2yrs.
sns.boxplot(
    data=df,
    x='SeriousDlqin2yrs',
    y='RevolvingUtilizationOfUnsecuredLines',
)
plt.title('Utilization vs Default Status')
plt.show()