# Part 1: Data Preprocessing (پیش‌پردازش داده‌ها)

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from scipy.stats import gaussian_kde

In [8]:
loan_dataset = pd.read_csv('../data/loan.csv')
loan_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [9]:
loan_dataset


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [10]:
loan_dataset.isna().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

# Step 2: Handling Missing Values (مدیریت مقادیر ناموجود) and feture engineering (تولید ویژگی)

As mentioned in the problem statement, some columns contain empty values (NaN). We need to adopt an appropriate strategy to fill these missing values.

# Using **KNN Imputer** to Handle Missing Values

## 📌 What is KNN Imputer?
The **KNN Imputer** is a data imputation technique based on the **K-Nearest Neighbors (KNN)** algorithm.  
Instead of filling in missing values with the mean, median, or mode of a column, **KNN Imputer** uses the similarity between samples to estimate missing data.

It works by:
1. Finding **K most similar rows** (neighbors) to the row with the missing value.
2. Filling the missing cell with the **average (or weighted average)** of the corresponding feature values from its neighbors.

---

## 🚀 Why use KNN Imputer?
- **More accurate than mean/median imputation**  
  It takes into account relationships between features, which helps maintain natural variation in the data.
- **Useful for both numerical and categorical (encoded) data**  
  (Categorical data must be numerically encoded first.)
- **Reduces bias** compared to simple imputation methods.

---

## 🛠 How It Works
Given a dataset with missing values:
1. **Preprocess the data**
   - Encode categorical variables (One-Hot Encoding or Label Encoding).
   - Scale the data (e.g., with `StandardScaler`) so that all variables are on a similar range.
2. **Choose the number of neighbors (`n_neighbors`)**  
   - A typical range is between **3 and 10**.
3. **Fit and transform the data** using `KNNImputer` from `scikit-learn`.
4. **Use the imputed dataset** for model training or further analysis.



In [11]:
loan_dataset_EDA = loan_dataset.copy()
loan_dataset = pd.get_dummies(loan_dataset, columns=['Property_Area'], drop_first=False , dtype='int')
loan_dataset['Gender']=loan_dataset['Gender'].map({'Male':1,'Female':0})
loan_dataset['Loan_Status'] =loan_dataset['Loan_Status'].map({'Y':1,'N':0})
loan_dataset['Self_Employed'] = loan_dataset['Self_Employed'].map({'Yes':1,'No':0})
loan_dataset['Married'] = loan_dataset['Married'].map({'Yes':1,'No':0})
loan_dataset['Education'] = loan_dataset['Education'].map({'Graduate':1,'Not Graduate':0})
loan_dataset['Dependents'] = loan_dataset['Dependents'].map({'0':0,'1':1,'2':2,'3+':3})
loan_dataset.head()


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,1.0,0.0,0.0,1,0.0,5849,0.0,,360.0,1.0,1,0,0,1
1,1.0,1.0,1.0,1,0.0,4583,1508.0,128.0,360.0,1.0,0,1,0,0
2,1.0,1.0,0.0,1,1.0,3000,0.0,66.0,360.0,1.0,1,0,0,1
3,1.0,1.0,0.0,0,0.0,2583,2358.0,120.0,360.0,1.0,1,0,0,1
4,1.0,0.0,0.0,1,0.0,6000,0.0,141.0,360.0,1.0,1,0,0,1


In [12]:
# فرض: loan_dataset دیتاست خام شماست
df = loan_dataset.copy()
# تبدیل ویژگی‌های دسته‌ای به عددی (One-Hot Encoding یا Label Encoding)
df_encoded = pd.get_dummies(df, drop_first=True)

# جدا کردن X و y (فرض بر اینکه ستون هدف Loan_Status است)
X = df_encoded.drop('Loan_Status', axis=1)
y = df_encoded['Loan_Status']

# مقادیر K که می‌خواهیم تست کنیم
k_values = range(2, 16)
mean_scores = []

# اسکیل کردن داده‌ها خیلی مهمه چون KNN به فاصله حساسه
scaler = StandardScaler()

for k in k_values:
    # مرحله 1: استفاده از KNNImputer
    imputer = KNNImputer(n_neighbors=k)
    X_imputed = imputer.fit_transform(X)
    
    # اسکیل کردن ویژگی‌ها
    X_scaled = scaler.fit_transform(X_imputed)
    
    # مرحله 2: انتخاب یک مدل ساده برای تست (Logistic Regression)
    model = LogisticRegression(max_iter=1000)
    
    # Cross Validation و میانگین دقت
    scores = cross_val_score(model, X_scaled, y, cv=5, scoring='accuracy')
    mean_scores.append(scores.mean())

# پیدا کردن بهترین K
best_k = k_values[np.argmax(mean_scores)]
print(f"✅ بهترین K برای KNNImputer: {best_k}")

✅ بهترین K برای KNNImputer: 12


In [13]:
df_score = pd.DataFrame({'K': list(k_values), 'Accuracy': mean_scores})

fig = px.line(df_score, x='K', y='Accuracy', markers=True,
            title=f'Best K in KNNImputer (Best = {best_k})',
            text=df_score['Accuracy'].round(3))

fig.add_scatter(
    x=[best_k],
    y=[max(mean_scores)],
    mode='markers+text',
    name='Best K',
    text=[f'Best: {best_k}'],
    textposition='top center',
    marker=dict(size=12, color='red', symbol='star')
)

fig.show()

In [14]:
imputer = KNNImputer(n_neighbors=12)  

# اعمال Imputer روی داده‌ها
df_imputed = imputer.fit_transform(loan_dataset)

# تبدیل مجدد به DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=loan_dataset.columns)
loan_dataset = df_imputed
print("\n دیتافریم بعد از KNN Imputation:")
loan_dataset


 دیتافریم بعد از KNN Imputation:


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,1.0,0.0,0.0,1.0,0.0,5849.0,0.0,154.083333,360.0,1.0,1.0,0.0,0.0,1.0
1,1.0,1.0,1.0,1.0,0.0,4583.0,1508.0,128.000000,360.0,1.0,0.0,1.0,0.0,0.0
2,1.0,1.0,0.0,1.0,1.0,3000.0,0.0,66.000000,360.0,1.0,1.0,0.0,0.0,1.0
3,1.0,1.0,0.0,0.0,0.0,2583.0,2358.0,120.000000,360.0,1.0,1.0,0.0,0.0,1.0
4,1.0,0.0,0.0,1.0,0.0,6000.0,0.0,141.000000,360.0,1.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0.0,0.0,0.0,1.0,0.0,2900.0,0.0,71.000000,360.0,1.0,1.0,1.0,0.0,0.0
610,1.0,1.0,3.0,1.0,0.0,4106.0,0.0,40.000000,180.0,1.0,1.0,1.0,0.0,0.0
611,1.0,1.0,1.0,1.0,0.0,8072.0,240.0,253.000000,360.0,1.0,1.0,0.0,0.0,1.0
612,1.0,1.0,2.0,1.0,0.0,7583.0,0.0,187.000000,360.0,1.0,1.0,0.0,0.0,1.0


# 📊 Understanding **LTI** and **DTI** Ratios

## 📌 1. What is LTI (Loan-to-Income Ratio)?
The **Loan-to-Income Ratio (LTI)** is a financial metric used by banks and lenders to determine how much loan a borrower is requesting compared to their income.  
It helps assess **the affordability of the loan** and the borrower’s repayment capacity.

**Formula:**
\[
LTI = \frac{\text{Loan Amount}}{\text{Applicant's Income}}
\]

**Example:**
- Loan Amount = \$20,000  
- Applicant’s Annual Income = \$50,000  
- \[
LTI = \frac{20,000}{50,000} = 0.4 \; (40\%)
\]

**Interpretation:**
- **Low LTI** → Loan is small compared to income (Safer for lenders).  
- **High LTI** → Loan is large compared to income (Higher risk of default).

---

## 📌 2. What is DTI (Debt-to-Income Ratio)?
The **Debt-to-Income Ratio (DTI)** measures the proportion of a borrower's income that goes toward **servicing debts**.  
It’s a broader metric than LTI because it includes **all debt obligations**, not just the current loan.

**Formula:**
\[
DTI = \frac{\text{Total Monthly Debt Payments}}{\text{Total Monthly Income}}
\]

**Alternative formula in loan datasets:**
\[
DTI = \frac{\text{Loan Amount}}{\text{Applicant's Income + Co-applicant's Income}}
\]

**Example:**
- Loan Amount = \$20,000  
- Applicant's Monthly Income = \$3,000  
- Co-applicant's Monthly Income = \$1,000  
- \[
DTI = \frac{20,000}{(3,000 + 1,000) \times 12} = 0.416 \; (41.6\%)
\]

**Interpretation:**
- **Low DTI** → Borrower has manageable debt levels.  
- **High DTI** → Borrower is over-leveraged (Risk of repayment issues).

---

## 🏦 Why Lenders Use LTI & DTI
- To **assess credit risk** and repayment ability.
- To ensure borrowers do not take on debt they cannot manage.
- To comply with lending regulations and avoid high default rates.

---

## ⚠️ Key Points
- LTI focuses only on the **current loan request**, while DTI considers **all debts**.
- Lower values are generally **better** from the lender’s perspective.
- Many banks have a **maximum threshold** for LTI and DTI:
  - Example: LTI < 4, DTI < 40%.

---

✅ **Summary:**  
Both LTI and DTI are powerful indicators of a borrower's financial capacity.  
- **LTI** = Loan amount relative to borrower’s income.  
- **DTI** = Debt payments relative to total income.  


In [15]:
# جایگزین کردن مقادیر خالی (برای جلوگیری از خطا)
loan_dataset['LoanAmount'] = loan_dataset['LoanAmount'].fillna(0)
loan_dataset['Loan_Amount_Term'] = loan_dataset['Loan_Amount_Term'].fillna(360)
loan_dataset['ApplicantIncome'] = loan_dataset['ApplicantIncome'].fillna(0)
loan_dataset['CoapplicantIncome'] = loan_dataset['CoapplicantIncome'].fillna(0)

# محاسبه درآمد کل
loan_dataset['TotalIncome'] = loan_dataset['ApplicantIncome'] + loan_dataset['CoapplicantIncome']

# محاسبه DTI (قسط ماهانه / درآمد ماهانه)
loan_dataset['MonthlyInstallment'] = (loan_dataset['LoanAmount'] * 1000) / loan_dataset['Loan_Amount_Term']
loan_dataset['Debt-to-Income Ratio(DTI)'] = loan_dataset['MonthlyInstallment'] / loan_dataset['TotalIncome']

# محاسبه LTI (مقدار وام / درآمد سالانه)
loan_dataset['Loan-to-Income Ratio(LTI)'] = (loan_dataset['LoanAmount'] * 1000) / (loan_dataset['TotalIncome'] * 12)

# نمایش 10 ردیف اول
loan_dataset[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Debt-to-Income Ratio(DTI)', 'Loan-to-Income Ratio(LTI)']]


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Debt-to-Income Ratio(DTI),Loan-to-Income Ratio(LTI)
0,5849.0,0.0,154.083333,360.0,0.073176,2.195295
1,4583.0,1508.0,128.000000,360.0,0.058374,1.751218
2,3000.0,0.0,66.000000,360.0,0.061111,1.833333
3,2583.0,2358.0,120.000000,360.0,0.067463,2.023882
4,6000.0,0.0,141.000000,360.0,0.065278,1.958333
...,...,...,...,...,...,...
609,2900.0,0.0,71.000000,360.0,0.068008,2.040230
610,4106.0,0.0,40.000000,180.0,0.054121,0.811820
611,8072.0,240.0,253.000000,360.0,0.084550,2.536493
612,7583.0,0.0,187.000000,360.0,0.068501,2.055035


### Step 3 : EDA(Exploratory Data Analysis )

# Analysis of the Target Variable (Loan_Status)

First, let's examine the distribution of our target variable. Are the data balanced?

In [16]:
# محاسبه درصدها
loan_status_count = loan_dataset['Loan_Status'].value_counts().reset_index()
loan_status_count.columns = ['Loan_Status', 'Count']
loan_status_count['Percentage'] = (loan_status_count['Count'] / loan_status_count['Count'].sum() * 100).round(1)

# ترسیم با Plotly Express
fig = px.bar(
    loan_status_count,
    x='Loan_Status',
    y='Count',
    color='Loan_Status',
    text='Percentage',
    color_discrete_sequence=px.colors.sequential.Viridis
)

# تغییر ظاهر
fig.update_traces(texttemplate='%{text}%', textposition='outside')
fig.update_layout(
    title='Loan Approval Status Distribution',
    xaxis_title='Loan Status (1=Approved, 0=Rejected)',
    yaxis_title='Count',
    plot_bgcolor='white'
)

fig.show()

The chart shows that approximately 68.7% of loan applications in this dataset were approved (Y), while about 31.3% were rejected (N). This indicates that our data is somewhat imbalanced. This is an important point that may impact the model evaluation stage (e.g., choosing an appropriate metric instead of Accuracy).

# Analysis of Categorical Features
Now, let's examine the other categorical variables such as gender, marital status, education, and more.

In [17]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# لیست ویژگی‌های دسته‌ای
categorical_features = [
    'Gender', 'Married', 'Dependents',
    'Education', 'Self_Employed', 'Property_Area',
    'Credit_History'
]

# ایجاد ساب‌پلات‌ها (3 سطر × 3 ستون)
rows = 3
cols = 3
fig = make_subplots(
    rows=rows, cols=cols,
    subplot_titles=categorical_features
)

# اضافه کردن هر نمودار به ساب‌پلات‌ها
row = 1
col = 1
for feature in categorical_features:
    # شمارش مقادیر هر دسته
    counts = loan_dataset_EDA[feature].value_counts().reset_index()
    counts.columns = [feature, 'Count']

    # ساخت Bar Chart
    trace = go.Bar(
        x=counts[feature],
        y=counts['Count'],
        marker=dict(color='lightblue'),
        name=feature
    )

    fig.add_trace(trace, row=row, col=col)

    # تغییر مختصات ساب‌پلات
    col += 1
    if col > cols:
        col = 1
        row += 1

# آپدیت ظاهر نمودار
fig.update_layout(
    height=900,
    width=1200,
    title_text="Univariate Analysis of Categorical Variables (Plotly)",
    showlegend=False,
    plot_bgcolor='white'
)

fig.show()

- **Gender**: Approximately 80% of applicants are male.
- **Married**: About 65% of applicants are married.
- **Dependents**: The majority of applicants (around 57%) have no dependents.
- **Education**: Approximately 78% of applicants are graduates.
- **Self_Employed**: Only a small percentage (about 14%) are self-employed.
- **Property_Area**: The highest demand comes from semi-urban areas.
- **Credit_History**: The vast majority (around 85%) have a positive credit history (value 1.0). This feature appears to be very important.

# Analysis of Numerical Features

For numerical features, we will examine their distribution using histograms and box plots. These charts help us identify skewness and the presence of outliers.

In [None]:


# لیست ویژگی‌های عددی
numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

# ایجاد ساب‌پلات‌ها (3 ستون، 1 سطر)
fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=numerical_features
)

# اضافه کردن هر هیستوگرام به ساب‌پلات‌ها
for i, col in enumerate(numerical_features):
    hist = go.Histogram(
        x=loan_dataset[col],
        nbinsx=30,
        marker=dict(color='lightseagreen'),
        opacity=0.75,
        name=col,
        histnorm=None
    )
    
    # افزودن منحنی KDE (چگالی)
    kde = go.Histogram(
        x=loan_dataset[col],
        nbinsx=30,
        histnorm='probability density',
        marker=dict(color='orange'),
        opacity=0.3,
        showlegend=False
    )

    fig.add_trace(hist, row=1, col=i+1)
    # در Plotly، نمودار چگالی واقعی بهتر با go.Scatter ساخته می‌شود:

# نسخه بهتر با چگالی واقعی (روش جایگزین)
import numpy as np


fig = make_subplots(rows=1, cols=3, subplot_titles=numerical_features)

for i, col in enumerate(numerical_features):
    # هیستوگرام
    fig.add_trace(
        go.Histogram(
            x=loan_dataset[col],
            nbinsx=30,
            name=col,
            marker_color='lightseagreen',
            opacity=0.7
        ),
        row=1, col=i+1
    )
    
    # منحنی KDE
    kde = gaussian_kde(loan_dataset[col].dropna())
    x_range = np.linspace(loan_dataset[col].min(), loan_dataset[col].max(), 200)
    fig.add_trace(
        go.Scatter(
            x=x_range,
            y=kde(x_range) * len(loan_dataset[col]) * (loan_dataset[col].max() - loan_dataset[col].min()) / 30,
            mode='lines',
            name=f"{col} KDE",
            line=dict(color='orange')
        ),
        row=1, col=i+1
    )

fig.update_layout(
    title="Distribution Analysis of Numerical Variables ",
    height=400,
    width=1200,
    showlegend=False,
    plot_bgcolor='white'
)

fig.show()

- **ApplicantIncome and CoapplicantIncome**: Both variables exhibit right-skewed distributions. This means most applicants have relatively low incomes, with a few having significantly higher incomes (considered outliers).


# Bivariate Analysis

In this section, we will examine the relationship between each feature (both numerical and categorical) and the target variable (Loan_Status).

In [21]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# لیست ویژگی‌های دسته‌ای
categorical_features = ['Gender', 'Married', 'Dependents', 'Education',
                        'Self_Employed', 'Property_Area', 'Credit_History']

# تنظیم ساب‌پلات‌ها (3 ردیف × 3 ستون)
rows = 3
cols = 3
fig = make_subplots(
    rows=rows, cols=cols,
    subplot_titles=categorical_features
)

row, col = 1, 1
for feature in categorical_features:
    # شمارش مقدار هر دسته برای هر Loan_Status
    counts = loan_dataset_EDA.groupby([feature, 'Loan_Status']).size().reset_index(name='Count')

    for status in counts['Loan_Status'].unique():
        filtered = counts[counts['Loan_Status'] == status]

        fig.add_trace(
            go.Bar(
                x=filtered[feature],
                y=filtered['Count'],
                name=f"{status}",
                marker=dict(colorscale='Portland'),
            ),
            row=row, col=col
        )

    col += 1
    if col > cols:
        col = 1
        row += 1

# بهبود ظاهر
fig.update_layout(
    height=900, width=1200,
    title_text="Analysis of Categorical Variables vs. Loan Status (Plotly)",
    barmode='group',  # ستون‌های Loan_Status کنار هم
    plot_bgcolor='white'
)

fig.show()

# Credit_History vs. Loan_Status  
This is the **most important** chart! We can clearly see that:  
- If the credit history is **positive (1.0)**, the chance of loan approval is **very high**.  
- If the credit history is **negative (0.0)**, the chance of loan approval is **very low**.  
This feature is definitely a **strong predictor**.  

# Married vs. Loan_Status  
Married applicants have a **slightly higher** chance of loan approval.  

# Education vs. Loan_Status  
Graduates have a **higher** chance of loan approval.  

# Property_Area vs. Loan_Status  
- Applications from **Semiurban** areas have the **highest** approval rate.  
- Applications from **Rural** areas have the **lowest** approval rate.  

# Numerical Variables vs. Target Variable  
Using box plots, we compare the distribution of numerical variables for approved and rejected loans.  

In [22]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# لیست ویژگی‌های عددی
numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

# ایجاد ساب‌پلات‌ها: 1 ردیف و 3 ستون
fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=numerical_features
)

# اضافه کردن هر boxplot به ساب‌پلات
for i, col in enumerate(numerical_features):
    fig.add_trace(
        go.Box(
            x=loan_dataset['Loan_Status'],
            y=loan_dataset[col],
            boxmean='sd',  # نمایش میانگین و انحراف معیار
            marker_color='teal',
            name=col
        ),
        row=1, col=i+1
    )

# بهبود ظاهر نمودار
fig.update_layout(
    title_text="Analysis of Numerical Variables vs. Loan Status (Plotly)",
    height=500,
    width=1200,
    showlegend=False,
    plot_bgcolor='white'
)

fig.show()

Based on the box plots, no significant difference is observed in the median (the center line of the box) of applicant income or loan amount between the approved and rejected groups. This indicates that these variables alone may not have strong predictive power, but combining them (for example, by creating a new feature like `Total_Income`) could be beneficial.

# Multivariate Analysis (Multivariate Analysis)
Finally, using a correlation matrix (Correlation Matrix), we examine the linear relationship between all numerical variables.

In [23]:
import plotly.express as px

# محاسبه ماتریس همبستگی
correlation_matrix = loan_dataset.corr(numeric_only=True)

# تبدیل به فرمت مناسب برای Plotly
corr_melted = correlation_matrix.reset_index().melt(id_vars='index')
corr_melted.columns = ['Feature1', 'Feature2', 'Correlation']

# رسم Heatmap
fig = px.imshow(
    correlation_matrix,
    text_auto=".2f",  # نمایش مقادیر روی خانه‌ها
    aspect="auto",
    color_continuous_scale='RdBu',
    origin='upper'
)

fig.update_layout(
    title='Correlation Matrix Between Variables (Plotly)',
    xaxis_title="Features",
    yaxis_title="Features",
    width=900,
    height=800
)

fig.show()

- **Credit_History** shows the highest correlation (0.54) with **Loan_Status**, confirming our previous analysis.

- **ApplicantIncome** and **LoanAmount** have a positive correlation (0.56), which is logical (higher-income individuals tend to request larger loans).

- **Married** and **Dependents** also show positive correlation with each other.

- Importantly, we observe no severe multicollinearity between independent features that could potentially cause issues for linear models.