# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report


# 1. Dataset Loading
# Read the loan dataset (a CSV in the working directory) into a DataFrame.
df = pd.read_csv("loan_data.csv")  # Replace "loan_data.csv" with your file


# 2. Exploratory Data Analysis (EDA)
print(df.head())      # first rows, to eyeball the raw values
print(df.describe())  # summary statistics of the numeric columns
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
# Further EDA (e.g., correlation analysis, visualizations) can be added as needed



# 3. Handling Missing Data
# Check for missing values
print(df.isnull().sum())

# Handling missing values (example strategies)
# The filled Series is assigned back to the frame rather than calling
# fillna(..., inplace=True) on df[col]: the in-place form acts on an
# intermediate object, which pandas warns about and which no longer updates
# `df` once Copy-on-Write is active.

# Columns imputed with their most frequent value (mode).
mode_imputed_cols = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Loan_Amount_Term',
    'Credit_History',
]
for col in mode_imputed_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Numerical feature - using median
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())


# 4. Data Encoding and Scaling
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']

# One LabelEncoder per column, kept in a dict, so every fitted mapping
# (encoder.classes_) remains available for decoding later. Re-fitting a single
# shared encoder would overwrite the mapping on each iteration.
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Keep the original name `le` bound to the encoder fitted on the last column
# ('Loan_Status'), which is what it referred to after the original loop.
le = encoders[categorical_cols[-1]]

numerical_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']  # Add relevant columns


# 5. Data Splitting
# Loan_ID is dropped together with the target; the remaining columns are the features.
X = df.drop(['Loan_ID', 'Loan_Status'], axis=1)
y = df['Loan_Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling happens after the split: the scaler learns min/max from the training
# rows only and is then applied to the test rows. Fitting it on the full
# dataset would leak test-set information into training.
# NOTE: `df` and `X` therefore keep the unscaled values; only X_train/X_test are scaled.
scaler = MinMaxScaler()
X_train = X_train.copy()  # explicit copies so the column assignments below
X_test = X_test.copy()    # never write into a view of `X`
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])




# 6. Model Training & 7. Model Evaluation (Combined for brevity)
# Candidate classifiers keyed by the display name used in the printed output.
# The tree-based models are seeded with the same value as the train/test split
# so the accuracy ranking (and hence the selected model) is reproducible
# between runs.
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "KNN": KNeighborsClassifier(),
}

# Maps model name -> accuracy on the held-out test set.
results = {}
for name, model in models.items():
    # Fit on the training split, then score on the test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))  # Include classification report


# 8. Final Analysis and Model Selection
import pickle

# The winner is the name with the highest recorded accuracy; on ties max()
# keeps the first such entry in the dict's iteration order.
best_model = max(results, key=lambda model_name: results[model_name])
print(f"\nBest Performing Model: {best_model} with accuracy {results[best_model]:.4f}")

# Save the best model (example using pickle)
# NOTE(review): only the fitted estimator is written here; the scaler and the
# label encoder are not part of the pickle.
winning_estimator = models[best_model]
with open("best_loan_model.pkl", "wb") as model_file:
    pickle.dump(winning_estimator, model_file)



# Example of how to interpret feature importance (if applicable to the best model)
# Only runs when the selected model is the random forest, whose fitted
# estimator exposes `feature_importances_`.
if best_model == "Random Forest":
    feature_importances = models["Random Forest"].feature_importances_
    print("\nFeature Importances:")
    # Pair each feature column with its importance score, in column order.
    for feature, importance in zip(X.columns, feature_importances):
        print(f"{feature}: {importance:.4f}")


# You could add further analysis like plotting ROC curves, confusion matrices, etc., here
# to improve the model evaluation and selection process.

# https://app.houshyar24.ir/dashboard/ai-agent/test?agent=19574
