In [78]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [79]:
data = {
    'age': [25, 45, 58, 33, 29, 41, 36, 44, 53, 27, np.nan, 39, 48, 36, 42],
    'income': [45, 120, 98, 60, 75, 65, 50, 210, 95, 40, 70, 88, 102, 83, 68],
    'loan_amount': [20, 80, 50, 35, 40, 30, 25, 150, 45, 15, 32, 38, 55, 42, 28],
    'credit_score': [650, 720, 810, 680, 590, 640, 610, 830, 740, 600, 690, 710, 780, 670, 625],
    'employment_length': ['1-3yr', '5-10yr', '10+yr', '3-5yr', '<1yr', '1-3yr', '1-3yr', '10+yr', '5-10yr', '<1yr', '3-5yr', '5-10yr', '10+yr', '3-5yr', '1-3yr'],
    'home_ownership': ['Rent', 'Mortgage', 'Own', 'Rent', 'Rent', 'Rent', 'Rent', 'Own', 'Mortgage', 'Rent', 'Mortgage', 'Mortgage', 'Own', 'Mortgage', 'Rent'],
    'loan_intent': ['Personal', 'Venture', 'Home', 'Education', 'Personal', 'Medical', 'Personal', 'Venture', 'Home', 'Education', 'Debt Consolidation', 'Home', 'Venture', 'Medical', 'Debt Consolidation'],
    'loan_status': [1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1]
}

In [80]:
df=pd.DataFrame(data)

# Now lets explore the data set

In [81]:
df

Unnamed: 0,age,income,loan_amount,credit_score,employment_length,home_ownership,loan_intent,loan_status
0,25.0,45,20,650,1-3yr,Rent,Personal,1
1,45.0,120,80,720,5-10yr,Mortgage,Venture,0
2,58.0,98,50,810,10+yr,Own,Home,0
3,33.0,60,35,680,3-5yr,Rent,Education,0
4,29.0,75,40,590,<1yr,Rent,Personal,1
5,41.0,65,30,640,1-3yr,Rent,Medical,1
6,36.0,50,25,610,1-3yr,Rent,Personal,1
7,44.0,210,150,830,10+yr,Own,Venture,0
8,53.0,95,45,740,5-10yr,Mortgage,Home,0
9,27.0,40,15,600,<1yr,Rent,Education,1


In [82]:
df.shape

(15, 8)

In [83]:
df.describe()

Unnamed: 0,age,income,loan_amount,credit_score,loan_status
count,14.0,15.0,15.0,15.0,15.0
mean,39.714286,84.6,45.666667,689.666667,0.466667
std,9.603113,41.324154,32.906071,75.225059,0.516398
min,25.0,40.0,15.0,590.0,0.0
25%,33.75,62.5,29.0,632.5,0.0
50%,40.0,75.0,38.0,680.0,0.0
75%,44.75,96.5,47.5,730.0,1.0
max,58.0,210.0,150.0,830.0,1.0


In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                14 non-null     float64
 1   income             15 non-null     int64  
 2   loan_amount        15 non-null     int64  
 3   credit_score       15 non-null     int64  
 4   employment_length  15 non-null     object 
 5   home_ownership     15 non-null     object 
 6   loan_intent        15 non-null     object 
 7   loan_status        15 non-null     int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 1.1+ KB


In [85]:
df.isnull().sum()

age                  1
income               0
loan_amount          0
credit_score         0
employment_length    0
home_ownership       0
loan_intent          0
loan_status          0
dtype: int64

In [86]:
df['age']

0     25.0
1     45.0
2     58.0
3     33.0
4     29.0
5     41.0
6     36.0
7     44.0
8     53.0
9     27.0
10     NaN
11    39.0
12    48.0
13    36.0
14    42.0
Name: age, dtype: float64

In [87]:
df['age'].fillna(df['age'].mean(),inplace=True)

In [88]:
df.isnull().sum()

age                  0
income               0
loan_amount          0
credit_score         0
employment_length    0
home_ownership       0
loan_intent          0
loan_status          0
dtype: int64

In [89]:
categorical_cols = df.select_dtypes(include=['object']).columns
print("Categorical columns to encode:", list(categorical_cols))

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

Categorical columns to encode: ['employment_length', 'home_ownership', 'loan_intent']


# so we have sucessfully done the cleaning part now lets split the feature & target

In [90]:
X=df.drop('loan_status', axis = 1)
y=df['loan_status']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (12, 7) Test shape: (3, 7)


# lets scale the data its important before we put it for regresssion

In [91]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# now lets Train our model 

In [98]:
Model= LogisticRegression(random_state=42)
Model.fit(X_train_scaled, y_train)

In [95]:
Model_tree = DecisionTreeClassifier(random_state=42)
Model_tree.fit(X_train, y_train)

# lets make predictions and check accuracy

In [99]:
y_pred_lr = Model.predict(X_test_scaled)
y_pred_dt = Model_tree.predict(X_test)

In [100]:
acc_lr = accuracy_score(y_test, y_pred_lr)
acc_dt = accuracy_score(y_test, y_pred_dt)

print(f"\nLogistic Regression Accuracy: {acc_lr:.4f} ({acc_lr*100:.1f}%)")
print(f"Decision Tree Accuracy: {acc_dt:.4f} ({acc_dt*100:.1f}%)")

# Step 10: Check if target achieved
if acc_lr > 0.85 or acc_dt > 0.85:
    print("✅ Target achieved! Accuracy > 85%")
else:
    print("❌ Target not achieved. Try tuning parameters")


Logistic Regression Accuracy: 1.0000 (100.0%)
Decision Tree Accuracy: 1.0000 (100.0%)
✅ Target achieved! Accuracy > 85%


In [101]:
print("Class distribution in training set:")
print(y_train.value_counts())
print("\nClass distribution in test set:")
print(y_test.value_counts())

Class distribution in training set:
loan_status
0    7
1    5
Name: count, dtype: int64

Class distribution in test set:
loan_status
1    2
0    1
Name: count, dtype: int64


#  📝 Conclusion

### Results Summary:
- **Logistic Regression Accuracy**: 100.0%
- **Decision Tree Accuracy**: 100.0%
- **Target Achievement**: ✅ Successfully exceeded 85% accuracy goal

### Key Insights:
1. Both models performed perfectly on the test set, achieving 100% accuracy
2. The small dataset size (15 samples) may contribute to this perfect score
3. Feature engineering and proper preprocessing were crucial for model performance

### Limitations:
- Very small dataset may lead to overfitting
- Limited test set (only 3 samples) may not represent real-world performance
- Need more data for reliable model validation

### Recommendations:
1. **Collect more data** for better model generalization
2. **Implement cross-validation** for more robust evaluation
3. **Monitor for overfitting** with learning curves
4. **Test on larger, unseen datasets** to verify real-world performance

### Business Impact:
These models show strong potential for predicting loan defaults, which could significantly reduce financial risks for lending institutions when deployed with sufficient data.