In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

In [2]:
sample_data = pd.read_csv(r'Sample Submission.csv')

In [3]:
test_data = pd.read_csv(r'Test.csv')
train_data = pd.read_csv(r'Train.csv')

# dataset info
train_data_info = train_data.info(), train_data.head()
test_data_info = test_data.info(), test_data.head()

train_data_info, test_data_info


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5898 entries, 0 to 5897
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 5898 non-null   int64  
 1   Loan_ID            5898 non-null   object 
 2   Gender             5898 non-null   int64  
 3   Married            5898 non-null   int64  
 4   Dependents         5898 non-null   object 
 5   Education          5898 non-null   int64  
 6   Self_Employed      5898 non-null   int64  
 7   ApplicantIncome    5898 non-null   int64  
 8   CoapplicantIncome  5898 non-null   float64
 9   LoanAmount         5898 non-null   int64  
 10  Loan_Amount_Term   5898 non-null   int64  
 11  Credit_History     5898 non-null   int64  
 12  Property_Area      5898 non-null   int64  
 13  Loan_Status        5898 non-null   int64  
 14  Total_Income       5898 non-null   int64  
dtypes: float64(1), int64(12), object(2)
memory usage: 691.3+ KB
<class 'pand

((None,
        ID   Loan_ID  Gender  Married Dependents  Education  Self_Employed  \
  0  74768  LP002231       1        1          0          1              0   
  1  79428  LP001448       1        1          0          0              0   
  2  70497  LP002231       0        0          0          0              0   
  3  87480  LP001385       1        1          0          0              0   
  4  33964  LP002231       1        1          1          0              0   
  
     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
  0             8328           0.000000          17               363   
  1              150        3857.458782         188               370   
  2             4989         314.472511          17               348   
  3              150           0.000000         232               359   
  4             8059           0.000000          17               372   
  
     Credit_History  Property_Area  Loan_Status  Total_Income  
  0             

In [4]:
# Display missing values
print(train_data.isnull().sum())
print(test_data.isnull().sum())


ID                   0
Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
Total_Income         0
dtype: int64
ID                   0
Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Total_Income         0
dtype: int64


In [None]:
binary_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Credit_History']
label_enc = LabelEncoder()

for col in binary_cols:
    train_data[col] = label_enc.fit_transform(train_data[col])
    test_data[col] = label_enc.transform(test_data[col])

# One-hot encode other categorical columns
train_data = pd.get_dummies(train_data, columns=['Property_Area', 'Dependents'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Property_Area', 'Dependents'], drop_first=True)

In [5]:
# Check the columns in both train and test data
print("Train Columns:", train_data.columns)
print("Test Columns:", test_data.columns)

# Inspect unique values if columns exist
if 'Property_Area' in train_data.columns:
    print("Unique values in 'Property_Area' (Train):", train_data['Property_Area'].unique())
else:
    print("'Property_Area' column is missing in train_data.")

if 'Dependents' in train_data.columns:
    print("Unique values in 'Dependents' (Train):", train_data['Dependents'].unique())
else:
    print("'Dependents' column is missing in train_data.")

if 'Property_Area' in test_data.columns:
    print("Unique values in 'Property_Area' (Test):", test_data['Property_Area'].unique())
else:
    print("'Property_Area' column is missing in test_data.")

if 'Dependents' in test_data.columns:
    print("Unique values in 'Dependents' (Test):", test_data['Dependents'].unique())
else:
    print("'Dependents' column is missing in test_data.")


Train Columns: Index(['ID', 'Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
       'Total_Income'],
      dtype='object')
Test Columns: Index(['ID', 'Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Total_Income'],
      dtype='object')
Unique values in 'Property_Area' (Train): [2 1 0]
Unique values in 'Dependents' (Train): ['0' '1' '3+' '2']
Unique values in 'Property_Area' (Test): [1 2 0]
Unique values in 'Dependents' (Test): ['0' '1' '3+' '2']


In [6]:
# Map integer values in 'Property_Area' back to category labels
property_area_mapping = {0: 'Urban', 1: 'Rural', 2: 'Semiurban'}
train_data['Property_Area'] = train_data['Property_Area'].map(property_area_mapping)
test_data['Property_Area'] = test_data['Property_Area'].map(property_area_mapping)

# One-hot encode 'Property_Area' and 'Dependents' columns in both datasets
train_data = pd.get_dummies(train_data, columns=['Property_Area', 'Dependents'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Property_Area', 'Dependents'], drop_first=True)

# Verify the updated columns
print("Updated Train Columns:", train_data.columns)
print("Updated Test Columns:", test_data.columns)


Updated Train Columns: Index(['ID', 'Loan_ID', 'Gender', 'Married', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Loan_Status', 'Total_Income',
       'Property_Area_Semiurban', 'Property_Area_Urban', 'Dependents_1',
       'Dependents_2', 'Dependents_3+'],
      dtype='object')
Updated Test Columns: Index(['ID', 'Loan_ID', 'Gender', 'Married', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Total_Income',
       'Property_Area_Semiurban', 'Property_Area_Urban', 'Dependents_1',
       'Dependents_2', 'Dependents_3+'],
      dtype='object')


In [14]:
# 1. Income-to-Loan Ratio
train_data['Income_to_Loan_Ratio'] = train_data['Total_Income'] / train_data['LoanAmount']
test_data['Income_to_Loan_Ratio'] = test_data['Total_Income'] / test_data['LoanAmount']

# 2. Credit History Impact
train_data['Credit_History_Impact'] = train_data['Credit_History'].apply(lambda x: 1 if x == 1 else 0)
test_data['Credit_History_Impact'] = test_data['Credit_History'].apply(lambda x: 1 if x == 1 else 0)

# 3. Loan Term Category
def categorize_loan_term(term):
    if term <= 180:   # Short term (less than or equal to 6 years)
        return 'short'
    elif term <= 360: # Medium term (6-12 years)
        return 'medium'
    else:             # Long term (more than 12 years)
        return 'long'

train_data['Loan_Term_Category'] = train_data['Loan_Amount_Term'].apply(categorize_loan_term)
test_data['Loan_Term_Category'] = test_data['Loan_Amount_Term'].apply(categorize_loan_term)

# Convert Loan_Term_Category to dummy variables
train_data = pd.get_dummies(train_data, columns=['Loan_Term_Category'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Loan_Term_Category'], drop_first=True)

# 4. Income-Loan Interaction Term
train_data['Income_Loan_Interaction'] = train_data['ApplicantIncome'] * train_data['LoanAmount']
test_data['Income_Loan_Interaction'] = test_data['ApplicantIncome'] * test_data['LoanAmount']

# Check the head of the new features
train_data.head()

Unnamed: 0,ID,Loan_ID,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,...,Property_Area_Semiurban,Property_Area_Urban,Dependents_1,Dependents_2,Dependents_3+,Income_to_Loan_Ratio,Credit_History_Impact,Loan_Term_Category_medium,Loan_Term_Category_short,Income_Loan_Interaction
0,74768,LP002231,1,1,1,0,8328,0.0,17,363,...,True,False,False,False,False,352.941176,1,False,False,141576
1,79428,LP001448,1,1,0,0,150,3857.458782,188,370,...,False,False,False,False,False,31.914894,1,False,False,28200
2,70497,LP002231,0,0,0,0,4989,314.472511,17,348,...,False,True,False,False,False,352.941176,1,True,False,84813
3,87480,LP001385,1,1,0,0,150,0.0,232,359,...,False,False,False,False,False,16.163793,1,True,False,34800
4,33964,LP002231,1,1,0,0,8059,0.0,17,372,...,False,True,True,False,False,220.588235,1,False,False,137003


In [16]:
# Scale the features
scaler = StandardScaler()
X_train_smote_scaled = scaler.fit_transform(X_train_smote)
X_val_scaled = scaler.transform(X_val)

# Re-train the model with increased iterations
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_smote_scaled, y_train_smote)

# Predictions
y_pred = model.predict(X_val_scaled)

# Evaluation
accuracy = accuracy_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)
class_report = classification_report(y_val, y_pred)

print("Validation Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Validation Accuracy: 0.6093220338983051
Confusion Matrix:
 [[ 64 123]
 [338 655]]
Classification Report:
               precision    recall  f1-score   support

           0       0.16      0.34      0.22       187
           1       0.84      0.66      0.74       993

    accuracy                           0.61      1180
   macro avg       0.50      0.50      0.48      1180
weighted avg       0.73      0.61      0.66      1180

