<a href="https://colab.research.google.com/github/Dysuza/Datascience/blob/main/Loan_Eligibility_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif

In [32]:
train_data = pd.read_csv('train_ctrUa4K.csv')
test_data = pd.read_csv('test_lAUu6dG.csv')


In [33]:
X = train_data.drop(columns=['Loan_ID', 'Loan_Status'])
y = train_data['Loan_Status']

In [34]:
test_ids = test_data['Loan_ID']
combined_data = pd.concat([X, test_data.drop(columns=['Loan_ID'])], axis=0)

imputer = SimpleImputer(strategy='mean')

In [35]:
combined_data['LoanAmount'] = imputer.fit_transform(combined_data[['LoanAmount']])
combined_data['Loan_Amount_Term'] = imputer.fit_transform(combined_data[['Loan_Amount_Term']])
combined_data['Credit_History'] = imputer.fit_transform(combined_data[['Credit_History']])

In [36]:
categorical_columns = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Property_Area', 'Education']
combined_data[categorical_columns] = combined_data[categorical_columns].fillna(combined_data[categorical_columns].mode().iloc[0])

In [37]:
encoder = LabelEncoder()
for column in categorical_columns:
  combined_data[column] = encoder.fit_transform(combined_data[column])

In [38]:
train_processed = combined_data[:len(train_data)]
test_processed = combined_data[len(train_data):]

In [39]:
scaler = StandardScaler()
numerical_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
train_processed[numerical_columns] = scaler.fit_transform(train_processed[numerical_columns])
test_processed[numerical_columns] = scaler.transform(test_processed[numerical_columns])

y = y.map({'Y': 1, 'N': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_processed[numerical_columns] = scaler.fit_transform(train_processed[numerical_columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_processed[numerical_columns] = scaler.transform(test_processed[numerical_columns])


In [40]:
X_train, X_val, y_train, y_val = train_test_split(train_processed, y, test_size=0.2, random_state=42)

In [41]:
selector = SelectKBest(f_classif, k='all')
X_train_selected = selector.fit_transform(X_train, y_train)
X_val_selected = selector.transform(X_val)

In [42]:
log_reg = LogisticRegression(max_iter=1000)
param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train)

best_model = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)

Best Parameters:  {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}


In [48]:
y_val_pred = best_model.predict(X_val_selected)
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Vallidation Accuracy: {accuracy * 100:.2f}%')



Vallidation Accuracy: 78.86%


In [49]:
conf_matrix = confusion_matrix(y_val, y_val_pred)
print(" Matrix:\n", conf_matrix)

 Matrix:
 [[18 25]
 [ 1 79]]


In [50]:
train_processed_selected = selector.fit_transform(train_processed, y)
test_processed_selected = selector.transform(test_processed)
best_model.fit(train_processed_selected, y)

test_predictions = best_model.predict(test_processed_selected)

In [47]:
submission = pd.DataFrame({'Loan_ID': test_ids, 'Loan_Status': ['Y' if pred == 1 else 'N' for pred in test_predictions]})
submission.to_csv('output.csv', index=False)
print('Save submission file as output.csv')

Save submission file as output.csv
