In [436]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt

In [437]:
df = pd.read_csv('loan_data.csv')
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


#### Check Duplicate Rows

In [439]:
duplicate_rows = df.duplicated().sum()

print(f"Number of duplicate rows: {duplicate_rows}")

Number of duplicate rows: 0


In [440]:
df.shape

(45000, 14)

### Encoding

In [442]:
df.person_education.value_counts()

person_education
Bachelor       13399
Associate      12028
High School    11972
Master          6980
Doctorate        621
Name: count, dtype: int64

#### Encode person_education

In [444]:
education_mapping = {
    'Bachelor':0, 'Associate':1, 'High School':2, 'Master':3, 'Doctorate':4
}

df['education_level'] = df.person_education.map(education_mapping)

In [445]:
df.person_gender.value_counts()

person_gender
male      24841
female    20159
Name: count, dtype: int64

#### Encode Gender

In [447]:
gender_mapping = {'female':0, 'male':1 }

df['gender'] = df.person_gender.map(gender_mapping)

#### Encode Home Ownership (using OneHotEncoding)

In [449]:
df.person_home_ownership.value_counts()

person_home_ownership
RENT        23443
MORTGAGE    18489
OWN          2951
OTHER         117
Name: count, dtype: int64

In [450]:
encoded = pd.get_dummies(df['person_home_ownership'], dtype=int)

df = pd.concat([df, encoded], axis=1)

df = df.drop(['person_home_ownership'], axis=1)

#### Encode Loan Intent (using OneHotEncoding)

In [452]:
df.loan_intent.value_counts()

loan_intent
EDUCATION            9153
MEDICAL              8548
VENTURE              7819
PERSONAL             7552
DEBTCONSOLIDATION    7145
HOMEIMPROVEMENT      4783
Name: count, dtype: int64

In [453]:
encoded = pd.get_dummies(df['loan_intent'], dtype=int)
df = pd.concat([df, encoded], axis=1)

df = df.drop(['loan_intent'], axis=1)

In [454]:
df

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,...,MORTGAGE,OTHER,OWN,RENT,DEBTCONSOLIDATION,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE
0,22.0,female,Master,71948.0,0,35000.0,16.02,0.49,3.0,561,...,0,0,0,1,0,0,0,0,1,0
1,21.0,female,High School,12282.0,0,1000.0,11.14,0.08,2.0,504,...,0,0,1,0,0,1,0,0,0,0
2,25.0,female,High School,12438.0,3,5500.0,12.87,0.44,3.0,635,...,1,0,0,0,0,0,0,1,0,0
3,23.0,female,Bachelor,79753.0,0,35000.0,15.23,0.44,2.0,675,...,0,0,0,1,0,0,0,1,0,0
4,24.0,male,Master,66135.0,1,35000.0,14.27,0.53,4.0,586,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,27.0,male,Associate,47971.0,6,15000.0,15.66,0.31,3.0,645,...,0,0,0,1,0,0,0,1,0,0
44996,37.0,female,Associate,65800.0,17,9000.0,14.07,0.14,11.0,621,...,0,0,0,1,0,0,1,0,0,0
44997,33.0,male,Associate,56942.0,7,2771.0,10.02,0.05,10.0,668,...,0,0,0,1,1,0,0,0,0,0
44998,29.0,male,Bachelor,33164.0,4,12000.0,13.23,0.36,6.0,604,...,0,0,0,1,0,1,0,0,0,0


#### Encode previous_loan_defaults_on_file

In [456]:
df.previous_loan_defaults_on_file.value_counts()

previous_loan_defaults_on_file
Yes    22858
No     22142
Name: count, dtype: int64

In [457]:
previous_loan = {'No':0, "Yes":1}
df['has_previous_loan'] = df.previous_loan_defaults_on_file.map(previous_loan)

#### Drop String type columns after Encoding

In [459]:
df = df.drop(['person_education', 'person_gender', 'previous_loan_defaults_on_file'], axis=1)
df

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status,education_level,...,OTHER,OWN,RENT,DEBTCONSOLIDATION,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE,has_previous_loan
0,22.0,71948.0,0,35000.0,16.02,0.49,3.0,561,1,3,...,0,0,1,0,0,0,0,1,0,0
1,21.0,12282.0,0,1000.0,11.14,0.08,2.0,504,0,2,...,0,1,0,0,1,0,0,0,0,1
2,25.0,12438.0,3,5500.0,12.87,0.44,3.0,635,1,2,...,0,0,0,0,0,0,1,0,0,0
3,23.0,79753.0,0,35000.0,15.23,0.44,2.0,675,1,0,...,0,0,1,0,0,0,1,0,0,0
4,24.0,66135.0,1,35000.0,14.27,0.53,4.0,586,1,3,...,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,27.0,47971.0,6,15000.0,15.66,0.31,3.0,645,1,1,...,0,0,1,0,0,0,1,0,0,0
44996,37.0,65800.0,17,9000.0,14.07,0.14,11.0,621,1,1,...,0,0,1,0,0,1,0,0,0,0
44997,33.0,56942.0,7,2771.0,10.02,0.05,10.0,668,1,1,...,0,0,1,1,0,0,0,0,0,0
44998,29.0,33164.0,4,12000.0,13.23,0.36,6.0,604,1,0,...,0,0,1,0,1,0,0,0,0,0


In [460]:
df.columns

Index(['person_age', 'person_income', 'person_emp_exp', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
       'credit_score', 'loan_status', 'education_level', 'gender', 'MORTGAGE',
       'OTHER', 'OWN', 'RENT', 'DEBTCONSOLIDATION', 'EDUCATION',
       'HOMEIMPROVEMENT', 'MEDICAL', 'PERSONAL', 'VENTURE',
       'has_previous_loan'],
      dtype='object')

In [461]:
x = df.drop('loan_status', axis=1).values  
y = df['loan_status'].values

In [462]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=43)

#### Scale the Data

In [464]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train) 
x_test_scaled = scaler.transform(x_test) 

#### Create the Model

In [467]:
model = LogisticRegression(max_iter=1000)
model.fit(x_train_scaled, y_train)

In [468]:
score = model.score(x_test_scaled, y_test)
print(f"Model Accuracy: {score:2f}")

Model Accuracy: 0.896889


In [469]:
y_pred = model.predict(x_test_scaled)
y_pred

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

In [470]:
y_test

array([0, 1, 0, ..., 1, 0, 1], dtype=int64)