In [3]:
import pandas as pd

# Only these fields are requested from the raw file; every other column is
# skipped by the CSV parser via `usecols`.
columns_needed = [
    'loan_amnt', 'annual_inc', 'loan_status', 'term',
    'emp_length', 'home_ownership', 'purpose',
]

# Read at most the first 10,000 data rows of the selected columns.
df = pd.read_csv(
    filepath_or_buffer="lending_club.csv",
    usecols=columns_needed,
    nrows=10000,
)
df.head()


Unnamed: 0,loan_amnt,term,emp_length,home_ownership,annual_inc,loan_status,purpose
0,3600.0,36 months,10+ years,MORTGAGE,55000.0,Fully Paid,debt_consolidation
1,24700.0,36 months,10+ years,MORTGAGE,65000.0,Fully Paid,small_business
2,20000.0,60 months,10+ years,MORTGAGE,63000.0,Fully Paid,home_improvement
3,35000.0,60 months,10+ years,MORTGAGE,110000.0,Current,debt_consolidation
4,10400.0,60 months,3 years,MORTGAGE,104433.0,Fully Paid,major_purchase


In [4]:
# Give the two numeric fields friendlier names; `df` is modified in place.
df.rename(
    columns=dict(loan_amnt='loan_amount', annual_inc='income'),
    inplace=True,
)


In [5]:
# Discard any row that is missing at least one of the three required fields.
df.dropna(
    axis=0,
    how='any',
    subset=['loan_amount', 'income', 'loan_status'],
    inplace=True,
)


In [6]:
# Binary target: 1 when the status text contains the literal 'Fully Paid',
# 0 for every other status (plain substring match, not a regex).
# NOTE(review): this also maps still-running statuses such as 'Current' to 0 —
# confirm that is the intended label for loans that have not finished yet.
df['loan_status'] = (
    df['loan_status']
    .str.contains('Fully Paid', regex=False)
    .astype('int64')
)


In [8]:
import numpy as np

# Synthetic (made-up) features. A seeded generator is used so that the random
# columns — and therefore the exported CSV — are identical on every
# Restart & Run All; change the seed deliberately to draw a different sample.
rng = np.random.default_rng(42)

df['vehicle_type'] = rng.choice(['bike', 'car'], size=len(df))
# Fake units in the range 100–599: the upper bound (600) is exclusive.
df['monthly_units'] = rng.integers(100, 600, size=len(df))


In [9]:
def get_eco_score(vehicle, units):
    """Return an eco score of 0, 1 or 2.

    One point is awarded when ``vehicle`` equals ``'bike'`` and one more when
    ``units`` is strictly below 300.
    """
    is_bike = vehicle == 'bike'
    low_usage = units < 300
    return int(is_bike) + int(low_usage)

# Score every row from its (vehicle_type, monthly_units) pair. Walking the two
# columns directly avoids DataFrame.apply(axis=1), which materialises a Series
# for each row and hands back a DataFrame rather than per-row scores when the
# frame has no rows.
df['eco_score'] = [
    get_eco_score(vehicle, units)
    for vehicle, units in zip(df['vehicle_type'], df['monthly_units'])
]


In [10]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping, then overwrite the text column with the
# integer codes. LabelEncoder sorts its classes, so bike=0 and car=1.
le = LabelEncoder()
le.fit(df['vehicle_type'])
df['vehicle_type'] = le.transform(df['vehicle_type'])


In [11]:
# Keep only the model inputs plus the target, in the order they are exported.
df_final = df.loc[
    :,
    [
        'income',
        'loan_amount',
        'monthly_units',
        'vehicle_type',
        'eco_score',
        'loan_status',
    ],
]
df_final.head()


Unnamed: 0,income,loan_amount,monthly_units,vehicle_type,eco_score,loan_status
0,55000.0,3600.0,533,1,0,1
1,65000.0,24700.0,540,1,0,1
2,63000.0,20000.0,416,0,1,1
3,110000.0,35000.0,237,0,2,0
4,104433.0,10400.0,220,0,2,1


In [12]:
# Write the cleaned frame to disk without the row index column.
df_final.to_csv(
    path_or_buf="clean_midoriloan_data.csv",
    index=False,
)
