In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Load data
data = pd.read_csv('loansdata.csv')

# Print column names to check for 'Loan.Length'
print(data.columns.tolist())

# Strip any leading/trailing spaces from column names
data.columns = data.columns.str.strip()

# Clean 'Loan.Length' column
data['Loan.Length'] = data['Loan.Length'].str.replace(' months', '').str.strip()

# Handle NaN values in 'Loan.Length'
data['Loan.Length'] = data['Loan.Length'].fillna('0')  # Example: filling NaNs with '0'
data = data[data['Loan.Length'].str.isnumeric()]
data['Loan.Length'] = data['Loan.Length'].astype(int)

# Encode categorical variables
data = pd.get_dummies(data, drop_first=True)

# Define features and target variable
X = data.drop('Interest.Rate', axis=1)
y = data['Interest.Rate']

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R-squared: {r2}')


['ID', 'Amount.Requested', 'Amount.Funded.By.Investors', 'Interest.Rate', 'Loan.Length', 'Loan.Purpose', 'Debt.To.Income.Ratio', 'Home.Ownership', 'Monthly.Income', 'Open.CREDIT.Lines', 'Revolving.CREDIT.Balance', 'Inquiries.in.the.Last.6.Months', 'Employment.Length']
MAE: 0.028383801534704947
MSE: 0.0011860717522526349
R-squared: 0.32761091580166446


In [10]:
data = pd.get_dummies(data, drop_first=True)

In [11]:
data.head()

Unnamed: 0,ID,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Debt.To.Income.Ratio,Monthly.Income,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Loan.Length_36 months,...,Home.Ownership_RENT,Employment.Length_10+ years,Employment.Length_2 years,Employment.Length_3 years,Employment.Length_4 years,Employment.Length_5 years,Employment.Length_6 years,Employment.Length_7 years,Employment.Length_8 years,Employment.Length_9 years
0,81174.0,20000,20000.0,0.09,0.15,6541.67,14,14272,2,1,...,0,0,0,0,0,0,0,0,0,0
1,99592.0,19200,19200.0,0.12,0.28,4583.33,12,11140,1,1,...,0,0,1,0,0,0,0,0,0,0
2,80059.0,35000,35000.0,0.22,0.24,11500.0,14,21977,1,0,...,0,0,1,0,0,0,0,0,0,0
3,15825.0,10000,9975.0,0.1,0.14,3833.33,10,9346,0,1,...,0,0,0,0,0,1,0,0,0,0
4,33182.0,12000,12000.0,0.12,0.19,3195.0,11,14469,0,1,...,1,0,0,0,0,0,0,0,0,1


In [13]:
data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
2495    False
2496    False
2497    False
2498    False
2499    False
Length: 2500, dtype: bool

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 37 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID                               2499 non-null   float64
 1   Amount.Requested                 2500 non-null   int64  
 2   Amount.Funded.By.Investors       2500 non-null   float64
 3   Interest.Rate                    2500 non-null   float64
 4   Debt.To.Income.Ratio             2500 non-null   float64
 5   Monthly.Income                   2500 non-null   float64
 6   Open.CREDIT.Lines                2500 non-null   int64  
 7   Revolving.CREDIT.Balance         2500 non-null   int64  
 8   Inquiries.in.the.Last.6.Months   2500 non-null   int64  
 9   Loan.Length_36 months            2500 non-null   uint8  
 10  Loan.Length_60 months            2500 non-null   uint8  
 11  Loan.Purpose_credit_card         2500 non-null   uint8  
 12  Loan.Purpose_debt_co

In [17]:
cat_col = [col for col in data.columns if data[col].dtype == 'object']
print('Categorical columns :',cat_col)
# Numerical columns
num_col = [col for col in data.columns if data[col].dtype != 'object']
print('Numerical columns :',num_col)

Categorical columns : []
Numerical columns : ['ID', 'Amount.Requested', 'Amount.Funded.By.Investors', 'Interest.Rate', 'Debt.To.Income.Ratio', 'Monthly.Income', 'Open.CREDIT.Lines', 'Revolving.CREDIT.Balance', 'Inquiries.in.the.Last.6.Months', 'Loan.Length_36 months', 'Loan.Length_60 months', 'Loan.Purpose_credit_card', 'Loan.Purpose_debt_consolidation', 'Loan.Purpose_educational', 'Loan.Purpose_home_improvement', 'Loan.Purpose_house', 'Loan.Purpose_major_purchase', 'Loan.Purpose_medical', 'Loan.Purpose_moving', 'Loan.Purpose_other', 'Loan.Purpose_renewable_energy', 'Loan.Purpose_small_business', 'Loan.Purpose_vacation', 'Loan.Purpose_wedding', 'Home.Ownership_NONE', 'Home.Ownership_OTHER', 'Home.Ownership_OWN', 'Home.Ownership_RENT', 'Employment.Length_10+ years', 'Employment.Length_2 years', 'Employment.Length_3 years', 'Employment.Length_4 years', 'Employment.Length_5 years', 'Employment.Length_6 years', 'Employment.Length_7 years', 'Employment.Length_8 years', 'Employment.Length_9 

In [18]:
data[cat_col].nunique()

Series([], dtype: float64)