In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Location of the raw loans CSV. NOTE(review): '/content/' looks like a Colab
# working directory -- adjust the path when running elsewhere.
file_path = '/content/loans data.csv'

# Read the whole file into a DataFrame, then show the first five rows as a
# quick sanity check of the columns and value formats.
data = pd.read_csv(filepath_or_buffer=file_path)
print(data.head(n=5))

        ID  Amount.Requested  Amount.Funded.By.Investors  Interest.Rate  \
0  81174.0             20000                     20000.0           0.09   
1  99592.0             19200                     19200.0           0.12   
2  80059.0             35000                     35000.0           0.22   
3  15825.0             10000                      9975.0           0.10   
4  33182.0             12000                     12000.0           0.12   

  Loan.Length        Loan.Purpose  Debt.To.Income.Ratio Home.Ownership  \
0   36 months  debt_consolidation                  0.15       MORTGAGE   
1   36 months  debt_consolidation                  0.28       MORTGAGE   
2   60 months  debt_consolidation                  0.24       MORTGAGE   
3   36 months  debt_consolidation                  0.14       MORTGAGE   
4   36 months         credit_card                  0.19           RENT   

   Monthly.Income  Open.CREDIT.Lines  Revolving.CREDIT.Balance  \
0         6541.67                 14  

**Finding missing values**

In [None]:
# Per-column count of missing entries: isna() flags each missing cell and the
# column-wise sum turns those flags into counts.
null_values = data.isna().sum(axis=0)
print(null_values)

ID                                 1
Amount.Requested                   0
Amount.Funded.By.Investors         0
Interest.Rate                      0
Loan.Length                        1
Loan.Purpose                       1
Debt.To.Income.Ratio               0
Home.Ownership                     1
Monthly.Income                     0
Open.CREDIT.Lines                  0
Revolving.CREDIT.Balance           0
Inquiries.in.the.Last.6.Months     0
Employment.Length                 78
dtype: int64


**Drop the 'ID' column as it is irrelevant for model training**


In [None]:
# Remove the 'ID' column before modelling. drop() returns a new frame, so the
# raw `data` frame is left unchanged.
data_cleaned = data.drop(labels='ID', axis='columns')

**Drop rows with missing values in 'Loan.Length', 'Loan.Purpose', and 'Home.Ownership' as these are important categorical features**

In [None]:
# Discard every row that lacks a value in at least one of these three
# categorical columns; missing values in other columns do not affect this step.
data_cleaned = data_cleaned.dropna(
    axis='index',
    how='any',
    subset=['Loan.Length', 'Loan.Purpose', 'Home.Ownership'],
)


**Fill missing values in 'Employment.Length' with 'Unknown' as it is categorical**

In [None]:
# Missing Employment.Length values become their own 'Unknown' category instead
# of being dropped, so those rows stay available for training.
# The fill goes through DataFrame.fillna with a per-column mapping, which
# returns a new frame rather than writing a column into a row-filtered one.
# That keeps the cell safe to re-run and avoids pandas' chained-assignment
# warnings; all other columns are passed through untouched.
data_cleaned = data_cleaned.fillna({'Employment.Length': 'Unknown'})

**Encoding categorical variables using one-hot encoding**

In [None]:
# Expand each listed categorical column into indicator columns. With
# drop_first=True the first level of every column is omitted, leaving k-1
# indicators for k levels; columns not listed here pass through unchanged.
data_encoded = pd.get_dummies(
    data=data_cleaned,
    columns=[
        'Loan.Length',
        'Loan.Purpose',
        'Home.Ownership',
        'Employment.Length',
    ],
    drop_first=True,
)

**Defining the features (X) and target (y)**

In [None]:
# Target: the interest rate column of the encoded frame.
y = data_encoded['Interest.Rate']

# Features: every other column of the encoded frame.
X = data_encoded.drop(labels='Interest.Rate', axis='columns')

**Splitting the data into training and testing sets (80% training, 20% testing)**

In [None]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)


**Initializing and training the linear regression model**

In [None]:
# Ordinary least-squares regression with scikit-learn's default settings,
# fitted on the training portion only.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

**Making predictions on the test set**

In [None]:
# Predict interest rates for the held-out rows; y_pred is reused by the
# evaluation cell further down.
y_pred = model.predict(X=X_test)
print(y_pred)

[0.1301618  0.10087657 0.09404343 0.12799945 0.14254393 0.1226833
 0.09970528 0.14715062 0.14755332 0.09250446 0.12271389 0.11084565
 0.12259519 0.10792097 0.11871106 0.15348612 0.12710749 0.17071103
 0.14649595 0.11355961 0.14083473 0.09944699 0.10054619 0.16995897
 0.12431173 0.12687008 0.10626968 0.10003287 0.13062472 0.11235767
 0.12595791 0.10665257 0.12867906 0.12506118 0.12258161 0.15984266
 0.11480989 0.12429209 0.13402642 0.1096793  0.1298854  0.13534477
 0.11216849 0.16379769 0.09720712 0.09949365 0.12112806 0.12905655
 0.12855025 0.18681449 0.11311212 0.12351596 0.09323505 0.11849533
 0.12432211 0.16645034 0.12530094 0.11822768 0.11679013 0.12661977
 0.13211075 0.11573349 0.10845464 0.11505601 0.11943855 0.13208824
 0.10938434 0.15570203 0.10292625 0.14039263 0.15174482 0.13129531
 0.12678348 0.17119169 0.13063613 0.11808126 0.12529989 0.1132054
 0.11443544 0.12571701 0.12049337 0.16368485 0.1476713  0.12806175
 0.15010063 0.16069646 0.12624574 0.12050158 0.11050005 0.135075

**Evaluating the model**

In [None]:
# Compare held-out targets with the predictions: MSE measures the average
# squared error, R² the share of target variance the model explains.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared Score (R²): {r2}")

Mean Squared Error (MSE): 0.0012509869148484556
R-squared Score (R²): 0.28425643200533435
