In [54]:
# Import the libraries used in this notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [55]:
# Load the dataset from "diabetes.csv" (path is relative to the working directory)
data = pd.read_csv("diabetes.csv")
# Display the first few rows of the DataFrame for inspection
data.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Progression
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [56]:
# Print a summary of the DataFrame: column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          442 non-null    float64
 1   sex          442 non-null    float64
 2   bmi          442 non-null    float64
 3   bp           442 non-null    float64
 4   s1           442 non-null    float64
 5   s2           442 non-null    float64
 6   s3           442 non-null    float64
 7   s4           442 non-null    float64
 8   s5           442 non-null    float64
 9   s6           442 non-null    float64
 10  Progression  442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


In [57]:
# Target (dependent variable): the 'Progression' column only
y = data['Progression']

# Features (independent variables): every column other than 'Progression'
X = data.drop(columns='Progression')

In [58]:
# Hold out 20% of the rows for testing and keep the remaining 80% for training.
# random_state is pinned so that re-running the cell yields the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Sanity check: report the shapes of the resulting partitions
print("Training data:", X_train.shape, y_train.shape)
print("Test data:", X_test.shape, y_test.shape)

Training data: (353, 10) (353,)
Test data: (89, 10) (89,)


In [59]:
# Standardization: fit on the training features only, then apply the same
# fitted transformation to the test features
scaler = StandardScaler()
X_train_scaled_standard = scaler.fit_transform(X_train)
X_test_scaled_standard = scaler.transform(X_test)

# Min-max scaling: same pattern, fit on the training features and reuse the
# fitted scaler for the test features
min_max_scaler = MinMaxScaler()
X_train_scaled_minmax = min_max_scaler.fit_transform(X_train)
X_test_scaled_minmax = min_max_scaler.transform(X_test)

In [60]:
# Build a linear regression model and fit it in one step on the standardized
# training features (all independent variables); fit() returns the estimator
diabetes_model = LinearRegression().fit(X_train_scaled_standard, y_train)

# Show the learned parameters: the intercept and one coefficient per feature
print("Intercept:", diabetes_model.intercept_)
print("Coefficients:", diabetes_model.coef_)

Intercept: 153.73654390934846
Coefficients: [  1.75375799 -11.51180908  25.60712144  16.82887167 -44.44885564
  24.64095356   7.67697768  13.1387839   35.16119521   2.35136365]


In [61]:
# Generate predictions for the test set using the trained model; the input is
# the test feature matrix transformed by the already-fitted StandardScaler
y_pred = diabetes_model.predict(X_test_scaled_standard)
# Bare last expression so the notebook displays the predicted values
y_pred

array([139.5475584 , 179.51720835, 134.03875572, 291.41702925,
       123.78965872,  92.1723465 , 258.23238899, 181.33732057,
        90.22411311, 108.63375858,  94.13865744, 168.43486358,
        53.5047888 , 206.63081659, 100.12925869, 130.66657085,
       219.53071499, 250.7803234 , 196.3688346 , 218.57511815,
       207.35050182,  88.48340941,  70.43285917, 188.95914235,
       154.8868162 , 159.36170122, 188.31263363, 180.39094033,
        47.99046561, 108.97453871, 174.77897633,  86.36406656,
       132.95761215, 184.53819483, 173.83220911, 190.35858492,
       124.4156176 , 119.65110656, 147.95168682,  59.05405241,
        71.62331856, 107.68284704, 165.45365458, 155.00975931,
       171.04799096,  61.45761356,  71.66672581, 114.96732206,
        51.57975523, 167.57599528, 152.52291955,  62.95568515,
       103.49741722, 109.20751489, 175.64118426, 154.60296242,
        94.41704366, 210.74209145, 120.2566205 ,  77.61585399,
       187.93203995, 206.49337474, 140.63167076, 105.59

In [62]:
# Evaluate the model on the held-out test set with the R-squared score,
# comparing the true targets against the predictions
r2_test = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared score on test set:", r2_test)

R-squared score on test set: 0.4526027629719189
