In [1]:
# Import libraries
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the diabetes data from the CSV file (path is relative to the
# working directory) and preview the first five rows.
diabetes_df = pd.read_csv(filepath_or_buffer="diabetes_dirty.csv")
diabetes_df.head(n=5)

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,PROGRESSION
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135


In [3]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   AGE          442 non-null    int64  
 1   SEX          442 non-null    int64  
 2   BMI          442 non-null    float64
 3   BP           442 non-null    float64
 4   S1           442 non-null    int64  
 5   S2           442 non-null    float64
 6   S3           442 non-null    float64
 7   S4           442 non-null    float64
 8   S5           442 non-null    float64
 9   S6           442 non-null    int64  
 10  PROGRESSION  442 non-null    int64  
dtypes: float64(6), int64(5)
memory usage: 38.1 KB


In [4]:
# Separate predictors from the target:
# X keeps every column except PROGRESSION, Y is the PROGRESSION column itself.
X = diabetes_df.drop(columns=["PROGRESSION"])
Y = diabetes_df["PROGRESSION"]

In [5]:
# Sanity check: X and Y should report the same number of rows.
X.shape, Y.shape

((442, 10), (442,))

In [6]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible from run to run.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Create each scaler and fit it on the training split only, so the scaling
# parameters are learned without looking at the test split.
min_max_scaler = MinMaxScaler().fit(Xtrain)
standard_scaler = StandardScaler().fit(Xtrain)

# Min-max scaling: apply the training-fitted scaler to both splits.
Xtrain_minmax_scaled, Xtest_minmax_scaled = (
    min_max_scaler.transform(split) for split in (Xtrain, Xtest)
)

# Standard scaling: same pattern with the standard scaler.
Xtrain_standard_scaled, Xtest_standard_scaled = (
    standard_scaler.transform(split) for split in (Xtrain, Xtest)
)

In [8]:
# Linear regression trained on the min-max scaled training features.
# The trailing fit(...) expression is kept so the cell still displays the
# fitted estimator.
diabetes_model1 = LinearRegression()
diabetes_model1.fit(X=Xtrain_minmax_scaled, y=Ytrain)

In [9]:
# Linear regression trained on the standard-scaled training features.
# The trailing fit(...) expression is kept so the cell still displays the
# fitted estimator.
diabetes_model2 = LinearRegression()
diabetes_model2.fit(X=Xtrain_standard_scaled, y=Ytrain)

In [10]:
# Show the fitted intercept and the per-feature coefficients of
# diabetes_model1 (the model trained on min-max scaled features).
print('Intercept: \n', diabetes_model1.intercept_)
print('Coefficients: \n', diabetes_model1.coef_)

Intercept: 
 46.73897171076641
Coefficients: 
 [  -6.32774499  -23.17892315  109.1793695    68.3582411  -166.86817496
   75.23005763    6.37812692   35.19701916  146.7449625     9.62489064]


In [11]:
# Show the fitted intercept and the per-feature coefficients of
# diabetes_model2 (the model trained on standard-scaled features).
print('Intercept: \n', diabetes_model2.intercept_)
print('Coefficients: \n', diabetes_model2.coef_)

Intercept: 
 156.23389640586996
Coefficients: 
 [ -1.55852796 -11.32228679  23.52851308  15.11411951 -32.65123587
  15.97349396   1.21287304   7.75722292  30.33972216   1.7098051 ]


In [12]:
# Predict on the test split; each model receives the test features scaled
# with the same scaler that produced its training data.
y_pred1 = diabetes_model1.predict(X=Xtest_minmax_scaled)
y_pred2 = diabetes_model2.predict(X=Xtest_standard_scaled)

In [13]:
# Test-set R-squared (coefficient of determination) for diabetes_model1,
# printed as a whole-number percentage.
R_sq = r2_score(y_true=Ytest, y_pred=y_pred1)
print("R-squared is for diabetes_model1 ", round(100 * R_sq), "%")

R-squared is for diabetes_model1  33 %


In [14]:
# Test-set R-squared (coefficient of determination) for diabetes_model2,
# printed as a whole-number percentage.
R_sq = r2_score(y_true=Ytest, y_pred=y_pred2)
print("R-squared is for diabetes_model2 ", round(100 * R_sq), "%")

R-squared is for diabetes_model2  33 %


In [None]:
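# An R-squared of about 33% on the test set means each model explains only
# roughly a third of the variance in PROGRESSION. Both models report the same
# score because min-max and standard scaling are affine rescalings of the
# features: they change the fitted intercept and coefficients, but not the
# predictions of an ordinary least-squares regression.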