In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the diabetes dataset from the CSV file in the working directory.
csv_path = 'diabetes_updated.csv'
df = pd.read_csv(csv_path)

# Preview the first five rows (head() returns five rows by default).
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


I believe the independent variables are as follows:
- Pregnancies
- Glucose
- Blood Pressure
- Skin Thickness
- Insulin
- BMI
- Diabetes Pedigree Function
- Age

The dependent variable is 'Outcome'.

In [3]:
# Names of the predictor (independent) columns
feature_columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# X holds the predictors, y holds the target (dependent) column
X = df[feature_columns]
y = df['Outcome']

# Preview the first rows of the predictors, then of the target
for preview in (X.head(), y.head()):
    print(preview)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  
0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split

# Predictor columns that make up X
predictor_names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
X = df[predictor_names]

# Target column
y = df['Outcome']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the dimensions of each piece of the split
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (614, 8)
X_test shape: (154, 8)
y_train shape: (614,)
y_test shape: (154,)


In [7]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise: every predictor except 'Pregnancies'
features_to_scale = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

scaler = StandardScaler()

# Work on copies so X_train and X_test keep their unscaled values
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Fit the scaler on the training rows only, then reuse the fitted scaler
# to transform the test rows.
train_scaled_values = scaler.fit_transform(X_train[features_to_scale])
test_scaled_values = scaler.transform(X_test[features_to_scale])

# Write the scaled values back into the selected columns of each copy
X_train_scaled[features_to_scale] = train_scaled_values
X_test_scaled[features_to_scale] = test_scaled_values

# Show the first rows of the scaled training set, then of the scaled test set
print(X_train_scaled.head())
print(X_test_scaled.head())

     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
60             2 -1.151398      -3.752683      -1.322774 -0.701206 -4.135256   
618            9 -0.276643       0.680345       0.233505 -0.701206 -0.489169   
346            1  0.566871      -1.265862      -0.090720  0.013448 -0.424522   
294            0  1.254179      -1.049617      -1.322774 -0.701206 -1.303720   
231            6  0.410665       0.572222       1.076490  2.484601  1.838121   

     DiabetesPedigreeFunction       Age  
60                  -0.490735 -1.035940  
618                  2.415030  1.487101  
346                  0.549161 -0.948939  
294                 -0.639291  2.792122  
231                 -0.686829  1.139095  
     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
668            6 -0.714020      -0.617127       0.817110  0.934749  0.260736   
324            2 -0.276643       0.301916       0.752265 -0.701206  0.480535   
624            2 -0.401608 

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Create the linear regression model and fit it on the scaled training data
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predictions for the training set and for the held-out test set
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# Mean squared error and R-squared on each split
mse_train = mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Print the evaluation metrics
print(f"Training set - Mean Squared Error: {mse_train}, R-squared: {r2_train}")
print(f"Test set - Mean Squared Error: {mse_test}, R-squared: {r2_test}")

# Print the intercept and coefficients.
# Each coefficient is labelled with the column names of the frame the model
# was fitted on (X_train_scaled), not a separately built frame, so the labels
# cannot drift out of step with model.coef_ if the column order ever changes.
print(f"Intercept: {model.intercept_}")
print("Coefficients:")
for feature, coef in zip(X_train_scaled.columns, model.coef_):
    print(f"{feature}: {coef}")

Training set - Mean Squared Error: 0.15744485172625472, R-squared: 0.30506972801106247
Test set - Mean Squared Error: 0.17104527280850104, R-squared: 0.25500281176741746
Intercept: 0.3077265865771336
Coefficients:
Pregnancies: 0.010468179217423853
Glucose: 0.18032340480404005
BloodPressure: -0.04219338933075192
SkinThickness: 0.008205625692770401
Insulin: -0.03230381473499022
BMI: 0.11631364474886582
DiabetesPedigreeFunction: 0.03744792774157054
Age: 0.07425472985187122
