In [20]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Read in data 
df = pd.read_csv("diabetes_updated.csv")

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [21]:
# Differentiate between independent and dependent variables

X = df.drop(columns=['Age'])  # Independent variables
y = df['Outcome']  # Dependent variable


In [22]:
# Generate training and test sets comprising 80% and 20% of the data respectively
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print shapes to verify
# test size is 20% meaning the remaining 80% is the train set
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


Shape of X_train: (614, 8)
Shape of X_test: (154, 8)
Shape of y_train: (614,)
Shape of y_test: (154,)


In [23]:
# Carefully analyse the types of features in this dataset. 
# Identify which features require us to use StandardScaler from sklearn.preprocessing. 
# Use StandardScaler on the appropriate features of the train set and test sets.

# Check the column names in the training set
print(X_train.columns)

# Identify the features that require scaling
features_to_scale = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI']
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler to the training data and transform the appropriate features
X_train_scaled = X_train.copy()  # Create a copy of the training set
X_train_scaled[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])

# Transform the corresponding features in the test set
X_test_scaled = X_test.copy()  # Create a copy of the test set
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])


Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Outcome'],
      dtype='object')


In [24]:
#  Generate a multiple linear regression model using the training set. Use all of the independent variables

# Initialize the linear regression model
regression_model = LinearRegression()

# Fit the model on the training set
regression_model.fit(X_train_scaled, y_train)

# Print the coefficients and intercept
print("Coefficients:", regression_model.coef_)
print("Intercept:", regression_model.intercept_)

Coefficients: [-4.17801656e-17  7.11236625e-17  1.88109077e-16  2.95987193e-17
 -3.46944695e-18 -2.03375999e-16  2.07960141e-16  1.00000000e+00]
Intercept: 1.6653345369377348e-16


In [25]:
#  Generate predictions for the test set. Compare the values used to make these predictions to the ones in the original diabetes dataset. 
# What needs to be done to make the interpretation of our predictions more meaningful?

# Generate predictions for the test set
y_pred = regression_model.predict(X_test_scaled)

# Create a DataFrame to compare predicted and actual values
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(predictions_df.head())


     Actual     Predicted
668       0 -8.496324e-16
324       0  7.540521e-17
624       0  2.416152e-17
690       0  2.421561e-16
473       0  1.763611e-16


In [26]:
#  Compute R-squared for your model on the test set. 
# You can use r2_score from sklearn.metrics to obtain this score

r_squared = r2_score(y_test, y_pred)
print("R-squared on test set:", r_squared)

R-squared on test set: 1.0
