In [7]:
import pandas as pd
import altair as alt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Read the tab-separated diabetes table and give the serum columns S1-S6
# the names used by the rest of the notebook.
serum_column_names = {
    "S1": "TC",
    "S2": "LDL",
    "S3": "HDL",
    "S4": "TCH",
    "S5": "LTG",
    "S6": "GLU",
}
df = pd.read_csv("diabetes.tab.tsv", sep="\t").rename(columns=serum_column_names)

# Single predictor (BMI, kept as a one-column frame) and the target column Y.
X = df[["BMI"]]
y = df["Y"]

# An integer test_size is an absolute row count; with shuffle=False the rows
# are split in file order, so the final 20 rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=20,
    shuffle=False,
)

# Fit ordinary least squares on the training rows, then predict the held-out rows.
regressor = LinearRegression().fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Report the held-out error and R^2.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean squared error: {mse:.2f}")
print(f"Coefficient of determination: {r2:.2f}")

def _split_frame(features, target, label, predicted):
    """Assemble the plotting frame for one split.

    features  -- one-column BMI frame for the split
    target    -- observed Y values for the same rows
    label     -- value written to the "Set" column ("Train" or "Test")
    predicted -- model predictions for the same rows
    """
    return pd.DataFrame(
        {
            "BMI": features.squeeze(),  # one-column frame -> Series
            "Disease Progression": target,
            "Set": label,
            "Prediction": predicted,
        }
    )


# One frame per split, stacked into a single table for plotting.
train_df = _split_frame(X_train, y_train, "Train", regressor.predict(X_train))
test_df = _split_frame(X_test, y_test, "Test", y_pred)
full_df = pd.concat([train_df, test_df])

# Observed values as points, coloured by split.
points = alt.Chart(full_df).mark_point().encode(
    x=alt.X("BMI", title="BMI"),
    y=alt.Y("Disease Progression", title="Disease Progression"),
    color=alt.Color("Set", legend=alt.Legend(title="Dataset Type")),
    tooltip=["BMI", "Disease Progression", "Set"],
)

# Model predictions as a line over BMI, one line per split.
predictions = alt.Chart(full_df).mark_line().encode(
    x="BMI",
    y="Prediction",
    color=alt.Color("Set", legend=alt.Legend(title="Dataset Type")),
    detail="Set",
)

# Overlay both layers and give each split its own column facet.
chart = (
    (points + predictions)
    .facet(column="Set:N")
    .properties(title="Linear Regression Analysis of Diabetes Progression Based on BMI")
)

# A bare last expression makes the notebook render the chart.
chart

Mean squared error: 2548.07
Coefficient of determination: 0.47


In [8]:
# Show the fitted coefficient array of the current `regressor`
# (one entry per feature the model was trained on).
regressor.coef_

array([10.11244095])

In [9]:
# R^2 of the current `regressor` evaluated on all of X and y. Training rows
# are included, so this is not a held-out estimate.
# NOTE(review): `X`, `y` and `regressor` are reassigned by the multi-feature
# cell further down; this result presumably came from the BMI-only model, so
# confirm before re-running cells out of order.
regressor.score(X, y)

0.3437719044640575

In [26]:
import pandas as pd
import altair as alt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the dataset and give the serum columns S1-S6 descriptive names.
# rename() returns a new frame, so nothing is mutated in place and the cell
# can be re-run safely.
df = pd.read_csv("diabetes.tab.tsv", sep="\t").rename(
    columns={
        "S1": "TC",
        "S2": "LDL",
        "S3": "HDL",
        "S4": "TCH",
        "S5": "LTG",
        "S6": "GLU",
    }
)

# Predictors used by the multi-feature model, and the target column.
features = ["AGE", "SEX", "BMI", "BP", "TC", "LDL", "HDL", "TCH", "LTG", "GLU"]
X = df[features]
y = df["Y"]

# Split BEFORE any statistics are computed from the data. An integer
# test_size is an absolute row count, and shuffle=False splits in file order,
# so the final 20 rows are held out. The split is deterministic, which is why
# no random_state is passed (it has no effect when shuffle=False).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=20, shuffle=False
)

# Impute missing values with the TRAINING column means only, and reuse those
# means for the test rows. Computing the means on the full data set would
# leak information from the held-out rows into the model.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Standardise features; the scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the Linear Regression model
regressor = LinearRegression()
regressor.fit(X_train_scaled, y_train)

# Predict on the held-out rows
y_pred_test = regressor.predict(X_test_scaled)

# Held-out mean squared error and coefficient of determination
mse = mean_squared_error(y_test, y_pred_test)
r2 = r2_score(y_test, y_pred_test)

print(f"Mean squared error: {mse:.2f}")
print(f"Coefficient of determination: {r2:.2f}")

# Prepare data for visualization: one row per sample with its BMI, observed
# target, split label, and the multi-feature model's prediction.
train_df = pd.DataFrame(
    {
        "BMI": X_train["BMI"].values,
        "Disease Progression": y_train,
        "Set": "Train",
        "Prediction": regressor.predict(X_train_scaled),
    }
)

test_df = pd.DataFrame(
    {
        "BMI": X_test["BMI"].values,
        "Disease Progression": y_test,
        "Set": "Test",
        "Prediction": y_pred_test,
    }
)

full_df = pd.concat([train_df, test_df])

# Both layers share the same data object and x encoding.
base = alt.Chart(full_df).encode(x=alt.X("BMI", title="BMI"))

# Observed values, coloured by split.
points = base.mark_point().encode(
    y=alt.Y("Disease Progression", title="Disease Progression"),
    color=alt.Color("Set:N", legend=alt.Legend(title="Dataset Type")),
    tooltip=["BMI", "Disease Progression", "Prediction", "Set"],
)

# Predictions of the trained model. The model uses every feature, so its
# predictions are not a straight line in BMI and are drawn as individual
# marks. The previous transform_regression layer fitted a separate BMI-only
# regression inside the chart and never showed the trained model.
predictions = base.mark_point(
    shape="cross", color="black", filled=True, opacity=0.6
).encode(
    # Same axis title as the observed layer so the shared y axis has one label.
    y=alt.Y("Prediction", title="Disease Progression"),
    tooltip=["BMI", "Prediction", "Set"],
)

# Overlay observed and predicted values, one column facet per split.
chart = (
    alt.layer(points, predictions)
    .facet(column="Set:N")
    .properties(
        title=(
            "Multi-Feature Linear Regression: Observed (circles) vs. "
            "Predicted (crosses) Diabetes Progression by BMI"
        )
    )
)

chart

Mean squared error: 2004.52
Coefficient of determination: 0.59
