# Scikit-learn Linear Regression Model
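This notebook trains a scikit-learn linear regression model on newborn screening data: it predicts `case_count` from the other columns of the sheet (including the number screened and one-hot encoded region and disease) and reports R² on a held-out 20% test split.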

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
# Load the "source" sheet of the screening workbook
df = pd.read_excel("newborn.xlsx", sheet_name="source")

# Primary cleaning: normalise the headers (trim surrounding whitespace,
# lower-case, turn embedded line breaks into underscores) so the column
# names used below match. regex=False makes the literal replacement explicit
# instead of relying on the pandas-version-dependent default.
df.columns = (
    df.columns.str.strip().str.lower().str.replace('\n', '_', regex=False)
)

# Drop rows that are missing any of the four fields used below.
# (The former identity rename of these columns was a no-op and was removed.)
df = df.dropna(subset=['case_count', 'number_screened', 'region', 'disease'])

In [None]:
# One-hot encode the categorical columns; drop_first=True keeps k-1
# indicator columns for each k-level categorical.
df_dummies = pd.get_dummies(df, columns=['region', 'disease'], drop_first=True)

# Target is case_count; every other column of the encoded frame is a feature.
# NOTE(review): this includes any extra columns present in the sheet, not
# just number_screened and the dummies — confirm they are all numeric and
# free of NaN, otherwise the fit will fail.
X = df_dummies.drop(columns='case_count')
y = df_dummies['case_count']

# Hand sklearn plain NumPy arrays. A DataFrame already converts to a 2-D
# array, so only y needs reshaping; it is kept as a single (n, 1) column so
# the shapes of the fitted coefficients and predictions stay as before.
X = X.to_numpy()
y = y.to_numpy().reshape(-1, 1)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit ordinary least squares on the training portion (fit() returns the
# estimator itself, so construction and fitting can be chained).
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = model.predict(X_test)

In [None]:
# Report goodness of fit on the held-out split, then the fitted parameters
test_r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R² score:", test_r2)

intercept, coefficients = model.intercept_, model.coef_
print("Intercept:", intercept)
print("Coefficients:", coefficients)

In [None]:
# Optional diagnostic: predicted against actual values on the test split.
# Points on the dashed red diagonal are exact predictions.
plt.scatter(y_test, y_pred, alpha=0.6)

# Diagonal reference line spanning the observed range of actual values
lo, hi = y_test.min(), y_test.max()
plt.plot([lo, hi], [lo, hi], 'r--')

plt.title("Actual vs Predicted")
plt.xlabel("Actual Case Count")
plt.ylabel("Predicted Case Count")
plt.show()