In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np

# Cell purpose: fit a one-feature linear regression of orders on newspaper ads
# and express every prediction as "baseline + contribution of the feature",
# where the baseline is the mean of the observed target.

# Residuals within this absolute tolerance are labelled 'Exact'. Comparing a
# floating-point residual with 0 exactly would almost never match.
EXACT_TOL = 1e-9

# --- Data -------------------------------------------------------------------
data = pd.DataFrame({
    'Newspaper_Ads': [1, 2, 3, 1, 2],
    'Orders_Received': [35, 48, 60, 40, 50]
})
X = data[['Newspaper_Ads']]
y = data['Orders_Received']

# --- Fit --------------------------------------------------------------------
model = LinearRegression()
model.fit(X, y)

slope = model.coef_[0]
intercept = model.intercept_

# --- Additive breakdown -----------------------------------------------------
baseline = y.mean()
data['Predicted'] = model.predict(X)
data['SHAP_Value'] = data['Predicted'] - baseline
data['Final_Prediction_Check'] = baseline + data['SHAP_Value']

# With a single feature and the model evaluated on its own training rows, the
# contribution must equal slope * (x - mean(x)). Checking that here guards the
# choice of y.mean() as the baseline instead of only printing the columns.
expected_contribution = slope * (X['Newspaper_Ads'] - X['Newspaper_Ads'].mean())
assert np.allclose(data['SHAP_Value'], expected_contribution), \
    "SHAP_Value does not match slope * (x - mean(x))"

# --- Residuals --------------------------------------------------------------
# Error > 0 means the model predicted more orders than were received.
data['Error'] = data['Predicted'] - data['Orders_Received']
data['Prediction_Quality'] = data['Error'].apply(
    lambda err: 'Overprediction' if err > EXACT_TOL
    else 'Underprediction' if err < -EXACT_TOL
    else 'Exact'
)

# --- Report -----------------------------------------------------------------
print("\nLinear Regression Model:")
print(f"  Slope (coef): {slope:.2f}")
print(f"  Intercept: {intercept:.2f}")
print(f"  Baseline (mean of y): {baseline:.2f}\n")

print("Detailed Results:\n")
print(data)

# NOTE: this R² is computed on the same rows the model was fitted on.
r2 = r2_score(y, data['Predicted'])
print(f"\nModel Accuracy (R² Score): {r2:.2f}")

print("\nSummary:")
print("- The number of newspaper ads has a strong linear influence on orders.")
print("- SHAP values show how much each ad count moved the prediction from the baseline.")
print("- Overpredictions and underpredictions are noted based on actual vs predicted.")


Linear Regression Model:
  Slope (coef): 11.29
  Intercept: 26.29
  Baseline (mean of y): 46.60

Detailed Results:

   Newspaper_Ads  Orders_Received  Predicted  SHAP_Value  \
0              1               35  37.571429   -9.028571   
1              2               48  48.857143    2.257143   
2              3               60  60.142857   13.542857   
3              1               40  37.571429   -9.028571   
4              2               50  48.857143    2.257143   

   Final_Prediction_Check     Error Prediction_Quality  
0               37.571429  2.571429     Overprediction  
1               48.857143  0.857143     Overprediction  
2               60.142857  0.142857     Overprediction  
3               37.571429 -2.428571    Underprediction  
4               48.857143 -1.142857    Underprediction  

Model Accuracy (R² Score): 0.96

Summary:
- The number of newspaper ads has a strong linear influence on orders.
- SHAP values show how much each ad count moved the prediction fro

In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np

# Cell purpose: fit a two-feature linear regression of appointments on doctors
# available and reminders sent, then split each prediction into a baseline
# (mean of the target) plus one additive contribution per feature.

# Residuals within this absolute tolerance are labelled 'Exact'. Comparing a
# floating-point residual with 0 exactly would almost never match.
EXACT_TOL = 1e-9

# --- Data -------------------------------------------------------------------
data = pd.DataFrame({
    'Doctors_Available': [3, 2, 4, 1, 2],
    'Reminders_Sent': [1, 1, 0, 0, 1],
    'Appointments': [40, 35, 30, 20, 38]
})
X = data[['Doctors_Available', 'Reminders_Sent']]
y = data['Appointments']

# --- Fit --------------------------------------------------------------------
model = LinearRegression()
model.fit(X, y)

# coef_ follows the column order of X.
coef_doctors = model.coef_[0]
coef_reminders = model.coef_[1]
intercept = model.intercept_

# --- Additive breakdown -----------------------------------------------------
baseline = y.mean()
data['Predicted'] = model.predict(X)
# Contribution of a feature = coef * value - coef * (mean value of the feature).
data['SHAP_Doctors'] = (data['Doctors_Available'] * coef_doctors) - (X['Doctors_Available'].mean() * coef_doctors)
data['SHAP_Reminders'] = (data['Reminders_Sent'] * coef_reminders) - (X['Reminders_Sent'].mean() * coef_reminders)
data['SHAP_Total'] = data['SHAP_Doctors'] + data['SHAP_Reminders']
data['Final_Prediction_Check'] = baseline + data['SHAP_Total']

# The check column is only useful if it is actually compared with the model
# output: baseline + contributions must reproduce the prediction.
assert np.allclose(data['Final_Prediction_Check'], data['Predicted']), \
    "baseline + SHAP_Total does not reproduce the model prediction"

# --- Residuals --------------------------------------------------------------
# Error > 0 means the model predicted more appointments than were observed.
data['Error'] = data['Predicted'] - data['Appointments']
data['Prediction_Quality'] = data['Error'].apply(
    lambda err: 'Overprediction' if err > EXACT_TOL
    else 'Underprediction' if err < -EXACT_TOL
    else 'Exact'
)

# --- Report -----------------------------------------------------------------
print("\nMultiple Linear Regression Coefficients:")
print(f"  Coefficient (Doctors_Available): {coef_doctors:.2f}")
print(f"  Coefficient (Reminders_Sent): {coef_reminders:.2f}")
print(f"  Intercept: {intercept:.2f}")
print(f"  Baseline (mean of Appointments): {baseline:.2f}")

print("\nDetailed Results:")
print(data)

# NOTE: this R² is computed on the same rows the model was fitted on.
r2 = r2_score(y, data['Predicted'])
print(f"\nModel Accuracy (R² Score): {r2:.2f}")

print("\nSummary:")
print("- Doctors available and reminders both contribute positively to appointment bookings.")
print("- SHAP values help explain each feature’s contribution to deviation from the baseline.")
print("- Model errors indicate where predictions may miss due to unobserved variables (e.g., patient behavior).")


Multiple Linear Regression Coefficients:
  Coefficient (Doctors_Available): 3.35
  Coefficient (Reminders_Sent): 13.23
  Intercept: 16.61
  Baseline (mean of Appointments): 32.60

Detailed Results:
   Doctors_Available  Reminders_Sent  Appointments  Predicted  SHAP_Doctors  \
0                  3               1            40  39.903226      2.012903   
1                  2               1            35  36.548387     -1.341935   
2                  4               0            30  30.032258      5.367742   
3                  1               0            20  19.967742     -4.696774   
4                  2               1            38  36.548387     -1.341935   

   SHAP_Reminders  SHAP_Total  Final_Prediction_Check     Error  \
0        5.290323    7.303226               39.903226 -0.096774   
1        5.290323    3.948387               36.548387  1.548387   
2       -7.935484   -2.567742               30.032258  0.032258   
3       -7.935484  -12.632258               19.967742 -0.0

In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np

# Cell purpose: two-feature linear regression of appointments on doctors
# available and reminders sent, with each prediction decomposed into the mean
# of the target plus one additive contribution per feature.

# Absolute tolerance below which a residual counts as 'Exact'. An exact
# float == 0 comparison would almost never match for a fitted model.
EXACT_TOL = 1e-9

# --- Data -------------------------------------------------------------------
data = pd.DataFrame({
    'Doctors_Available': [3, 2, 4, 1, 2],
    'Reminders_Sent': [1, 1, 0, 0, 1],
    'Appointments': [40, 35, 30, 20, 38]
})
X = data[['Doctors_Available', 'Reminders_Sent']]
y = data['Appointments']

# --- Fit --------------------------------------------------------------------
model = LinearRegression()
model.fit(X, y)

# coef_ follows the column order of X.
coef_doctors = model.coef_[0]
coef_reminders = model.coef_[1]
intercept = model.intercept_

# --- Additive breakdown -----------------------------------------------------
baseline = y.mean()
data['Predicted'] = model.predict(X)
# Contribution of a feature = coef * value - coef * (mean value of the feature).
data['SHAP_Doctors'] = (data['Doctors_Available'] * coef_doctors) - (X['Doctors_Available'].mean() * coef_doctors)
data['SHAP_Reminders'] = (data['Reminders_Sent'] * coef_reminders) - (X['Reminders_Sent'].mean() * coef_reminders)

data['SHAP_Total'] = data['SHAP_Doctors'] + data['SHAP_Reminders']

data['Final_Prediction_Check'] = baseline + data['SHAP_Total']

# Verify the decomposition rather than only displaying it: the baseline plus
# the summed contributions has to equal the model's prediction for every row.
assert np.allclose(data['Final_Prediction_Check'], data['Predicted']), \
    "baseline + SHAP_Total does not reproduce the model prediction"

# --- Residuals --------------------------------------------------------------
# Error > 0 means the model predicted more appointments than were observed.
data['Error'] = data['Predicted'] - data['Appointments']
data['Prediction_Quality'] = data['Error'].apply(
    lambda err: 'Overprediction' if err > EXACT_TOL
    else 'Underprediction' if err < -EXACT_TOL
    else 'Exact'
)

# --- Report -----------------------------------------------------------------
print("\nMultiple Linear Regression Coefficients:")
print(f"  Coefficient (Doctors_Available): {coef_doctors:.2f}")
print(f"  Coefficient (Reminders_Sent): {coef_reminders:.2f}")
print(f"  Intercept: {intercept:.2f}")
print(f"  Baseline (mean of Appointments): {baseline:.2f}")

print("\nDetailed Results:")
print(data)

# NOTE: this R² is computed on the same rows the model was fitted on.
r2 = r2_score(y, data['Predicted'])
print(f"\nModel Accuracy (R² Score): {r2:.2f}")

print("\nSummary:")
print("- Doctors available and reminders both contribute positively to appointment bookings.")
print("- SHAP values help explain each feature’s contribution to deviation from the baseline.")
print("- Model errors indicate where predictions may miss due to unobserved variables (e.g., patient behavior).")


Multiple Linear Regression Coefficients:
  Coefficient (Doctors_Available): 3.35
  Coefficient (Reminders_Sent): 13.23
  Intercept: 16.61
  Baseline (mean of Appointments): 32.60

Detailed Results:
   Doctors_Available  Reminders_Sent  Appointments  Predicted  SHAP_Doctors  \
0                  3               1            40  39.903226      2.012903   
1                  2               1            35  36.548387     -1.341935   
2                  4               0            30  30.032258      5.367742   
3                  1               0            20  19.967742     -4.696774   
4                  2               1            38  36.548387     -1.341935   

   SHAP_Reminders  SHAP_Total  Final_Prediction_Check     Error  \
0        5.290323    7.303226               39.903226 -0.096774   
1        5.290323    3.948387               36.548387  1.548387   
2       -7.935484   -2.567742               30.032258  0.032258   
3       -7.935484  -12.632258               19.967742 -0.0

In [4]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Cell purpose: fit a linear regression on the scikit-learn diabetes dataset
# and decompose each test prediction into the training-set mean of the target
# plus one additive contribution per feature.

# Residuals within this absolute tolerance are labelled 'Exact'. Comparing a
# floating-point residual with 0 exactly would almost never match.
EXACT_TOL = 1e-9

# --- Data -------------------------------------------------------------------
diabetes = load_diabetes()
X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.Series(diabetes.target, name='Disease_Progression')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# --- Fit --------------------------------------------------------------------
model = LinearRegression()
model.fit(X_train, y_train)
coefs = pd.Series(model.coef_, index=X.columns)
intercept = model.intercept_

# --- Additive breakdown -----------------------------------------------------
# Baseline and feature means both come from the training split, so the
# contributions of a test row are measured against the training average.
baseline = y_train.mean()

predictions = model.predict(X_test)
feature_means = X_train.mean()
# Column-aligned arithmetic: one contribution per (test row, feature).
shap_values = (X_test - feature_means) * coefs
shap_total = shap_values.sum(axis=1)
final_prediction_check = baseline + shap_total

# The check value is only meaningful if it is compared with the model output:
# baseline + summed contributions must reproduce every test prediction.
assert np.allclose(final_prediction_check, predictions), \
    "baseline + SHAP_Sum does not reproduce the model prediction"

# --- Results table ----------------------------------------------------------
results = X_test.copy()
results['Actual'] = y_test.values
results['Predicted'] = predictions
results['Baseline'] = baseline
results['SHAP_Sum'] = shap_total
results['Prediction_Check'] = final_prediction_check
# Error > 0 means the model predicted a higher value than was observed.
results['Error'] = results['Predicted'] - results['Actual']
results['Prediction_Quality'] = results['Error'].apply(
    lambda err: 'Overprediction' if err > EXACT_TOL
    else 'Underprediction' if err < -EXACT_TOL
    else 'Exact'
)

# --- Report -----------------------------------------------------------------
print("\nModel Coefficients:")
print(coefs)

print(f"\nBaseline (mean of training target): {baseline:.2f}")

print("\nSample Predictions with SHAP breakdown:")
print(results[['Actual', 'Predicted', 'Baseline', 'SHAP_Sum', 'Prediction_Check', 'Error', 'Prediction_Quality']].head())
print("\nSHAP values for the first test patient:")
print(shap_values.iloc[0])


Model Coefficients:
age     47.749681
sex   -241.990907
bmi    531.971063
bp     381.562862
s1    -918.502905
s2     508.257783
s3     116.950164
s4     269.492303
s5     695.808117
s6      26.324582
dtype: float64

Baseline (mean of training target): 154.34

Sample Predictions with SHAP breakdown:
     Actual   Predicted    Baseline    SHAP_Sum  Prediction_Check       Error  \
287   219.0  137.949089  154.344411  -16.395322        137.949089  -81.050911   
211    70.0  182.533354  154.344411   28.188943        182.533354  112.533354   
72    202.0  129.852954  154.344411  -24.491457        129.852954  -72.147046   
321   230.0  292.563092  154.344411  138.218681        292.563092   62.563092   
73    111.0  124.867882  154.344411  -29.476529        124.867882   13.867882   

    Prediction_Quality  
287    Underprediction  
211     Overprediction  
72     Underprediction  
321     Overprediction  
73      Overprediction  

SHAP values for the first test patient:
age      2.116540
sex

In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Cell purpose: fit a linear regression of the final grade (G3) on four student
# attributes and decompose each test prediction into the training-set mean of
# G3 plus one additive contribution per feature.

# Residuals within this absolute tolerance are labelled 'Exact'. Comparing a
# floating-point residual with 0 exactly would almost never match.
EXACT_TOL = 1e-9

# --- Data -------------------------------------------------------------------
data = pd.DataFrame({
    'study_time': [1, 2, 3, 4, 2, 3, 1, 4, 2, 3],
    'failures':   [3, 2, 1, 0, 1, 0, 3, 0, 1, 0],
    'absences':   [10, 4, 2, 0, 3, 1, 12, 0, 5, 2],
    'health':     [2, 4, 5, 5, 3, 4, 1, 5, 2, 3],
    'G3':         [8, 12, 15, 19, 14, 17, 6, 20, 13, 16]
})
X = data.drop(columns='G3')
y = data['G3']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# --- Fit --------------------------------------------------------------------
model = LinearRegression()
model.fit(X_train, y_train)
coefs = pd.Series(model.coef_, index=X.columns)
intercept = model.intercept_

# --- Additive breakdown -----------------------------------------------------
# Baseline and feature means both come from the training split, so the
# contributions of a test row are measured against the training average.
baseline = y_train.mean()
y_pred = model.predict(X_test)
feature_means = X_train.mean()
# Column-aligned arithmetic: one contribution per (test row, feature).
shap_values = (X_test - feature_means) * coefs
shap_total = shap_values.sum(axis=1)
final_prediction_check = baseline + shap_total

# The check value is only meaningful if it is compared with the model output:
# baseline + summed contributions must reproduce every test prediction.
assert np.allclose(final_prediction_check, y_pred), \
    "baseline + SHAP_Sum does not reproduce the model prediction"

# --- Results table ----------------------------------------------------------
results = X_test.copy()
results['Actual'] = y_test.values
results['Predicted'] = y_pred
results['Baseline'] = baseline
results['SHAP_Sum'] = shap_total
results['Prediction_Check'] = final_prediction_check
# Error > 0 means the model predicted a higher grade than was observed.
results['Error'] = results['Predicted'] - results['Actual']
results['Prediction_Quality'] = results['Error'].apply(
    lambda err: 'Overprediction' if err > EXACT_TOL
    else 'Underprediction' if err < -EXACT_TOL
    else 'Exact'
)

# --- Report -----------------------------------------------------------------
print("\nModel Coefficients:")
print(coefs)

print(f"\nBaseline (mean G3 score): {baseline:.2f}")
print("\nSample Predictions:")
print(results[['Actual', 'Predicted', 'Baseline', 'SHAP_Sum', 'Prediction_Check', 'Error', 'Prediction_Quality']])

print("\nSHAP breakdown for first test student:")
print(shap_values.iloc[0].sort_values(ascending=False))


Model Coefficients:
study_time    2.205128
failures     -0.948718
absences     -0.256410
health       -0.153846
dtype: float64

Baseline (mean G3 score): 14.71

Sample Predictions:
   Actual  Predicted   Baseline  SHAP_Sum  Prediction_Check     Error  \
2      15  15.769231  14.714286  1.054945         15.769231  0.769231   
9      16  17.025641  14.714286  2.311355         17.025641  1.025641   
6       6   7.512821  14.714286 -7.201465          7.512821  1.512821   

  Prediction_Quality  
2     Overprediction  
9     Overprediction  
6     Overprediction  

SHAP breakdown for first test student:
study_time    0.945055
absences      0.329670
failures     -0.000000
health       -0.219780
Name: 2, dtype: float64
