<a href="https://colab.research.google.com/github/2303a52176yeruva/explainable-ai-lab-assignment/blob/main/lab_1_assignment_2176.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Dataset
data = {
    'Webinars_Attended': [0, 1, 2, 0, 1],
    'Sign_ups': [5, 15, 25, 8, 18]
}
df = pd.DataFrame(data)

# 1. Perform Linear Regression Analysis
X = df[['Webinars_Attended']]
y = df['Sign_ups']
model = LinearRegression()
model.fit(X, y)

# Coefficients
intercept = model.intercept_
slope = model.coef_[0]

# 2. Calculate Baseline Value (mean of y)
baseline = y.mean()

# 3. Calculate predicted sign-ups and SHAP values (Predicted - Baseline)
df['Predicted'] = model.predict(X)
df['SHAP'] = df['Predicted'] - baseline

# 4. Confirm Final Prediction = Baseline + SHAP
df['Check'] = baseline + df['SHAP']

# 5. Compare predicted vs actual sign-ups and classify over/under prediction
df['Difference'] = df['Predicted'] - df['Sign_ups']
df['Prediction_Type'] = df['Difference'].apply(
    lambda x: 'Over Prediction' if x > 0 else ('Under Prediction' if x < 0 else 'Exact'))

# Print model summary
print(f"Linear Regression Model: Sign-ups = {intercept:.2f} + {slope:.2f} * Webinars_Attended")
print(f"Baseline (mean Sign-ups): {baseline:.2f}\n")
print(df[['Webinars_Attended', 'Sign_ups', 'Predicted', 'SHAP', 'Difference', 'Prediction_Type']])

# Interpretation Summary
print("\nInterpretation:")
print(f"- The baseline sign-ups value (average) is {baseline:.2f}.")
print(f"- The slope coefficient {slope:.2f} indicates that each additional webinar attended is associated with an increase of about {slope:.2f} sign-ups.")
print("- SHAP values show how much the webinar attendance changes prediction from the baseline.")
print("- Comparing predicted and actual values tells us where the model over or under predicts.")

print("\nSummary Analysis:")
print("- The model captures a positive correlation between webinars attended and sign-ups.")
print("- Most predictions slightly under-predict actual sign-ups, except for some over-predictions at zero webinars.")
print("- The model provides reasonable estimates but could improve with more data for better accuracy.")


Linear Regression Model: Sign-ups = 6.71 + 9.36 * Webinars_Attended
Baseline (mean Sign-ups): 14.20

   Webinars_Attended  Sign_ups  Predicted       SHAP  Difference  \
0                  0         5   6.714286  -7.485714    1.714286   
1                  1        15  16.071429   1.871429    1.071429   
2                  2        25  25.428571  11.228571    0.428571   
3                  0         8   6.714286  -7.485714   -1.285714   
4                  1        18  16.071429   1.871429   -1.928571   

    Prediction_Type  
0   Over Prediction  
1   Over Prediction  
2   Over Prediction  
3  Under Prediction  
4  Under Prediction  

Interpretation:
- The baseline sign-ups value (average) is 14.20.
- The slope coefficient 9.36 indicates that each additional webinar attended is associated with an increase of about 9.36 sign-ups.
- SHAP values show how much the webinar attendance changes prediction from the baseline.
- Comparing predicted and actual values tells us where the model over 

In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Dataset
data = {
    'Webinars': [3, 2, 1, 4, 2],
    'Blogs': [5, 3, 4, 2, 1],
    'Signups': [60, 45, 40, 55, 35]
}
df = pd.DataFrame(data)

# 1. Multiple Linear Regression
X = df[['Webinars', 'Blogs']]
y = df['Signups']
model = LinearRegression()
model.fit(X, y)

intercept = model.intercept_
coefficients = model.coef_

# 2. Baseline (mean of sign-ups)
baseline = y.mean()

# 3. Predictions and SHAP values (feature contribution)
df['Predicted'] = model.predict(X)
df['SHAP_Webinars'] = coefficients[0] * df['Webinars']
df['SHAP_Blogs'] = coefficients[1] * df['Blogs']

# Verify prediction reconstruction
df['Prediction_Check'] = intercept + df['SHAP_Webinars'] + df['SHAP_Blogs']

# 4. Compare predicted vs actual and classify prediction type
df['Difference'] = df['Predicted'] - df['Signups']
df['Prediction_Type'] = df['Difference'].apply(
    lambda x: 'Over Prediction' if x > 0 else ('Under Prediction' if x < 0 else 'Exact'))

# Results
print(f"Multiple Linear Regression Model:")
print(f"Sign-ups = {intercept:.2f} + {coefficients[0]:.2f} * Webinars + {coefficients[1]:.2f} * Blogs")
print(f"Baseline (mean Sign-ups): {baseline:.2f}\n")

print(df[['Webinars', 'Blogs', 'Signups', 'Predicted', 'SHAP_Webinars', 'SHAP_Blogs', 'Prediction_Check', 'Difference', 'Prediction_Type']])

print("\nInterpretation per record:")
for i, row in df.iterrows():
    print(f"Record {i+1}: Webinars SHAP={row['SHAP_Webinars']:.2f}, Blogs SHAP={row['SHAP_Blogs']:.2f}")
    print(f"  Predicted Signups: {row['Predicted']:.2f} vs Actual: {row['Signups']} -> {row['Prediction_Type']}")

print("\nSummary:")
print(f"- Intercept (baseline): {intercept:.2f}.")
print(f"- Webinars increase sign-ups by {coefficients[0]:.2f} per unit.")
print(f"- Blogs increase sign-ups by {coefficients[1]:.2f} per unit.")
print("- SHAP values show the additive contribution of each feature to the prediction relative to baseline.")
print("- Some over- and under-predictions occur, possibly due to data variability or linear model limitations.")


Multiple Linear Regression Model:
Sign-ups = 15.59 + 7.75 * Webinars + 4.27 * Blogs
Baseline (mean Sign-ups): 47.00

   Webinars  Blogs  Signups  Predicted  SHAP_Webinars  SHAP_Blogs  \
0         3      5       60  60.196078      23.235294   21.372549   
1         2      3       45  43.901961      15.490196   12.823529   
2         1      4       40  40.431373       7.745098   17.098039   
3         4      2       55  55.117647      30.980392    8.549020   
4         2      1       35  35.352941      15.490196    4.274510   

   Prediction_Check  Difference   Prediction_Type  
0         60.196078    0.196078   Over Prediction  
1         43.901961   -1.098039  Under Prediction  
2         40.431373    0.431373   Over Prediction  
3         55.117647    0.117647   Over Prediction  
4         35.352941    0.352941   Over Prediction  

Interpretation per record:
Record 1: Webinars SHAP=23.24, Blogs SHAP=21.37
  Predicted Signups: 60.20 vs Actual: 60 -> Over Prediction
Record 2: Webinars S

In [4]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

df = pd.read_csv("diabetes.csv")

target_column = 'Outcome'
X = df.drop(columns=[target_column])
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

intercept = model.intercept_
coefficients = model.coef_
feature_names = X.columns
baseline = y_train.mean()

df_test = X_test.copy()
df_test['Actual_Outcome'] = y_test
df_test['Predicted_Outcome'] = model.predict(X_test)

# Calculate SHAP values relative to baseline
# SHAP(feature_i) = coefficient_i * (feature_i_value - mean(feature_i in training set))
# so SHAP sum + baseline = prediction
feature_means = X_train.mean()

for i, feature in enumerate(feature_names):
    df_test[f'SHAP_{feature}'] = coefficients[i] * (df_test[feature] - feature_means[feature])

shap_columns = [f'SHAP_{f}' for f in feature_names]
df_test['SHAP_Sum'] = df_test[shap_columns].sum(axis=1)

# Verify Prediction = Baseline + SHAP sum
df_test['Check_Prediction'] = baseline + df_test['SHAP_Sum']

df_test['Difference'] = df_test['Predicted_Outcome'] - df_test['Actual_Outcome']
df_test['Prediction_Type'] = df_test['Difference'].apply(
    lambda x: 'Over Prediction' if x > 0 else ('Under Prediction' if x < 0 else 'Exact')
)

print(f"Multiple Linear Regression Model:\nOutcome = {intercept:.2f} + sum(coefficients × features)")
for feat, coef in zip(feature_names, coefficients):
    print(f"  {feat}: {coef:.4f}")

print(f"\nBaseline (mean Outcome in training set): {baseline:.2f}\n")

output_cols = ['Actual_Outcome', 'Predicted_Outcome', 'Difference', 'Prediction_Type'] + shap_columns
print(df_test[output_cols].head())

print("\nInterpretation for first 3 test records:")
for i in range(3):
    print(f"\nRecord {i+1}:")
    print(f"  Actual: {df_test.iloc[i]['Actual_Outcome']:.2f}")
    print(f"  Predicted: {df_test.iloc[i]['Predicted_Outcome']:.2f} ({df_test.iloc[i]['Prediction_Type']})")
    print(f"  Baseline: {baseline:.2f}")
    print("  Feature contributions (SHAP values):")
    for feat in feature_names:
        print(f"    {feat}: {df_test.iloc[i][f'SHAP_{feat}']:.4f}")
    print(f"  Sum of SHAPs: {df_test.iloc[i]['SHAP_Sum']:.4f}")
    print(f"  Prediction Check (Baseline + SHAP sum): {df_test.iloc[i]['Check_Prediction']:.4f}")

print("\nSummary:")
print("- Baseline = average Outcome value in training set, serving as the baseline prediction.")
print("- SHAP values show how each feature pushes the prediction above or below this baseline for each patient.")
print("- The sum of SHAP values plus baseline matches the predicted value exactly.")
print("- Positive SHAP values increase predicted outcome; negative values decrease it.")
print("- Over- and under-predictions are identified and can be analyzed via SHAP contributions.")


Multiple Linear Regression Model:
Outcome = -0.95 + sum(coefficients × features)
  Pregnancies: 0.0105
  Glucose: 0.0056
  BloodPressure: -0.0023
  SkinThickness: 0.0005
  Insulin: -0.0003
  BMI: 0.0150
  DiabetesPedigreeFunction: 0.1113
  Age: 0.0065

Baseline (mean Outcome in training set): 0.35

     Actual_Outcome  Predicted_Outcome  Difference  Prediction_Type  \
668               0           0.335500    0.335500  Over Prediction   
324               0           0.238099    0.238099  Over Prediction   
624               0           0.151052    0.151052  Over Prediction   
690               0           0.240136    0.240136  Over Prediction   
473               0           0.481424    0.481424  Over Prediction   

     SHAP_Pregnancies  SHAP_Glucose  SHAP_BloodPressure  SHAP_SkinThickness  \
668          0.023630     -0.128755            0.026039            0.006705   
324         -0.018243     -0.049885           -0.012739            0.006173   
624         -0.018243     -0.072419 

In [5]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Q4.1 Load and prepare data
df = pd.read_csv("StudentsPerformance.csv")
df['Final_Score'] = df[['math score','reading score','writing score']].mean(axis=1)
X = pd.get_dummies(df.drop(columns=['math score','reading score','writing score','Final_Score']), drop_first=True)
y = df['Final_Score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Q4.1 Fit Multiple Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)
coefficients = model.coef_
feature_names = X.columns

# Q4.2 Baseline value
baseline = y_train.mean()

# Q4.3 SHAP values (relative to baseline)
feature_means = X_train.mean()
df_test = X_test.copy()
df_test['Actual_Final_Score'] = y_test
df_test['Predicted_Final_Score'] = model.predict(X_test)
for i, feature in enumerate(feature_names):
    df_test[f"SHAP_{feature}"] = coefficients[i] * (df_test[feature] - feature_means[feature])
shap_cols = [f"SHAP_{f}" for f in feature_names]
df_test['SHAP_Sum'] = df_test[shap_cols].sum(axis=1)

# Q4.4 Verify prediction decomposition
df_test['Check_Prediction'] = baseline + df_test['SHAP_Sum']

# Q4.5 Compare predictions and classify
df_test['Difference'] = df_test['Predicted_Final_Score'] - df_test['Actual_Final_Score']
df_test['Prediction_Type'] = df_test['Difference'].apply(
    lambda x: 'Over Prediction' if x > 0 else ('Under Prediction' if x < 0 else 'Exact')
)

# Outputs
print(f"Baseline (Mean Final Score): {baseline:.2f}")
print("Coefficients:")
for f, c in zip(feature_names, coefficients):
    print(f"{f}: {c:.4f}")
print(df_test[['Actual_Final_Score','Predicted_Final_Score','Difference','Prediction_Type'] + shap_cols].head())

# Q4.5 Interpretation for first 3 students
for i in range(3):
    print(f"\nRecord {i+1}:")
    print(f"  Actual: {df_test.iloc[i]['Actual_Final_Score']:.2f}")
    print(f"  Predicted: {df_test.iloc[i]['Predicted_Final_Score']:.2f} ({df_test.iloc[i]['Prediction_Type']})")
    print(f"  Baseline: {baseline:.2f}")
    for feat in feature_names:
        print(f"    {feat}: {df_test.iloc[i][f'SHAP_{feat}']:.4f}")
    print(f"  Sum of SHAPs: {df_test.iloc[i]['SHAP_Sum']:.4f}")
    print(f"  Prediction Check: {df_test.iloc[i]['Check_Prediction']:.4f}")


Baseline (Mean Final Score): 68.17
Coefficients:
gender_male: -4.0919
race/ethnicity_group B: -0.1398
race/ethnicity_group C: 0.9179
race/ethnicity_group D: 3.7809
race/ethnicity_group E: 5.9602
parental level of education_bachelor's degree: 3.5021
parental level of education_high school: -4.6570
parental level of education_master's degree: 1.9284
parental level of education_some college: -0.8541
parental level of education_some high school: -3.2730
lunch_standard: 9.2077
test preparation course_none: -7.8777
     Actual_Final_Score  Predicted_Final_Score  Difference   Prediction_Type  \
521           87.000000              70.522773  -16.477227  Under Prediction   
737           64.000000              67.280948    3.280948   Over Prediction   
740           75.000000              72.795942   -2.204058  Under Prediction   
660           74.666667              56.369159  -18.297507  Under Prediction   
411           81.666667              78.496790   -3.169877  Under Prediction   

    