<a href="https://colab.research.google.com/github/2303A52381/Explainable--AI-Lab/blob/main/XAI_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Question 1

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Question 1: one-feature linear regression of cars washed on flyers handed
# out, with a per-record SHAP-style explanation of each prediction.

# Training data (the results table labels the flyer counts as hundreds).
flyers = np.array([1, 2, 3, 1, 2]).reshape(-1, 1)
cars_washed = np.array([12, 22, 29, 14, 24])

model = LinearRegression()
model.fit(flyers, cars_washed)

coef = model.coef_[0]
intercept = model.intercept_

# Baseline is the mean target; for a least-squares fit with an intercept this
# is also the mean prediction over the training rows.
baseline = np.mean(cars_washed)

predictions = model.predict(flyers)

# SHAP value of a linear model's feature: coefficient * (value - feature mean).
# Deriving it from the coefficient (instead of `predictions - baseline`) keeps
# the additivity check below from being true by construction.
flyer_counts = flyers.flatten()
shap_values = coef * (flyer_counts - flyer_counts.mean())

# Additivity check: baseline + SHAP must reproduce the model's prediction.
verification = np.isclose(predictions, baseline + shap_values)

comparison = ["Over" if p > a else "Under" if p < a else "Exact"
              for p, a in zip(predictions, cars_washed)]

df_results = pd.DataFrame({
    "Flyers (100s)": flyer_counts,
    "Actual Cars": cars_washed,
    "Predicted Cars": predictions.round(2),
    "Baseline": baseline,
    "SHAP Value": shap_values.round(2),
    "Verification": verification,
    "Prediction Type": comparison
})

print("\n Linear Regression Implementation with Coefficients ")
print(f"Coefficient: {coef:.2f}, Intercept: {intercept:.2f}")

print("\n Baseline (Mean of y) ")
print(f"Baseline Value: {baseline:.2f}")

print("\n Table of SHAP values and Predictions ")
print(df_results)

print("\n Explanation of Input Influence ")
for i, (x, shap, pred, actual) in enumerate(zip(flyer_counts, shap_values, predictions, cars_washed)):
    # A SHAP value of exactly zero is neither an increase nor a decrease.
    if shap > 0:
        influence = "increased"
    elif shap < 0:
        influence = "decreased"
    else:
        influence = "did not change"
    print(f"Record {i+1}: Flyers={x} {influence} prediction by {abs(shap):.2f} cars. "
          f"Predicted={pred:.2f}, Actual={actual}, Prediction Type={comparison[i]}")

print("\n Model Accuracy ")
# Metrics are computed on the training rows; there is no held-out set here.
r2 = r2_score(cars_washed, predictions)
mae = mean_absolute_error(cars_washed, predictions)
rmse = np.sqrt(mean_squared_error(cars_washed, predictions))
print(f"R² Score: {r2:.4f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

print("\n Trend Analysis ")
if coef > 0:
    print("As the number of flyers increases, the number of cars washed also increases.")
else:
    print("Increasing flyers does not positively impact the number of cars washed.")

print("\n SHAP Interpretation Insights ")
print("SHAP values show how each flyer count influenced predictions compared to the baseline.")
print("Positive SHAP values indicate more flyers than average led to higher predicted washes,")
print("while negative values indicate fewer flyers led to lower predicted washes.")


 Linear Regression Implementation with Coefficients 
Coefficient: 8.29, Intercept: 5.29

 Baseline (Mean of y) 
Baseline Value: 20.20

 Table of SHAP values and Predictions 
   Flyers (100s)  Actual Cars  Predicted Cars  Baseline  SHAP Value  \
0              1           12           13.57      20.2       -6.63   
1              2           22           21.86      20.2        1.66   
2              3           29           30.14      20.2        9.94   
3              1           14           13.57      20.2       -6.63   
4              2           24           21.86      20.2        1.66   

   Verification Prediction Type  
0          True            Over  
1          True           Under  
2          True            Over  
3          True           Under  
4          True           Under  

 Explanation of Input Influence 
Record 1: Flyers=1 decreased prediction by 6.63 cars. Predicted=13.57, Actual=12, Prediction Type=Over
Record 2: Flyers=2 increased prediction by 1.66 cars. Pre

Question 2

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Question 2: two-feature linear regression of charging sessions on the number
# of chargers and a peak-hour flag, with per-feature SHAP-style contributions.

chargers = np.array([5, 3, 4, 2, 5])
peak_hour = np.array([1, 0, 1, 0, 0])
sessions = np.array([80, 40, 70, 30, 60])

X = np.column_stack((chargers, peak_hour))
model = LinearRegression()
model.fit(X, sessions)

coef_chargers, coef_peak = model.coef_
intercept = model.intercept_

# Baseline is the mean target over the training rows.
baseline = np.mean(sessions)

# Per-feature contribution of a linear model: coefficient * (value - mean).
shap_chargers = coef_chargers * (chargers - np.mean(chargers))
shap_peak = coef_peak * (peak_hour - np.mean(peak_hour))
total_shap = shap_chargers + shap_peak

predictions = model.predict(X)
# Additivity check: baseline + summed contributions must equal the prediction.
verification = np.isclose(predictions, baseline + total_shap)

# Predictions are floats produced by a numerical solver, so "Exact" is decided
# with a tolerance; a raw ==/</> comparison could label a perfect fit as
# "Over" or "Under" because of last-bit rounding.
matches_actual = np.isclose(predictions, sessions)
comparison = ["Exact" if exact else "Over" if p > a else "Under"
              for p, a, exact in zip(predictions, sessions, matches_actual)]

df_results = pd.DataFrame({
    "Chargers": chargers,
    "Peak Hour": peak_hour,
    "Actual Sessions": sessions,
    "Predicted Sessions": predictions.round(2),
    "Baseline": baseline,
    "SHAP Chargers": shap_chargers.round(2),
    "SHAP PeakHour": shap_peak.round(2),
    "Total SHAP": total_shap.round(2),
    "Verification": verification,
    "Prediction Type": comparison
})

print("\n=== Multiple Linear Regression Implementation ===")
print(f"Coefficients: Chargers={coef_chargers:.2f}, Peak Hour={coef_peak:.2f}")
print(f"Intercept: {intercept:.2f}")

print("\n=== Baseline (Mean of Sessions) ===")
print(f"Baseline Value: {baseline:.2f}")

print("\n=== Table of SHAP Values and Predictions ===")
print(df_results)

print("\n=== Explanation of Feature Influence ===")
for i, (c, p, shap_c, shap_p, pred, actual, pred_type) in enumerate(
    zip(chargers, peak_hour, shap_chargers, shap_peak, predictions, sessions, comparison), 1):
    # A contribution of exactly zero is reported as such rather than "decreased".
    contrib_c = "increased" if shap_c > 0 else "decreased" if shap_c < 0 else "did not change"
    contrib_p = "increased" if shap_p > 0 else "decreased" if shap_p < 0 else "did not change"
    print(f"Record {i}: Chargers={c} ({contrib_c} prediction by {abs(shap_c):.2f}), "
          f"Peak Hour={p} ({contrib_p} prediction by {abs(shap_p):.2f}). "
          f"Predicted={pred:.2f}, Actual={actual}, Prediction Type={pred_type}")

print("\n=== Model Accuracy ===")
# Metrics are computed on the training rows; there is no held-out set here.
r2 = r2_score(sessions, predictions)
mae = mean_absolute_error(sessions, predictions)
rmse = np.sqrt(mean_squared_error(sessions, predictions))
print(f"R² Score: {r2:.4f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

print("\n=== Trend Analysis ===")
if coef_chargers > 0:
    print("More chargers generally increase the number of charging sessions.")
else:
    print("More chargers do not positively impact charging sessions.")
if coef_peak > 0:
    print("Peak hour usage increases the predicted number of charging sessions.")
else:
    print("Peak hour usage decreases the predicted number of charging sessions.")

print("\n=== SHAP Interpretation Insights ===")
print("SHAP values show how much each feature (Chargers, Peak Hour) contributed to predictions compared to baseline.")
print("Positive SHAP values indicate a feature increased the prediction, negative values indicate a decrease.")


=== Multiple Linear Regression Implementation ===
Coefficients: Chargers=10.00, Peak Hour=20.00
Intercept: 10.00

=== Baseline (Mean of Sessions) ===
Baseline Value: 56.00

=== Table of SHAP Values and Predictions ===
   Chargers  Peak Hour  Actual Sessions  Predicted Sessions  Baseline  \
0         5          1               80                80.0      56.0   
1         3          0               40                40.0      56.0   
2         4          1               70                70.0      56.0   
3         2          0               30                30.0      56.0   
4         5          0               60                60.0      56.0   

   SHAP Chargers  SHAP PeakHour  Total SHAP  Verification Prediction Type  
0           12.0           12.0        24.0          True           Exact  
1           -8.0           -8.0       -16.0          True           Exact  
2            2.0           12.0        14.0          True           Exact  
3          -18.0           -8.0       

Question 3

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Question 3: linear regression on the `Outcome` column of diabetes.csv using
# every other column as a feature, with per-feature SHAP-style contributions
# computed for the held-out test rows.

df = pd.read_csv("diabetes.csv")

X = df.drop(columns=["Outcome"])
y = df["Outcome"]

# Fixed random_state keeps the split, and therefore the output, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

coefficients = model.coef_
intercept = model.intercept_
feature_names = X.columns

# Baseline is the mean training target.
baseline = np.mean(y_train)

y_pred = model.predict(X_test)

# Per-feature contribution: coefficient * (test value - training mean).
# Pairing names with coefficients via zip avoids a list search per feature and
# builds the frame with a numeric dtype from the start.
shap_values = pd.DataFrame(
    {
        name: weight * (X_test[name] - X_train[name].mean())
        for name, weight in zip(feature_names, coefficients)
    },
    index=X_test.index,
)

total_shap = shap_values.sum(axis=1)
# Additivity check: baseline + summed contributions must equal the prediction.
verification = np.isclose(y_pred, baseline + total_shap)

comparison = ["Over" if p > a else "Under" if p < a else "Exact"
              for p, a in zip(y_pred, y_test)]

df_results = X_test.copy()
df_results["Actual Outcome"] = y_test
df_results["Predicted Outcome"] = y_pred.round(4)
df_results["Baseline"] = baseline
for feature in feature_names:
    df_results[f"SHAP {feature}"] = shap_values[feature].round(4)
df_results["Total SHAP"] = total_shap.round(4)
df_results["Verification"] = verification
df_results["Prediction Type"] = comparison

print("Multiple Linear Regression Implementation")
print(f"Intercept: {intercept:.4f}")
for name, coef in zip(feature_names, coefficients):
    print(f"{name}: {coef:.4f}")

print("Baseline Value")
print(f"Baseline: {baseline:.4f}")

print("SHAP Values and Predictions Table")
print(df_results.head())

print("Explanation of Feature Influence")
for idx in df_results.index[:5]:
    print(f"Record {idx}:")
    for feature in feature_names:
        # Uses the rounded value shown in the table so the wording matches the
        # printed magnitude; a value that rounds to zero is not a "decrease".
        shap_value = df_results.loc[idx, f"SHAP {feature}"]
        if shap_value > 0:
            contrib = "increased"
        elif shap_value < 0:
            contrib = "decreased"
        else:
            contrib = "did not change"
        print(f"  {feature} {contrib} prediction by {abs(shap_value):.4f}")
    print(f"Predicted={df_results.loc[idx, 'Predicted Outcome']}, "
          f"Actual={df_results.loc[idx, 'Actual Outcome']}, "
          f"Type={df_results.loc[idx, 'Prediction Type']}")

print("Model Accuracy")
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"R² Score: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")

print("SHAP Interpretation Insights")
print("SHAP values represent how much each feature contributed to the prediction relative to the baseline.")
# The target here is the `Outcome` column, so the message names it directly
# instead of describing a "disease progression score".
print("Positive values indicate an increase in the predicted Outcome value, negative values indicate a decrease.")


Multiple Linear Regression Implementation
Intercept: -0.9488
Pregnancies: 0.0105
Glucose: 0.0056
BloodPressure: -0.0023
SkinThickness: 0.0005
Insulin: -0.0003
BMI: 0.0150
DiabetesPedigreeFunction: 0.1113
Age: 0.0065
Baseline Value
Baseline: 0.3469
SHAP Values and Predictions Table
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
668            6       98             58             33      190  34.0   
324            2      112             75             32        0  35.7   
624            2      108             64              0        0  30.8   
690            8      107             80              0        0  24.6   
473            7      136             90              0        0  29.9   

     DiabetesPedigreeFunction  Age  Actual Outcome  Predicted Outcome  ...  \
668                     0.430   43               0             0.3355  ...   
324                     0.148   21               0             0.2381  ...   
624                     0.158   21     

Question 4


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Question 4: linear regression on "Performance Index" from
# Student_Performance.csv, with per-feature SHAP-style contributions computed
# for the held-out test rows.

df = pd.read_csv("Student_Performance.csv")

# Encode the Yes/No column as 1/0. `.map` turns any other label into NaN, which
# would only surface later as an unrelated error inside `fit`, so fail here
# with a message that names the offending values.
activity_codes = df["Extracurricular Activities"].map({"Yes": 1, "No": 0})
if activity_codes.isna().any():
    unexpected = df.loc[activity_codes.isna(), "Extracurricular Activities"].unique()
    raise ValueError(
        f"Unexpected 'Extracurricular Activities' values (expected 'Yes'/'No'): {list(unexpected)}"
    )
df["Extracurricular Activities"] = activity_codes

X = df.drop(columns=["Performance Index"])
y = df["Performance Index"]

# Fixed random_state keeps the split, and therefore the output, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

intercept = model.intercept_
coefficients = pd.Series(model.coef_, index=X.columns)

# Baseline is the mean training target.
baseline = np.mean(y_train)

y_pred = model.predict(X_test)

# Per-feature contribution: coefficient * (test value - training mean).
# Pairing names with coefficients via zip avoids a list search per feature.
shap_df = pd.DataFrame(
    {
        name: weight * (X_test[name] - X_train[name].mean())
        for name, weight in zip(X.columns, model.coef_)
    },
    index=X_test.index,
    dtype=float,
)

total_shap = shap_df.sum(axis=1)
# Additivity check: baseline + summed contributions must equal the prediction.
verification = np.isclose(y_pred, baseline + total_shap)

over_under = ["Over" if p > a else "Under" if p < a else "Exact" for p, a in zip(y_pred, y_test)]

df_results = X_test.copy()
df_results["Actual Score"] = y_test
df_results["Predicted Score"] = np.round(y_pred, 2)
df_results["Baseline"] = baseline
for col in X.columns:
    df_results[f"SHAP {col}"] = np.round(shap_df[col], 4)
df_results["Total SHAP"] = np.round(total_shap, 4)
# This column holds the reconstructed prediction itself, not a pass/fail flag.
df_results["Verification (Baseline + SHAPs)"] = np.round(baseline + total_shap, 4)
df_results["Over/Under"] = over_under

print("Multiple Linear Regression Implementation")
print("Intercept:", float(intercept))
print("Coefficients:")
for name, val in coefficients.items():
    print(f"  {name}: {val}")

print("Baseline (mean of training final scores)")
print("Baseline:", float(baseline))

print("SHAP values and predictions (first 10 rows)")
print(df_results.head(10).to_string())

# Report the additivity check, which was previously computed but never shown.
print("Baseline + SHAPs matches the model prediction for every test row:", bool(verification.all()))

print("Explanation of contributions for each test record (first 10 rows)")
for idx in df_results.index[:10]:
    print(f"Record index {idx}:")
    for col in X.columns:
        val = df_results.loc[idx, f"SHAP {col}"]
        direction = "increased" if val > 0 else "decreased" if val < 0 else "no change"
        print(f"  {col}: {direction} prediction by {abs(val):.4f}")
    print(f"  Predicted Score: {df_results.loc[idx, 'Predicted Score']}, Actual Score: {df_results.loc[idx, 'Actual Score']}, {df_results.loc[idx, 'Over/Under']} prediction")

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Model accuracy on test set")
print("R2 score:", r2)
print("MAE:", mae)
print("RMSE:", rmse)


Multiple Linear Regression Implementation
Intercept: -33.836543807141744
Coefficients:
  Hours Studied: 2.8589522305235064
  Previous Scores: 1.0155197942880816
  Extracurricular Activities: 0.5817128541736178
  Sleep Hours: 0.47967675981703317
  Sample Question Papers Practiced: 0.19039415655051053
Baseline (mean of training final scores)
Baseline: 55.39971428571429
SHAP values and predictions (first 10 rows)
      Hours Studied  Previous Scores  Extracurricular Activities  Sleep Hours  Sample Question Papers Practiced  Actual Score  Predicted Score   Baseline  SHAP Hours Studied  SHAP Previous Scores  SHAP Extracurricular Activities  SHAP Sleep Hours  SHAP Sample Question Papers Practiced  Total SHAP  Verification (Baseline + SHAPs) Over/Under
6252              5               69                           0            8                                 2          51.0            54.75  55.399714              0.0441               -0.6119                          -0.2852            0.69