# 1. Skill Boost – Coding Bootcamp

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import shap
df = pd.read_excel(r"/content/webinars.xlsx")
print(df.columns)
x=df[['Webinars Attended (x)']]
y=df['Sign-ups (y)']
model = LinearRegression()
model.fit(x, y)
slope = model.coef_[0]
intercept = model.intercept_
print(f"Slope: {slope:.2f}")
print(f"Intercept: {intercept:.2f}")
print(f"Equation of the linear model: y = {slope:.2f}x + {intercept:.2f}")
pred = model.predict(x)
baseline = np.mean(y)
print("Baseline :", baseline)

Index(['Webinars Attended (x)', 'Sign-ups (y)'], dtype='object')
Slope: 9.36
Intercept: 6.71
Equation of the linear model: y = 9.36x + 6.71
Baseline : 14.2


In [2]:
shap_values = pred - baseline
print("SHAP Value :", shap_values)

SHAP Value : [-7.48571429  1.87142857 11.22857143 -7.48571429  1.87142857]


In [3]:
check = pred == baseline + shap_values
print("Final Prediction:", baseline + shap_values)
print("Residuals:", y - pred)

Final Prediction: [ 6.71428571 16.07142857 25.42857143  6.71428571 16.07142857]
Residuals: 0   -1.714286
1   -1.071429
2   -0.428571
3    1.285714
4    1.928571
Name: Sign-ups (y), dtype: float64


In [8]:
Difference = pred - y
Prediction_Type = np.where(Difference > 0, 'Overpredicted', 'Underpredicted')

results = pd.DataFrame({
    'Webinars Attended (x)': x['Webinars Attended (x)'],
    'Actual_SignUps': y,
    'Predicted_SignUps': pred,
    'Baseline': baseline,
    'SHAP_Value': shap_values,
    'Final Prediction': baseline + shap_values,
    'Residuals': y - pred,
    'Difference': Difference,
    'Prediction_Type': Prediction_Type
})
print(results)

   Webinars Attended (x)  Actual_SignUps  Predicted_SignUps  Baseline  \
0                      0               5           6.714286      14.2   
1                      1              15          16.071429      14.2   
2                      2              25          25.428571      14.2   
3                      0               8           6.714286      14.2   
4                      1              18          16.071429      14.2   

   SHAP_Value  Final Prediction  Residuals  Difference Prediction_Type  
0   -7.485714          6.714286  -1.714286    1.714286   Overpredicted  
1    1.871429         16.071429  -1.071429    1.071429   Overpredicted  
2   11.228571         25.428571  -0.428571    0.428571   Overpredicted  
3   -7.485714          6.714286   1.285714   -1.285714  Underpredicted  
4    1.871429         16.071429   1.928571   -1.928571  Underpredicted  


# 2. FinTrack – Investment Signup Prediction using Multiple Linear Regression and SHAP Analysis

In [11]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import shap
data = pd.read_excel("/content/webinars-blogs.xlsx")
print(df.columns)
x=data[['Webinars (x1)','Blogs (x2)']]
y=data['Sign-ups (y)']
model = LinearRegression()
model.fit(x, y)
intercept = model.intercept_
coef_webinars, coef_blogs = model.coef_
print(f"Coefficient Webinars: {coef_webinars:.2f}")
print(f"Coefficient blogs: {coef_blogs:.2f}")
print(f"Intercept: {intercept:.2f}")
print(f"Equation of the linear model: y = {coef_webinars:.2f}x1 + {coef_blogs:.2f}x2 + {intercept:.2f}")
pred = model.predict(x)
baseline = np.mean(y)
print("Baseline :", baseline)

Index(['Webinars Attended (x)', 'Sign-ups (y)'], dtype='object')
Coefficient Webinars: 7.75
Coefficient blogs: 4.27
Intercept: 15.59
Equation of the linear model: y = 7.75x1 + 4.27x2 + 15.59
Baseline : 47.0


In [13]:
SHAP_Webinars = coef_webinars * (data['Webinars (x1)'] - x['Webinars (x1)'].mean())
SHAP_Blogs = coef_blogs * (data['Blogs (x2)'] - x['Blogs (x2)'].mean())
print(f"SHAP_Webinars: {SHAP_Webinars}")
print(f"SHAP_Blogs: {SHAP_Blogs}")

SHAP_Webinars: 0     4.647059
1    -3.098039
2   -10.843137
3    12.392157
4    -3.098039
Name: Webinars (x1), dtype: float64
SHAP_Blogs: 0    8.54902
1    0.00000
2    4.27451
3   -4.27451
4   -8.54902
Name: Blogs (x2), dtype: float64


In [15]:
Final_Prediction = baseline + SHAP_Webinars + SHAP_Blogs
print(f"Final_Prediction: {Final_Prediction}")

Final_Prediction: 0    60.196078
1    43.901961
2    40.431373
3    55.117647
4    35.352941
dtype: float64


In [16]:
Residuals = y - pred
print(f"Residuals: {Residuals}")

Residuals: 0   -0.196078
1    1.098039
2   -0.431373
3   -0.117647
4   -0.352941
Name: Sign-ups (y), dtype: float64


In [20]:
Difference = pred - y
Prediction_Type = np.where(Difference > 0, 'Overpredicted', 'Underpredicted')

results = pd.DataFrame({
    'Webinars (x1)': data['Webinars (x1)'],
    'Blogs (x2)': data['Blogs (x2)'],
    'Actual_SignUps': y,
    'Predicted_SignUps': pred,
    'baseline': baseline,
    'SHAP_Webinars': SHAP_Webinars,
    'SHAP_Blogs': SHAP_Blogs,
    'Final_Prediction': Final_Prediction,
    'Residuals': Residuals,
    'Difference': Difference,
    'Prediction_Type': Prediction_Type
})
print(results)

   Webinars (x1)  Blogs (x2)  Actual_SignUps  Predicted_SignUps  baseline  \
0              3           5              60          60.196078      47.0   
1              2           3              45          43.901961      47.0   
2              1           4              40          40.431373      47.0   
3              4           2              55          55.117647      47.0   
4              2           1              35          35.352941      47.0   

   SHAP_Webinars  SHAP_Blogs  Final_Prediction  Residuals  Difference  \
0       4.647059     8.54902         60.196078  -0.196078    0.196078   
1      -3.098039     0.00000         43.901961   1.098039   -1.098039   
2     -10.843137     4.27451         40.431373  -0.431373    0.431373   
3      12.392157    -4.27451         55.117647  -0.117647    0.117647   
4      -3.098039    -8.54902         35.352941  -0.352941    0.352941   

  Prediction_Type  
0   Overpredicted  
1  Underpredicted  
2   Overpredicted  
3   Overpredicted 

# 3. Regression with Diabetes Dataset

In [53]:
import pandas as pd
from sklearn.linear_model import LinearRegression
data = pd.DataFrame({
    'age':   [0.03, -0.02, 0.05, -0.08, 0.01, 0.02, -0.04, 0.07],
    'sex':   [0.02, -0.01, 0.03, -0.02, 0.01, 0.00, -0.03, 0.04],
    'bmi':   [0.06, -0.05, 0.04, -0.01, -0.03, 0.05, -0.02, 0.06],
    'bp':    [0.02, -0.03, -0.01, -0.04, 0.02, 0.01, -0.05, 0.03],
    's1':    [0.01, 0.02, -0.03, 0.04, -0.01, 0.02, 0.01, -0.02],
    's2':    [0.02, -0.04, 0.01, 0.03, 0.00, -0.01, 0.02, 0.01],
    's3':    [-0.01, 0.03, -0.02, 0.01, -0.03, 0.04, 0.00, 0.02],
    's4':    [0.05, -0.02, 0.01, 0.03, -0.04, 0.02, -0.01, 0.04],
    'DiseaseProgression': [151, 75, 141, 206, 135, 178, 120, 200]
})

X = data.drop(columns=['DiseaseProgression'])
y = data['DiseaseProgression']

model = LinearRegression()
model.fit(X, y)
intercept = model.intercept_
coefficients = model.coef_
feature_names = X.columns
baseline = y.mean()
print(f"Intercept (β₀): {intercept:.2f}")
print(f"Coefficients (β₁): {coefficients}")
print(f"Feature names: {feature_names}")
print(f"Baseline: {baseline}")


Intercept (β₀): 143.62
Coefficients (β₁): [-2392.21274169  2429.05317105  1323.01827638   943.82389566
 -1168.13424224  1297.33151619  1721.67099805 -1294.83652434]
Feature names: Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4'], dtype='object')
Baseline: 150.75


In [55]:
y_pred = model.predict(X)
X_mean = X.mean()
shap_values = pd.DataFrame(index=X.index, columns=feature_names)

for feature, coef in zip(feature_names, coefficients):
    shap_values[feature] = coef * (X[feature] - X_mean[feature])
    shap_values['Sum_SHAP'] = shap_values.sum(axis=1)
    shap_values['Baseline_plus_SHAP'] = baseline + shap_values['Sum_SHAP']
print(" SHAP values")
print(shap_values)


 SHAP values
          age        sex        bmi         bp         s1         s2  \
0  -59.805319  36.435798  62.843368  24.775377  -5.840671  19.459973   
1   59.805319 -36.435798 -82.688642 -22.415818 -17.522014 -58.379918   
2 -107.649573  60.726329  36.383003  -3.539340  40.884698   6.486658   
3  203.338083 -60.726329 -29.767911 -31.854056 -40.884698  32.433288   
4  -11.961064  12.145266 -56.228277  24.775377  17.522014  -6.486658   
5  -35.883191 -12.145266  49.613185  15.337138 -17.522014 -19.459973   
6  107.649573 -85.016861 -42.998094 -41.292295  -5.840671  19.459973   
7 -155.493828  85.016861  62.843368  34.213616  29.203356   6.486658   

          s3         s4      Sum_SHAP Baseline_plus_SHAP  
0 -25.825065 -51.793461  13168.750035       13319.500035  
1  43.041775  38.845096  23360.465919       23511.215919  
2 -43.041775  -0.000000   2118.813999        2269.563999  
3   8.608355 -25.896730  60035.054073       60185.804073  
4 -60.258485  64.741826  14964.672751      

In [58]:
Difference = y_pred - y
Prediction_Type = np.where(Difference > 0, 'Overpredicted', 'Underpredicted')

results = pd.DataFrame({
    'Actual': y,
    'Predicted': y_pred,
    'Baseline': baseline,
    'Sum_SHAP': shap_values['Sum_SHAP'],
    'Baseline_plus_SHAP': shap_values['Baseline_plus_SHAP'],
    'Difference': Difference,
    'Prediction_Type': Prediction_Type
})
print("Results")
print(results)

Results
   Actual  Predicted  Baseline      Sum_SHAP Baseline_plus_SHAP    Difference  \
0     151      151.0    150.75  13168.750035       13319.500035 -5.684342e-14   
1      75       75.0    150.75  23360.465919       23511.215919  2.557954e-13   
2     141      141.0    150.75   2118.813999        2269.563999  1.136868e-13   
3     206      206.0    150.75  60035.054073       60185.804073  3.126388e-13   
4     135      135.0    150.75  14964.672751       15115.422751 -8.526513e-14   
5     178      178.0    150.75  11822.446541       11973.196541 -4.263256e-13   
6     120      120.0    150.75  31858.490193       32009.240193 -2.415845e-13   
7     200      200.0    150.75   -4166.69351        -4015.94351  2.273737e-13   

  Prediction_Type  
0  Underpredicted  
1   Overpredicted  
2   Overpredicted  
3   Overpredicted  
4  Underpredicted  
5  Underpredicted  
6  Underpredicted  
7   Overpredicted  


# 4. Regression with Student Performance Dataset

In [68]:
import pandas as pd
from sklearn.linear_model import LinearRegression
data = pd.DataFrame({
    'study_time': [2, 1, 3, 4, 2, 3, 1, 4],
    'parent_edu': [3, 2, 4, 4, 3, 5, 2, 4],
    'absences':   [4, 10, 2, 0, 6, 3, 12, 1],
    'failures':   [0, 1, 0, 0, 1, 0, 2, 0],
    'health':     [4, 3, 5, 4, 2, 4, 3, 5],
    'G1':         [12, 10, 14, 15, 11, 14, 9, 15],
    'G2':         [13, 11, 15, 15, 12, 15, 10, 16],
    'FinalScore': [14, 12, 16, 17, 13, 16, 11, 18]
})

X = data.drop(columns=['FinalScore'])
y = data['FinalScore']
model = LinearRegression()
model.fit(X, y)

intercept = model.intercept_
coefficients = model.coef_
feature_names = X.columns.tolist()
baseline = y.mean()
print(f"Intercept (β₀): {intercept:.2f}")
print(f"Coefficients (β₁): {coefficients}")
print(f"Feature names: {feature_names}")
print(f"Baseline: {baseline}")


Intercept (β₀): 3.50
Coefficients (β₁): [ 1.7500000e+00 -2.5000000e-01  2.5000000e-01 -1.0000000e+00
  2.8449465e-16 -2.5000000e-01  7.5000000e-01]
Feature names: ['study_time', 'parent_edu', 'absences', 'failures', 'health', 'G1', 'G2']
Baseline: 14.625


In [62]:
y_pred = model.predict(X)
X_mean = X.mean()
shap_values = pd.DataFrame(index=X.index, columns=feature_names)

for feature, coef in zip(feature_names, coefficients):
    shap_values[feature] = coef * (X[feature] - X_mean[feature])

shap_values['Sum_SHAP'] = shap_values.sum(axis=1)
shap_values['Baseline_plus_SHAP'] = baseline + shap_values['Sum_SHAP']
print("\nSHAP values:")
print(shap_values)


SHAP values:
   study_time  parent_edu  absences  failures        health     G1       G2  \
0      -0.875     0.09375   -0.1875       0.5  7.112366e-17  0.125 -0.28125   
1      -2.625     0.34375    1.3125      -0.5 -2.133710e-16  0.625 -1.78125   
2       0.875    -0.15625   -0.6875       0.5  3.556183e-16 -0.375  1.21875   
3       2.625    -0.15625   -1.1875       0.5  7.112366e-17 -0.625  1.21875   
4      -0.875     0.09375    0.3125      -0.5 -4.978656e-16  0.375 -1.03125   
5       0.875    -0.40625   -0.4375       0.5  7.112366e-17 -0.375  1.21875   
6      -2.625     0.34375    1.8125      -1.5 -2.133710e-16  0.875 -2.53125   
7       2.625    -0.15625   -0.9375       0.5  3.556183e-16 -0.625  1.96875   

   Sum_SHAP  Baseline_plus_SHAP  
0    -0.625                14.0  
1    -2.625                12.0  
2     1.375                16.0  
3     2.375                17.0  
4    -1.625                13.0  
5     1.375                16.0  
6    -3.625                11.0  
7 

In [65]:
Difference = y_pred - y
Prediction_Type = np.where(Difference > 0, 'Overpredicted', 'Underpredicted')

results = pd.DataFrame({
    'Actual': y,
    'Predicted': y_pred,
    'Baseline': baseline,
    'Sum_SHAP': shap_values['Sum_SHAP'],
    'Baseline_plus_SHAP': shap_values['Baseline_plus_SHAP'],
    'Difference': Difference,
    'Prediction_Type': Prediction_Type
})
print("\nResults:")
print(results)



Results:
   Actual  Predicted  Baseline  Sum_SHAP  Baseline_plus_SHAP    Difference  \
0      14       14.0    14.625    -0.625                14.0  1.776357e-15   
1      12       12.0    14.625    -2.625                12.0  0.000000e+00   
2      16       16.0    14.625     1.375                16.0  0.000000e+00   
3      17       17.0    14.625     2.375                17.0  0.000000e+00   
4      13       13.0    14.625    -1.625                13.0  1.776357e-15   
5      16       16.0    14.625     1.375                16.0  0.000000e+00   
6      11       11.0    14.625    -3.625                11.0 -1.776357e-15   
7      18       18.0    14.625     3.375                18.0  0.000000e+00   

  Prediction_Type  
0   Overpredicted  
1  Underpredicted  
2  Underpredicted  
3  Underpredicted  
4   Overpredicted  
5  Underpredicted  
6  Underpredicted  
7  Underpredicted  
