In [160]:
import pandas as pd
data_path = "../data/penguins.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
0,39.1,18.7,181.0,3750.0,0
1,39.5,17.4,186.0,3800.0,0
2,40.3,18.0,195.0,3250.0,0
3,,,,,0
4,36.7,19.3,193.0,3450.0,0


In [161]:
df.isna().sum()

CulmenLength     2
CulmenDepth      2
FlipperLength    2
BodyMass         2
Species          0
dtype: int64

In [162]:
df = df.dropna()

In [163]:
df.head()

Unnamed: 0,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
0,39.1,18.7,181.0,3750.0,0
1,39.5,17.4,186.0,3800.0,0
2,40.3,18.0,195.0,3250.0,0
4,36.7,19.3,193.0,3450.0,0
5,39.3,20.6,190.0,3650.0,0


In [164]:
len(df)

342

In [165]:
# implement data normalisation

df['FlipperLength'] = df['FlipperLength'] / 10
df['BodyMass'] = df['BodyMass'] / 100
print(df.head())

   CulmenLength  CulmenDepth  FlipperLength  BodyMass  Species
0          39.1         18.7           18.1      37.5        0
1          39.5         17.4           18.6      38.0        0
2          40.3         18.0           19.5      32.5        0
4          36.7         19.3           19.3      34.5        0
5          39.3         20.6           19.0      36.5        0


In [166]:
from sklearn.model_selection import train_test_split

X, y = df[df.columns[:-1]].values, df['Species'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X_train[0], X_train.shape)
print(y_train[0], y_train.shape)

[36.4  17.   19.5  33.25] (239, 4)
0 (239,)


In [167]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, r2_score, recall_score, classification_report

model = RandomForestClassifier().fit(X_train, y_train)

# predictions
y_pred = model.predict(X_test)
# metrics
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        35
           2       1.00      1.00      1.00        18

    accuracy                           1.00       103
   macro avg       1.00      1.00      1.00       103
weighted avg       1.00      1.00      1.00       103



In [168]:
# accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

1.0


In [169]:
# save the model
import joblib
import os

# specify path of the model
model_path = '../models/penguins_model.pkl'
try:
    joblib.dump(model, model_path)
    print("model saved!")
except Exception as e:
    print(f"failed to save model {str(e)}")


model saved!


In [170]:
# load model 
loaded_model = joblib.load(model_path)
print(loaded_model)

RandomForestClassifier()


In [171]:
random_input = list(df.loc[100].values)
new_input = random_input[:-1]
new_input

[35.0, 17.9, 19.2, 37.25]

In [172]:
# inference with new input
penguin_classes = ['Adelie', 'Chinstrap', 'Gentoo']

result = model.predict([new_input])
print(f"the result is: {result[0]}, belonging to the class: {penguin_classes[result[0]]}")


the result is: 0, belonging to the class: Adelie


Implement Explainable AI in the project. The goal is to determine which are the more important features on a global scale, then individually determine which are the most significant features on each prediction that led to that specific result.

In [173]:
import numpy as np
import shap

features = ["culmen-length", "culmen-depth", "flipper-length", "body-mass"]
new_input = np.array([[43.5, 20.0, 21.7, 45.76]])

background = X_train[np.random.choice(X_train.shape[0], 100, replace=False)]

explainer  = shap.TreeExplainer(
    model,
    data=background,
    model_output = "probability",
    feature_perturbation="interventional",
    feature_names=features
    )
shap_values = explainer(new_input)

In [174]:
predicted_class = np.argmax(model.predict_proba(new_input))

shap.plots.force(
    explainer.expected_value[predicted_class],     # base value for that class
    shap_values.values[0, :, predicted_class],     # SHAP values for that class
    new_input[0],                                  # feature values
    feature_names=features
)


In [175]:
predicted_class = np.argmax(model.predict_proba(new_input))
values = shap_values.values[0, :, predicted_class]  # shape: [features]

# Pair with feature names
feature_importance = list(zip(features, values))

# Sort by absolute impact
sorted_features = sorted(feature_importance, key=lambda x: abs(x[1]), reverse=True)

# Get top 2
top_2 = sorted_features[:2]
print("Top 2 features contributing to the prediction:")
for name, val in top_2:
    print(f"{name}: SHAP value = {val:.4f}")

Top 2 features contributing to the prediction:
flipper-length: SHAP value = -0.1923
culmen-depth: SHAP value = 0.1081


In [176]:
confidence = max(model.predict_proba(new_input)[0])
print(confidence)
predicted_class = np.argmax(model.predict_proba(new_input))
print(penguin_classes[predicted_class])

0.44
Adelie


In [None]:
print(sorted_features)

<class 'list'>
