In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [23]:
data = pd.read_csv('/Users/swithana/git/d2i/patra-toolkit/examples/notebooks/data/adult/train.csv')

In [24]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [25]:
# Drop irrelevant columns
data_cleaned = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

In [26]:
# Fill missing values in 'Age' with the median and 'Embarked' with the mode
data_cleaned['Age'] = data_cleaned['Age'].fillna(data_cleaned['Age'].median())
data_cleaned['Embarked'] = data_cleaned['Embarked'].fillna(data_cleaned['Embarked'].mode()[0])


In [27]:
# Encode categorical variables 'Sex' and 'Embarked'
label_encoder_sex = LabelEncoder()
label_encoder_embarked = LabelEncoder()

In [28]:
data_cleaned['Sex'] = label_encoder_sex.fit_transform(data_cleaned['Sex'])
data_cleaned['Embarked'] = label_encoder_embarked.fit_transform(data_cleaned['Embarked'])

In [29]:
# Separate features (X) and target variable (y)
X = data_cleaned.drop(columns=['Survived'])
y = data_cleaned['Survived']


In [30]:
data_cleaned.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [31]:
# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

In [33]:
# Make predictions on the test set
predictions = clf.predict(X_test)

# Calculate accuracy and print classification report
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.82
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.88      0.85       105
           1       0.81      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



In [34]:
from patra_model_card.patra_model_card import ModelCard, AIModel, BiasAnalysis, ExplainabilityAnalysis, Metric

In [35]:
mc = ModelCard(
    name="Titanic Disaster Analysis",
    version="0.1",
    short_description="Titanic Disaster Analysis using SKLearn and Random Forest for demonstration of Patra Model Cards.",
    full_description="We have trained a ML model using SKLearn and Random Forest to predict profit for Titanic Disaster Analysis dataset. We leverage this data to run the Patra model cards to capture metadata about the model as well as fairness and explainability metrics.",
    keywords="titanic, sklearn, random_forest, explainability, fairness, patra",
    author="Isuru Gamage",
    input_type="Tabular",
    category="classification"
)

mc.input_data = 'https://www.kaggle.com/datasets/monisamir/titanic-disaster-analysis'
mc.output_data = 'https://github.iu.edu/d2i/dockerhub/tensorflow/titanic_modelv01'

In [36]:
model_metrics = [Metric("Test loss", 0.7)]

ai_model = AIModel(
    name="Survived prediction tensorflow model",
    version="0.1",
    description="Titanic Disaster Analysis using SKLearn and Random Forest",
    owner="Isuru Gamage",
    location="https://github.iu.edu/d2i/sales/tensorflow_model",
    license="BSD-3 Clause",
    framework="sklearn",
    model_type="random_forest",
    foundational_model="None",
    test_accuracy=2
)
ai_model.metrics = model_metrics
mc.ai_model = ai_model

In [37]:
mc.populate_bias(X_test, y_test, predictions, "Sex", X_test['Sex'], clf)

In [38]:
x_columns = data_cleaned.columns.tolist()
x_columns.remove('Survived')

mc.populate_xai(X_test[:10], x_columns, clf, 10)

(10, 7)


In [39]:
print(mc)

{
    "name": "Titanic Disaster Analysis",
    "version": "0.1",
    "short_description": "Titanic Disaster Analysis using Tensorflow for demonstration of Patra Model Cards.",
    "full_description": "We have trained a ML model using the tensorflow framework to predict profit for Titanic Disaster Analysis dataset. We leverage this data to run the Patra model cards to capture metadata about the model as well as fairness and explainability metrics.",
    "keywords": "titanic, tensorflow, explainability, fairness, patra",
    "author": "Isuru Gamage",
    "input_type": "Tabular",
    "category": "classification",
    "input_data": "https://www.kaggle.com/datasets/monisamir/titanic-disaster-analysis",
    "output_data": "https://github.iu.edu/d2i/dockerhub/tensorflow/titanic_modelv01",
    "ai_model": {
        "name": "Survived prediction tensorflow model",
        "version": "0.1",
        "description": "Census classification problem using Tensorflow Neural Network using the Titanic Dis

In [40]:
mc.validate()

True

In [41]:
mc.save("/Users/swithana/git/d2i/patra-toolkit/examples/model_cards/RF_sklearn_titanic_MC.json")