In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
        "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
        "hours-per-week", "native-country", "income"]
data = pd.read_csv('/Users/swithana/git/icicle_model_card/examples/notebooks/data/adult/adult.data', names=columns)

In [3]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Drop rows with missing values
data = data.dropna()

# Encode categorical features using LabelEncoder
label_encoders = {}
for column in ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Split features and target variable
X = data.drop('income', axis=1)
y = data['income']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
5514,33,2,198183,9,13,4,10,1,4,0,0,0,50,39
19777,36,4,86459,8,11,2,4,0,4,1,0,1887,50,39
10781,58,6,203039,6,5,5,3,1,4,1,0,0,40,39
32240,21,4,180190,8,11,2,5,0,4,1,0,0,46,39
9876,27,4,279872,15,10,0,8,1,4,1,0,0,40,39


In [6]:
# Initialize and train a SVC
clf = SVC(random_state=42)
clf.fit(X_train, y_train)

In [7]:
# Make predictions on the test set
predictions = clf.predict(X_test)

# Calculate accuracy and print classification report
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.80
Classification Report:
               precision    recall  f1-score   support

           0       0.79      1.00      0.88      4942
           1       0.98      0.16      0.27      1571

    accuracy                           0.80      6513
   macro avg       0.89      0.58      0.58      6513
weighted avg       0.84      0.80      0.73      6513



## Model Card Generation

In [8]:
from icicle_model_card.icicle_model_card import ModelCard, AIModel, BiasAnalysis, ExplainabilityAnalysis, validate_mc, Metric, save_mc
import json

In [9]:
mc = ModelCard(
            name="UCI Adult Data Analysis via Random Forest",
            version="0.1",
            short_description="UCI Adult Data analysis using SKLearn and SVM",
            full_description="Using a Support Vector Machine to train on UCI Adult Data Analysis",
            keywords="uci adult, sklearn, svm, explainability, fairness, fairlearn, shap",
            author="Sachith Withana"
        )

mc.input_data = 'https://archive.ics.uci.edu/dataset/2/adult'
mc.output_data = 'https://github.iu.edu/swithana/mcwork/svm/adult_model.pkl'

In [10]:
model_metrics = [Metric("Test loss", 0.8)]

ai_model = AIModel(
            name="UCI Adult tensorflow model",
            version="0.1",
            description="Census classification problem using support vector machine",
            owner="Sachith Withana",
            location="https://github.iu.edu/swithana/mcwork/svm/adult_model.pkl",
            license="BSD-3 Clause",
            framework="sklearn",
            model_type="svm",
            test_accuracy=accuracy,
        )
ai_model.metrics = model_metrics
mc.ai_model = ai_model

In [11]:
mc.populate_bias(X_test, y_test, predictions, "gender", X_test['sex'], clf)

  mf = mf.applymap(lambda x: x if np.isscalar(x) else np.nan)
  mf = mf.applymap(lambda x: x if np.isscalar(x) else np.nan)
  mf = mf.applymap(lambda x: x if np.isscalar(x) else np.nan)
  mf = mf.applymap(lambda x: x if np.isscalar(x) else np.nan)
  mf = mf.applymap(lambda x: x if np.isscalar(x) else np.nan)
  mf = mf.applymap(lambda x: x if np.isscalar(x) else np.nan)


In [12]:
print(mc)

{
    "name": "UCI Adult Data Analysis via Random Forest",
    "version": "0.1",
    "short_description": "UCI Adult Data analysis using SKLearn and SVM",
    "full_description": "Using a Support Vector Machine to train on UCI Adult Data Analysis",
    "keywords": "uci adult, sklearn, svm, explainability, fairness, fairlearn, shap",
    "author": "Sachith Withana",
    "input_data": "https://archive.ics.uci.edu/dataset/2/adult",
    "output_data": "https://github.iu.edu/swithana/mcwork/svm/adult_model.pkl",
    "ai_model": {
        "name": "UCI Adult tensorflow model",
        "version": "0.1",
        "description": "Census classification problem using support vector machine",
        "owner": "Sachith Withana",
        "location": "https://github.iu.edu/swithana/mcwork/svm/adult_model.pkl",
        "license": "BSD-3 Clause",
        "framework": "sklearn",
        "model_type": "svm",
        "test_accuracy": 0.7957930293259634,
        "model_structure": "",
        "metrics": [
  

save_mc(mc, "/Users/swithana/git/icicle_model_card/examples/model_cards/tesorflow_adult_nn_MC.json")

In [13]:
save_mc(mc, "/Users/swithana/git/icicle_model_card/examples/model_cards/sklearn_adult_svm_MC.json")