

### Import Libraries


In [32]:

import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import multilabel_confusion_matrix


from sklearn.metrics import accuracy_score, f1_score, hamming_loss, classification_report


## Load the data

In [33]:

data = fetch_openml('mnist_784', version=1)

In [34]:

X = data.data
y = data.target.astype(int)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(data.feature_names)


X shape: (70000, 784)
y shape: (70000,)
['pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6', 'pixel7', 'pixel8', 'pixel9', 'pixel10', 'pixel11', 'pixel12', 'pixel13', 'pixel14', 'pixel15', 'pixel16', 'pixel17', 'pixel18', 'pixel19', 'pixel20', 'pixel21', 'pixel22', 'pixel23', 'pixel24', 'pixel25', 'pixel26', 'pixel27', 'pixel28', 'pixel29', 'pixel30', 'pixel31', 'pixel32', 'pixel33', 'pixel34', 'pixel35', 'pixel36', 'pixel37', 'pixel38', 'pixel39', 'pixel40', 'pixel41', 'pixel42', 'pixel43', 'pixel44', 'pixel45', 'pixel46', 'pixel47', 'pixel48', 'pixel49', 'pixel50', 'pixel51', 'pixel52', 'pixel53', 'pixel54', 'pixel55', 'pixel56', 'pixel57', 'pixel58', 'pixel59', 'pixel60', 'pixel61', 'pixel62', 'pixel63', 'pixel64', 'pixel65', 'pixel66', 'pixel67', 'pixel68', 'pixel69', 'pixel70', 'pixel71', 'pixel72', 'pixel73', 'pixel74', 'pixel75', 'pixel76', 'pixel77', 'pixel78', 'pixel79', 'pixel80', 'pixel81', 'pixel82', 'pixel83', 'pixel84', 'pixel85', 'pixel86', 'pixel87', 'pixel88', 

## Convert your labels y into a multi-label format, by creating two columns:

    - Odd or Not: This column indicates whether the label is odd (True) or not (False).
    - Greater Than 5: This column indicates whether the label is greater than 5 (True) or not (False).

## Odd Convert

In [35]:
y= data.target.astype(int)

is_odd = y % 2 == 1 
is_odd.head()


0     True
1    False
2    False
3     True
4     True
Name: class, dtype: bool

## Greater than 5

In [36]:
greater_5 = y > 5  
greater_5.head() 

0    False
1    False
2    False
3    False
4     True
Name: class, dtype: bool

You may use np.stack() to combine your columns

In [37]:

y_multilabel = np.stack([is_odd, greater_5], axis=1)
y_multilabel

array([[ True, False],
       [False, False],
       [False, False],
       ...,
       [False, False],
       [ True, False],
       [False,  True]])

In [38]:
# display if it odd or not or>5 with real value

label_df = pd.DataFrame(y_multilabel, columns=["Odd", " > 5"])
label_df[" vlaue "] = y
print(label_df.head(10))
print(f"\nMulti-label  shape: {y_multilabel.shape}")

     Odd    > 5   vlaue 
0   True  False        5
1  False  False        0
2  False  False        4
3   True  False        1
4   True   True        9
5  False  False        2
6   True  False        1
7   True  False        3
8   True  False        1
9  False  False        4

Multi-label  shape: (70000, 2)


## Build a model that predict two labels

#### split the data

In [39]:

X_train, X_test, y_train, y_test = train_test_split(X, y_multilabel,test_size=0.2,random_state=42)


### Model 1 — Logistic Regression (MultiOutput)


In [40]:

model_1 = MultiOutputClassifier(
    LogisticRegression(max_iter=200, solver="saga", n_jobs=-1)
)

model_1.fit(X_train, y_train)



In [41]:
Y_pred = model_1.predict(X_test)
Y_pred

array([[False,  True],
       [ True, False],
       [False,  True],
       ...,
       [False, False],
       [ True,  True],
       [ True, False]])

### Evaluation — Logistic Regression


In [42]:

print("model accuracy:", accuracy_score(y_test, Y_pred))
# Hamming loss: fraction of labels incorrectly predicted طبعا كل ماقل افضل 
ham_loss = hamming_loss(y_test, Y_pred)

print("F1 micro:", f1_score(y_test, Y_pred, average="micro"))
print("F1 macro:", f1_score(y_test, Y_pred, average="macro"))
print("Hamming loss:" ,ham_loss)

print("\nClassification Report:")
print(classification_report(y_test, Y_pred, zero_division=0))

model accuracy: 0.8092142857142857
F1 micro: 0.8784173222213568
F1 macro: 0.8752890831943758
Hamming loss: 0.1115

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      7229
           1       0.86      0.84      0.85      5676

   micro avg       0.88      0.87      0.88     12905
   macro avg       0.88      0.87      0.88     12905
weighted avg       0.88      0.87      0.88     12905
 samples avg       0.62      0.62      0.61     12905



In [43]:
confusion_matrix = multilabel_confusion_matrix(y_test, Y_pred)
confusion_matrix

array([[[6072,  699],
        [ 734, 6495]],

       [[7528,  796],
        [ 893, 4783]]])

In [44]:
label_names = ["Odd", " > 5"]

for i, name in enumerate(label_names):
    print(f"\nConfusion Matrix for {name}")
    print(confusion_matrix[i])


Confusion Matrix for Odd
[[6072  699]
 [ 734 6495]]

Confusion Matrix for  > 5
[[7528  796]
 [ 893 4783]]


### Train a Random Forest Classifier


In [45]:

# Train a RandomForestClassifier
model_rf= RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)

In [46]:
Y_pred_rf = model_rf.predict(X_test)


### Evaluation — Random Forest

In [47]:

print("model accuracy:", accuracy_score(y_test, Y_pred_rf))
# Hamming loss: fraction of labels incorrectly predicted طبعا كل ماقل افضل 
ham_loss = hamming_loss(y_test, Y_pred_rf)

print("F1 micro:", f1_score(y_test, Y_pred_rf, average="micro"))
print("F1 macro:", f1_score(y_test, Y_pred_rf, average="macro"))
print("Hamming loss:" ,ham_loss)

print("\nClassification Report:")
print(classification_report(y_test, Y_pred_rf, zero_division=0))

model accuracy: 0.9614285714285714
F1 micro: 0.973516907835719
F1 macro: 0.9728086030223925
Hamming loss: 0.02425

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      7229
           1       0.98      0.96      0.97      5676

   micro avg       0.98      0.97      0.97     12905
   macro avg       0.98      0.97      0.97     12905
weighted avg       0.98      0.97      0.97     12905
 samples avg       0.69      0.69      0.69     12905



In [48]:

conf_matrix_rf = multilabel_confusion_matrix(y_test, Y_pred_rf)

for i, name in enumerate(label_names):
    print(f"\nConfusion Matrix for {name}")
    print(conf_matrix_rf[i])


Confusion Matrix for Odd
[[6637  134]
 [ 176 7053]]

Confusion Matrix for  > 5
[[8204  120]
 [ 249 5427]]


### comparison table

In [49]:

results = pd.DataFrame([
    {
        "Model": "Logistic Regression (MultiOutput)",
        "Accuracy": accuracy_score(y_test, Y_pred),
        "F1 Micro": f1_score(y_test, Y_pred, average="micro"),
        "F1 Macro": f1_score(y_test, Y_pred, average="macro"),
        "Hamming Loss (lower better)": hamming_loss(y_test, Y_pred),
    },
    {
        "Model": "Random Forest",
        "Accuracy": accuracy_score(y_test, Y_pred_rf),
        "F1 Micro": f1_score(y_test, Y_pred_rf, average="micro"),
        "F1 Macro": f1_score(y_test, Y_pred_rf, average="macro"),
        "Hamming Loss (lower better)": hamming_loss(y_test, Y_pred_rf),
    }
])

results = results[["Model", "Accuracy", "F1 Micro", "F1 Macro", "Hamming Loss (lower better)"]]
results = results.round(4)

print(results)

                               Model  Accuracy  F1 Micro  F1 Macro  \
0  Logistic Regression (MultiOutput)    0.8092    0.8784    0.8753   
1                      Random Forest    0.9614    0.9735    0.9728   

   Hamming Loss (lower better)  
0                       0.1115  
1                       0.0242  


 Random Forest achieved higher accuracy and F1 scores  better  multilabel prediction performance

 The lower Hamming Loss  for Random Forest shows fewer label error


Random Forest is the stronger model for this multilabel MNIST task providing more accurate predictions with significantly fewer misclassified labels.