In [1]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score

# For MNIST Dataset

In [2]:
mnist = fetch_openml('mnist_784')
X = mnist.data.astype('float32') / 255.0
y = mnist.target.astype('int')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

train_accuracy = accuracy_score(y_train, rf_classifier.predict(X_train))
test_accuracy = accuracy_score(y_test, rf_classifier.predict(X_test))

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

  warn(


Train Accuracy: 1.0
Test Accuracy: 0.9675


In [3]:
from sklearn.metrics import f1_score

mnist_f1_score_train = f1_score(y_train, rf_classifier.predict(X_train), average='weighted')
mnist_f1_score_test = f1_score(y_test, rf_classifier.predict(X_test), average='weighted')

print("MNIST F1 Score (Train):", mnist_f1_score_train)
print("MNIST F1 Score (Test):", mnist_f1_score_test)

MNIST F1 Score (Train): 1.0
MNIST F1 Score (Test): 0.9674871124026196


# For cifar10 dataset

In [4]:
from sklearn.datasets import fetch_openml

cifar10 = fetch_openml('CIFAR_10_small')

X = cifar10.data.astype('float32') / 255.0  # Scale pixel values to [0, 1]
y = cifar10.target.astype('int')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

train_accuracy = accuracy_score(y_train, rf_classifier.predict(X_train))
test_accuracy = accuracy_score(y_test, rf_classifier.predict(X_test))

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

  warn(


Train Accuracy: 1.0
Test Accuracy: 0.4355


In [5]:
cifar10_f1_score_train = f1_score(y_train, rf_classifier.predict(X_train), average='weighted')
cifar10_f1_score_test = f1_score(y_test, rf_classifier.predict(X_test), average='weighted')

print("CIFAR-10 F1 Score (Train):", cifar10_f1_score_train)
print("CIFAR-10 F1 Score (Test):", cifar10_f1_score_test)

CIFAR-10 F1 Score (Train): 1.0
CIFAR-10 F1 Score (Test): 0.4319350341407362


# For mushroom dataset

In [6]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv('mushrooms.csv')

labelencoder = LabelEncoder()
for column in data.columns:
    data[column] = labelencoder.fit_transform(data[column])

X = data.drop('class', axis=1)  # Features
y = data['class']               # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
predictions = rf_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1257
           1       1.00      1.00      1.00      1181

    accuracy                           1.00      2438
   macro avg       1.00      1.00      1.00      2438
weighted avg       1.00      1.00      1.00      2438



# For 20news group dataset

In [7]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
newsgroups_data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X = vectorizer.fit_transform(newsgroups_data.data)
y = newsgroups_data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)
print(classification_report(y_test, y_pred, target_names=newsgroups_data.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.53      0.39      0.45       151
           comp.graphics       0.55      0.55      0.55       202
 comp.os.ms-windows.misc       0.58      0.63      0.60       195
comp.sys.ibm.pc.hardware       0.49      0.58      0.53       183
   comp.sys.mac.hardware       0.73      0.62      0.67       205
          comp.windows.x       0.78      0.71      0.74       215
            misc.forsale       0.71      0.65      0.68       193
               rec.autos       0.42      0.70      0.53       196
         rec.motorcycles       0.59      0.62      0.61       168
      rec.sport.baseball       0.71      0.72      0.71       211
        rec.sport.hockey       0.79      0.81      0.80       198
               sci.crypt       0.80      0.68      0.73       201
         sci.electronics       0.51      0.47      0.49       202
                 sci.med       0.71      0.74      0.73       194
         