<a href="https://colab.research.google.com/github/AleenaPalakkal/DS-TUT/blob/main/exp4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score

# Load the full 20 Newsgroups corpus (both the 'train' and 'test' subsets).
# NOTE(review): headers, footers and quoted replies are left in the posts. The
# scikit-learn dataset docs warn that classifiers can key on that metadata, so
# consider remove=('headers', 'footers', 'quotes') for a more realistic score.
newsgroups = fetch_20newsgroups(subset='all')
X_newsgroups = newsgroups.data      # raw post texts
y_newsgroups = newsgroups.target    # integer class label per post

# Explicit stop-word list, built from scikit-learn's built-in English set.
custom_stop_words = list(ENGLISH_STOP_WORDS)

# TF-IDF over unigrams and bigrams, capped at the 50,000 most frequent terms,
# with sublinear (1 + log(tf)) term-frequency scaling.
vectorizer = TfidfVectorizer(
    stop_words=custom_stop_words,
    max_features=50000,
    ngram_range=(1, 2),
    sublinear_tf=True
)

# Choose the train/test partition BEFORE fitting the vectorizer. Fitting TF-IDF
# on the whole corpus would let the held-out documents influence both the
# selected vocabulary and the IDF weights (train/test leakage), which biases
# the accuracy reported below.
train_idx, test_idx = train_test_split(
    np.arange(len(X_newsgroups)), test_size=0.3, random_state=42
)

# Learn vocabulary and IDF weights from the training documents only, then
# project every document into that feature space.
vectorizer.fit([X_newsgroups[i] for i in train_idx])
X_newsgroups_tfidf = vectorizer.transform(X_newsgroups)

# Slice the feature matrix and the labels with the same row indices.
X_train, X_test = X_newsgroups_tfidf[train_idx], X_newsgroups_tfidf[test_idx]
y_train, y_test = y_newsgroups[train_idx], y_newsgroups[test_idx]

# Number of Random Forests combined by the voting ensemble.
n_models = 5

# Each forest gets its own seed so the members differ. With bootstrap=False
# every tree is built from the full training set rather than a bootstrap
# sample.
models = [
    (
        f"rf_{i}",
        RandomForestClassifier(
            n_estimators=100,
            random_state=100 + i,
            n_jobs=-1,
            bootstrap=False,
        ),
    )
    for i in range(n_models)
]

# Hard voting: the ensemble predicts the majority class label across forests.
voting_clf = VotingClassifier(estimators=models, voting='hard')
voting_clf.fit(X_train, y_train)

# Evaluate on the held-out documents.
y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"20 Newsgroups Dataset - Ensemble Random Forest Accuracy: {accuracy * 100:.4f}%")


20 Newsgroups Dataset - Ensemble Random Forest Accuracy: 86.8589%
