**Import Required Libraries**

In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import fetch_20newsgroups

**Load and Prepare Dataset**

In [18]:
# Fetch the complete 20 Newsgroups corpus (subset='all') and pull out
# the raw documents together with their integer class labels.
newsgroups_data = fetch_20newsgroups(subset='all')
X, y = newsgroups_data.data, newsgroups_data.target

# Hold out 30% of the documents for evaluation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit the TF-IDF vocabulary and weights on the training documents only, then
# apply that same fitted vectorizer to the held-out documents.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

**Decision Tree Classifier**

In [19]:
# Build a seeded decision tree and train it on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and training are chained.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train_tfidf, y_train)

# Label the held-out documents.
y_pred_dt = dt_classifier.predict(X_test_tfidf)

# Report overall accuracy followed by the per-class precision/recall/F1 table.
print("Decision Tree Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Classification Report:\n", classification_report(y_test, y_pred_dt))

Decision Tree Classifier
Accuracy: 0.61726211531659
Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.51      0.50       236
           1       0.47      0.45      0.46       287
           2       0.60      0.62      0.61       290
           3       0.44      0.49      0.47       285
           4       0.60      0.53      0.56       312
           5       0.56      0.56      0.56       308
           6       0.71      0.68      0.70       276
           7       0.63      0.67      0.65       304
           8       0.77      0.74      0.75       279
           9       0.60      0.62      0.61       308
          10       0.73      0.70      0.72       309
          11       0.80      0.77      0.78       290
          12       0.45      0.41      0.43       304
          13       0.61      0.65      0.63       300
          14       0.72      0.71      0.72       297
          15       0.73      0.69      0.71       292
     

**Naive Bayes Classifier**

In [20]:
# Build a multinomial Naive Bayes model with default settings and train it on
# the TF-IDF training matrix. fit() returns the estimator, so the two steps
# are chained.
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train)

# Label the held-out documents.
y_pred_nb = nb_classifier.predict(X_test_tfidf)

# Report overall accuracy followed by the per-class precision/recall/F1 table.
print("Naive Bayes Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))

Naive Bayes Classifier
Accuracy: 0.8363990095507605
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.70      0.78       236
           1       0.84      0.80      0.82       287
           2       0.86      0.82      0.84       290
           3       0.63      0.84      0.72       285
           4       0.96      0.79      0.87       312
           5       0.96      0.78      0.86       308
           6       0.92      0.67      0.78       276
           7       0.90      0.90      0.90       304
           8       0.93      0.95      0.94       279
           9       0.96      0.94      0.95       308
          10       0.91      0.97      0.94       309
          11       0.76      0.98      0.86       290
          12       0.91      0.77      0.83       304
          13       0.97      0.88      0.92       300
          14       0.89      0.97      0.93       297
          15       0.49      0.99      0.65       292
     

**Cross-Validation (5-Fold Grid Search for the Decision Tree)**

In [21]:
# GridSearchCV is imported in the notebook's import cell.

# Candidate hyperparameter values for the decision tree. Every combination
# (4 x 3 x 3 = 36) is scored with 5-fold cross-validation on the training split.
param_grid = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# n_jobs=-1 runs the independent fits on all available CPU cores. The tree is
# seeded, so the selected parameters do not depend on how the work is scheduled.
grid_search_dt = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search_dt.fit(X_train_tfidf, y_train)

# Best parameters for Decision Tree
print("Best parameters for Decision Tree:", grid_search_dt.best_params_)

# best_estimator_ is the winning configuration refit on the whole training
# split (GridSearchCV's default refit=True); evaluate it on the held-out set.
best_dt_classifier = grid_search_dt.best_estimator_
y_pred_best_dt = best_dt_classifier.predict(X_test_tfidf)

print("Tuned Decision Tree Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred_best_dt))
print("Classification Report:\n", classification_report(y_test, y_pred_best_dt))

Best parameters for Decision Tree: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Tuned Decision Tree Classifier
Accuracy: 0.61726211531659
Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.51      0.50       236
           1       0.47      0.45      0.46       287
           2       0.60      0.62      0.61       290
           3       0.44      0.49      0.47       285
           4       0.60      0.53      0.56       312
           5       0.56      0.56      0.56       308
           6       0.71      0.68      0.70       276
           7       0.63      0.67      0.65       304
           8       0.77      0.74      0.75       279
           9       0.60      0.62      0.61       308
          10       0.73      0.70      0.72       309
          11       0.80      0.77      0.78       290
          12       0.45      0.41      0.43       304
          13       0.61      0.65      0.63       300
     

**Parameter Tuning (Decision Tree Grid Search)**

In [22]:
# GridSearchCV is imported in the notebook's import cell.
# NOTE(review): this cell repeats the grid search from the preceding section
# with the same grid, seed and data, so it reproduces the same result.

# Candidate hyperparameter values for the decision tree. Every combination
# (4 x 3 x 3 = 36) is scored with 5-fold cross-validation on the training split.
param_grid = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# n_jobs=-1 runs the independent fits on all available CPU cores. The tree is
# seeded, so the selected parameters do not depend on how the work is scheduled.
grid_search_dt = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search_dt.fit(X_train_tfidf, y_train)

# Best parameters for Decision Tree
print("Best parameters for Decision Tree:", grid_search_dt.best_params_)

# best_estimator_ is the winning configuration refit on the whole training
# split (GridSearchCV's default refit=True); evaluate it on the held-out set.
best_dt_classifier = grid_search_dt.best_estimator_
y_pred_best_dt = best_dt_classifier.predict(X_test_tfidf)

print("Tuned Decision Tree Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred_best_dt))
print("Classification Report:\n", classification_report(y_test, y_pred_best_dt))

Best parameters for Decision Tree: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Tuned Decision Tree Classifier
Accuracy: 0.61726211531659
Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.51      0.50       236
           1       0.47      0.45      0.46       287
           2       0.60      0.62      0.61       290
           3       0.44      0.49      0.47       285
           4       0.60      0.53      0.56       312
           5       0.56      0.56      0.56       308
           6       0.71      0.68      0.70       276
           7       0.63      0.67      0.65       304
           8       0.77      0.74      0.75       279
           9       0.60      0.62      0.61       308
          10       0.73      0.70      0.72       309
          11       0.80      0.77      0.78       290
          12       0.45      0.41      0.43       304
          13       0.61      0.65      0.63       300
     