### Statistical Learning for Data Science 2 (229352)
#### Instructor: Donlapark Ponnoprat

#### [Course website](https://donlapark.pages.dev/229352/)

## Lab #4

In [1]:
from scipy.stats import uniform

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [2]:
# Number of documents kept from each split so the lab runs quickly.
N_TRAIN = 3000
N_TEST = 500

# Load the train and test splits of the 20 Newsgroups dataset.
train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

# Keep only the leading documents of each split, together with their labels.
Xtrain, ytrain = train.data[:N_TRAIN], train.target[:N_TRAIN]
Xtest, ytest = test.data[:N_TEST], test.target[:N_TEST]

# Sanity check: documents and labels of the test subset must line up.
print("X:", len(Xtest))
print("y:", len(ytest))

X: 500
y: 500


### Naive Bayes [(Documentation)](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)

In [3]:
# Baseline model: TF-IDF features (English stop words removed) fed into a
# multinomial Naive Bayes classifier with smoothing parameter alpha=1.0.
pipe_nb = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('nb', MultinomialNB(alpha=1.0))
])

pipe_nb.fit(Xtrain, ytrain)

y_pred = pipe_nb.predict(Xtest)

# zero_division=0 reports the same values as the default setting but silences
# the UndefinedMetricWarning emitted when a class is never predicted
# (its precision is then reported as 0).
print(classification_report(ytest, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.67      0.38      0.48        21
           1       0.79      0.52      0.63        21
           2       0.58      0.69      0.63        26
           3       0.74      0.68      0.71        34
           4       0.72      0.85      0.78        34
           5       0.88      0.81      0.84        26
           6       1.00      0.73      0.84        22
           7       0.70      1.00      0.82        28
           8       0.90      0.82      0.86        33
           9       0.88      0.84      0.86        25
          10       0.82      1.00      0.90        27
          11       0.79      0.95      0.86        20
          12       0.59      0.54      0.57        24
          13       0.75      0.78      0.77        23
          14       0.87      0.71      0.78        28
          15       0.53      0.90      0.67        29
          16       0.50      0.95      0.66        21
          17       0.94    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Random Search Cross-Validation [(Documentation)](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)

### Uniform distribution in `SciPy` [(Documentation)](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.uniform.html)

In [4]:
# Search space for the random search. `uniform(loc, scale)` draws values
# uniformly from [loc, loc + scale], i.e. alpha in [0.0001, 10.0001] here.
alpha_distribution = uniform(0.0001, 10)
param_dist = {'nb__alpha': alpha_distribution}

In [None]:
# Pipeline whose Naive Bayes smoothing parameter is tuned by the search below.
search_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('nb', MultinomialNB()),
])

# Randomized search: 20 candidate alphas sampled from `param_dist`, each scored
# by 5-fold cross-validated macro-F1. The fixed random_state makes the sampled
# candidates repeatable.
search_options = dict(
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='f1_macro',
    random_state=42,
    n_jobs=-1,
)
random_search = RandomizedSearchCV(search_pipeline, **search_options)
random_search.fit(Xtrain, ytrain)

best_alpha = random_search.best_params_['nb__alpha']
print("Best alpha:", best_alpha)

#### Exercise

1. For the Naive Bayes model, use grid search 5-fold cross-validation across different values of `alpha` to find the best model.

2. For the best value of `alpha`, compute the `f1_macro` score on the test set.
* What value of `alpha` did you obtain?
* What is the model's `f1_macro` score?

3. Repeat Exercises 1 and 2 using **random search** 5-fold cross-validation across different values of `alpha`. Compute the `f1_macro` score on the test set.
* What value of `alpha` did you obtain?
* Did you get a better `f1_macro` score compared to grid search in Exercise 2?

In [None]:
#1
# Exercise 1: exhaustive 5-fold grid search over candidate alpha values,
# selecting the one with the best cross-validated macro-F1.
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10]}

tfidf_step = ('tfidf', TfidfVectorizer(stop_words='english'))
nb_step = ('nb', MultinomialNB())
pipe = Pipeline([tfidf_step, nb_step])

grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
grid.fit(Xtrain, ytrain)

print("Best alpha (Grid Search):", grid.best_params_['nb__alpha'])

In [None]:
#2
# Exercise 2: evaluate the model selected by grid search on the held-out test set.
y_pred_grid = grid.predict(Xtest)

# The exercise asks for the macro-averaged F1 specifically, so compute it
# directly instead of reading it off the full classification report.
f1_macro_grid = f1_score(ytest, y_pred_grid, average='macro')

print("Grid Search f1_macro on test set:", f1_macro_grid)
# Per-class breakdown for reference; zero_division=0 avoids warnings for
# classes that are never predicted.
print(classification_report(ytest, y_pred_grid, zero_division=0))

In [None]:
#3
# Exercise 3: random search over alpha. `uniform(loc, scale)` samples from
# [loc, loc + scale], so alpha is drawn from [0.0001, 10.0001].
param_dist = dict(nb__alpha=uniform(0.0001, 10))

# 20 sampled candidates, each scored by 5-fold cross-validated macro-F1;
# `fit` returns the fitted search object itself.
random_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='f1_macro',
    random_state=42,
    n_jobs=-1,
).fit(Xtrain, ytrain)

best_alpha_random = random_search.best_params_['nb__alpha']
print("Best alpha (Random Search):", best_alpha_random)

In [None]:
#4
# Exercise 3 (continued): evaluate the model selected by random search on the
# held-out test set.
y_pred_random = random_search.predict(Xtest)

# Compute the macro-averaged F1 explicitly so it can be compared with the
# grid-search result.
f1_macro_random = f1_score(ytest, y_pred_random, average='macro')

print("Random Search f1_macro on test set:", f1_macro_random)
# Per-class breakdown for reference; zero_division=0 avoids warnings for
# classes that are never predicted.
print(classification_report(ytest, y_pred_random, zero_division=0))