<a href="https://colab.research.google.com/github/Danny2611/LAB-ML/blob/master/Lab_7_21130584_LeQuocTrung.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This lab deals with **GridSearchCV** for tuning the hyper-parameters of an estimator and applying vectorization techniques to the **movie reviews dataset** for a classification task.

*   **Deadline: 23:59, 22/4/2024 (lớp TH thứ 3) || 29/4/2024 (lớp TH thứ 5)**



# Import libraries

In [None]:
# code
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer




# Task 1. With **iris** dataset
*  1.1. Apply **GridSearchCV** for **SVM** to find the best hyperparameters using the following param_grid.

```
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
```




In [None]:

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report


# Load the iris measurements and hold out 20% of the rows for the final report.
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# SVM candidates prescribed by the exercise (5 x 5 x 2 = 50 combinations).
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf', 'linear'],
)

svm = SVC()
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best hyperparameters found:", grid_search.best_params_, sep="\n")

# Score the best estimator found by the search on the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best hyperparameters found:
{'C': 1, 'gamma': 1, 'kernel': 'rbf'}

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



*  1.2. Apply **GridSearchCV** for **kNN** to find the best hyperparameters using the following param_grid.

```
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
```
where

    *  **n_neighbors**: Decide the best k based on the values we have computed earlier.
    *  **weights**: Check whether adding weights to the data points is beneficial to the model or not. 'uniform' assigns no weight, while 'distance' weighs points by the inverse of their distances meaning nearer points will have more weight than the farther points.
    *  **metric**: The distance metric to be used when calculating the similarity.


In [None]:
#code
from sklearn.neighbors import KNeighborsClassifier
# Reload iris and rebuild the 80/20 hold-out split with the same fixed seed.
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# kNN candidates from the exercise statement (6 x 2 x 3 = 36 combinations).
grid_params = dict(
    n_neighbors=[5, 7, 9, 11, 13, 15],
    weights=['uniform', 'distance'],
    metric=['minkowski', 'euclidean', 'manhattan'],
)

knn = KNeighborsClassifier()
grid_search_knn = GridSearchCV(estimator=knn, param_grid=grid_params, cv=5, verbose=2, n_jobs=-1)
grid_search_knn.fit(X_train, y_train)

print("Best hyperparameters found:", grid_search_knn.best_params_, sep="\n")

# Score the best estimator found by the search on the held-out rows.
best_model_knn = grid_search_knn.best_estimator_
y_pred_knn = best_model_knn.predict(X_test)
print("\nClassification Report:", classification_report(y_test, y_pred_knn), sep="\n")


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best hyperparameters found:
{'metric': 'minkowski', 'n_neighbors': 11, 'weights': 'uniform'}

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



*  1.3. Apply **GridSearchCV** for **Random Forest** to find the best hyperparameters using the following param_grid.

```
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
```

In [None]:

from sklearn.ensemble import RandomForestClassifier


# Load iris and hold out 20% of the rows for the final report
# (fixed seed so the split is the same on every run).
iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest candidates from the exercise statement
# (4 x 3 x 3 x 3 = 108 combinations).
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}

# Seed the forest: tree construction is randomized, so without a fixed
# random_state the selected hyperparameters can change between runs.
rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(rf, param_grid, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best hyperparameters found:")
print(grid_search.best_params_)

# Evaluate the best estimator found by the search on the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best hyperparameters found:
{'max_depth': 3, 'max_features': 'sqrt', 'max_leaf_nodes': 6, 'n_estimators': 25}

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



*   1.4 Compare the best obtained results from 1.1 to 1.3 (use PrettyTable to display the results)

# Task 2.
For breast cancer dataset (https://tinyurl.com/3vme8hr3) which could be loaded from datasets in sklearn as follows:

```
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
cancer = datasets.load_breast_cancer()
```

*   Apply **GridSearchCV** to different classification algorithms such as **SVM, kNN, LogisticRegression, RandomForest**.
*   Compare the results obtained by the best hyperparameters among classification algorithms.

*   2.1. Apply **GridSearchCV** to **SVM**


In [None]:
# code


# Load the breast cancer data bundled with scikit-learn and hold out 20% of
# the rows for the final report (fixed seed for a repeatable split).
cancer = datasets.load_breast_cancer()
X, y = cancer.data, cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# `gamma` only influences the RBF kernel. Searching it together with the
# linear kernel refits identical linear models once per gamma value and
# reports a meaningless gamma in best_params_, so each kernel gets its own
# sub-grid (9 RBF + 3 linear = 12 candidates instead of 18).
param_grid = [
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01]},
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
]

svm = SVC()

grid_search = GridSearchCV(svm, param_grid, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best hyperparameters found:")
print(grid_search.best_params_)

# Evaluate the best estimator found by the search on the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best hyperparameters found:
{'C': 1, 'gamma': 1, 'kernel': 'linear'}

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.91      0.94        43
           1       0.95      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



*   2.2. Apply **GridSearchCV** to **kNN**

In [None]:
#code



# Reload the breast cancer data and rebuild the 80/20 hold-out split.
cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# kNN candidates: 3 neighbourhood sizes x 2 weightings x 2 metrics = 12 combinations.
param_grid = dict(
    n_neighbors=[5, 7, 9],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

knn = KNeighborsClassifier()
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best hyperparameters found:", grid_search.best_params_, sep="\n")

# Score the best estimator found by the search on the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")



Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best hyperparameters found:
{'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.86      0.91        43
           1       0.92      0.99      0.95        71

    accuracy                           0.94       114
   macro avg       0.95      0.92      0.93       114
weighted avg       0.94      0.94      0.94       114



*   2.3. Apply **GridSearchCV** to **LogisticRegression**

In [None]:
#code
#code

from sklearn.linear_model import LogisticRegression

# Load the breast cancer data and hold out 20% of the rows for the final
# report (fixed seed for a repeatable split).
cancer = datasets.load_breast_cancer()
X, y = cancer.data, cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Regularization strengths and penalty types to compare (7 x 2 = 14 candidates).
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'penalty': ['l1', 'l2']}

# The default 'lbfgs' solver rejects penalty='l1', so every 'l1' candidate
# raised ValueError and was scored NaN (half of all fits). 'liblinear'
# supports both 'l1' and 'l2' and is suitable for this binary problem, so
# the whole grid is actually evaluated.
logistic_reg = LogisticRegression(solver='liblinear', max_iter=10000)

grid_search = GridSearchCV(logistic_reg, param_grid, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best hyperparameters found:")
print(grid_search.best_params_)

# Evaluate the best estimator found by the search on the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Fitting 5 folds for each of 14 candidates, totalling 70 fits


35 fits failed out of a total of 70.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.95384615        nan 0.96

Best hyperparameters found:
{'C': 10, 'penalty': 'l2'}

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.91      0.94        43
           1       0.95      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



*   2.4. Apply **GridSearchCV** to **RandomForest**

In [None]:

#code

from sklearn.ensemble import RandomForestClassifier

# Load the breast cancer data and hold out 20% of the rows for the final
# report (fixed seed for a repeatable split).
cancer = datasets.load_breast_cancer()
X, y = cancer.data, cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest candidates (3 x 3 x 3 x 3 = 81 combinations).
param_grid = {'n_estimators': [50, 100, 150],
              'max_features': ['sqrt', 'log2', None],
              'max_depth': [3, 6, 9],
              'max_leaf_nodes': [3, 6, 9]}

# Seed the forest: tree construction is randomized, so without a fixed
# random_state the selected hyperparameters can change between runs.
random_forest = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(random_forest, param_grid, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best hyperparameters found:")
print(grid_search.best_params_)

# Evaluate the best estimator found by the search on the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best hyperparameters found:
{'max_depth': 9, 'max_features': 'log2', 'max_leaf_nodes': 9, 'n_estimators': 50}

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



*   2.5. Compare the best obtained results among classification algorithms (use PrettyTable to display the results)

In [None]:
#code
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from prettytable import PrettyTable


# Load the breast cancer data and hold out 20% of the rows; every model below
# is tuned on the same training part and scored on the same test part.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One [name, best parameters, test accuracy] row per algorithm.
results = []

classifiers = {
    'SVM': SVC(),
    'kNN': KNeighborsClassifier(),
    # 'liblinear' supports both the 'l1' and 'l2' penalties searched below; the
    # default 'lbfgs' solver rejects 'l1', which made half of the fits fail.
    'Logistic Regression': LogisticRegression(solver='liblinear', max_iter=10000),
    # Fixed seed so the forest (and therefore the comparison) is reproducible.
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Search space for each algorithm, keyed by the same names as `classifiers`.
param_grids = {
    'SVM': {'C': [0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'kNN': {'n_neighbors': [3, 5, 7, 9]},
    'Logistic Regression': {'C': [0.001, 0.01, 0.1, 1, 10], 'penalty': ['l1', 'l2']},
    'Random Forest': {'n_estimators': [50, 100, 150], 'max_depth': [3, 6, None]}
}

# Tune each algorithm with 5-fold cross-validation, then score its best
# estimator on the held-out test rows.
for classifier_name, classifier in classifiers.items():
    param_grid = param_grids[classifier_name]
    grid_search = GridSearchCV(classifier, param_grid, cv=5, verbose=2, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append([classifier_name, best_params, accuracy])

table = PrettyTable()
table.field_names = ['Algorithm', 'Best Parameters', 'Accuracy']

for result in results:
    table.add_row(result)

print(table)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.95384615        nan 0.96

Fitting 5 folds for each of 9 candidates, totalling 45 fits
+---------------------+------------------------------------------+--------------------+
|      Algorithm      |             Best Parameters              |      Accuracy      |
+---------------------+------------------------------------------+--------------------+
|         SVM         |       {'C': 1, 'kernel': 'linear'}       | 0.956140350877193  |
|         kNN         |            {'n_neighbors': 9}            | 0.956140350877193  |
| Logistic Regression |        {'C': 10, 'penalty': 'l2'}        | 0.956140350877193  |
|    Random Forest    | {'max_depth': None, 'n_estimators': 100} | 0.9649122807017544 |
+---------------------+------------------------------------------+--------------------+


# Task 3. With **mobile price classification** dataset
* 3.1.  Apply **GridSearchCV** for **SVM, kNN, RandomForest** algorithms to find the best hyperparameters for each classification algorithm.
* 3.2. Compare the best obtained results among classification algorithms (use PrettyTable to display the results)

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from prettytable import PrettyTable
import pandas as pd

def myGridSearchCV(X, y, classifier, param_grid, test_size=0.2, cv=5, random_state=42):
    """Tune ``classifier`` with GridSearchCV and score the best model on a hold-out split.

    The data is split once into train and test parts. The grid search
    (accuracy is not overridden, so the estimator's default scorer is used)
    runs on the training part only, and the best estimator found is then
    evaluated on the test part.

    Parameters
    ----------
    X, y
        Features and labels; forwarded to ``train_test_split``.
    classifier
        Unfitted scikit-learn estimator to tune.
    param_grid
        Parameter grid forwarded to ``GridSearchCV`` (a dict or a list of dicts).
    test_size
        Fraction of rows held out for the final evaluation (default 0.2).
    cv
        Cross-validation setting forwarded to ``GridSearchCV`` (default 5).
    random_state
        Seed for the train/test split (default 42).

    Returns
    -------
    tuple
        ``(class name of classifier, best parameters, best cross-validation
        score, test accuracy, weighted precision, weighted recall, weighted F1)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    grid_search = GridSearchCV(classifier, param_grid, cv=cv, verbose=2, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Hold-out metrics; the 'weighted' average weights each class by its support.
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    return classifier.__class__.__name__, best_params, grid_search.best_score_, accuracy, precision, recall, f1

# Read the mobile price data and work on a reproducible 10% sample of its rows.
mobile = pd.read_csv("/content/sample_data/mobile.csv")
mobile_sample = mobile.sample(frac=0.1, random_state=42)
X = mobile_sample.drop(columns=['price_range'])
y = mobile_sample['price_range']

# `gamma` only influences the RBF kernel, so it is searched for 'rbf' alone;
# pairing it with 'linear' would refit the same linear model once per gamma
# value and report a meaningless gamma in the best parameters.
svm_param_grid = [
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
]
knn_param_grid = {'n_neighbors': [5, 7, 9, 11], 'weights': ['uniform', 'distance'], 'metric': ['minkowski', 'euclidean']}
rf_param_grid = {'n_estimators': [25, 50, 75], 'max_features': ['sqrt', 'log2'], 'max_depth': [3, 6, 9], 'max_leaf_nodes': [3, 6, 9]}

# (estimator, search space) pairs to compare.
classifiers = [
    (SVC(), svm_param_grid),
    (KNeighborsClassifier(), knn_param_grid),
    # Fixed seed so the forest's results are reproducible between runs.
    (RandomForestClassifier(random_state=42), rf_param_grid)
]

table = PrettyTable(["Classifier", "Best_Score", "Accuracy", "Precision", "Recall", "F1_Score"])

# Tune every classifier, print its best parameters and add its scores to the table.
for classifier, param_grid in classifiers:
    algoName, best_params, best_score, accuracy, precision, recall, f1 = myGridSearchCV(X, y, classifier, param_grid)
    print(f"Classifier: {algoName}, Params: {best_params}")
    table.add_row([algoName, best_score, accuracy, precision, recall, f1])

print(table)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Classifier: SVC, Params: {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Classifier: KNeighborsClassifier, Params: {'metric': 'minkowski', 'n_neighbors': 5, 'weights': 'uniform'}
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Classifier: RandomForestClassifier, Params: {'max_depth': 9, 'max_features': 'log2', 'max_leaf_nodes': 9, 'n_estimators': 75}
+------------------------+------------+----------+--------------------+--------+--------------------+
|       Classifier       | Best_Score | Accuracy |     Precision      | Recall |      F1_Score      |
+------------------------+------------+----------+--------------------+--------+--------------------+
|          SVC           |  0.94375   |   0.95   | 0.9576923076923076 |  0.95  | 0.9498106060606061 |
|  KNeighborsClassifier  |   0.8875   |   0.85   | 0.8558712121212121 |  0.85  | 0.8507551487414187 |
| Ran

# Task 4.
The dataset consists of **2000 user-created movie reviews** archived on the IMDb(Internet Movie Database). The reviews are equally partitioned into a positive set and a negative set (1000+1000). Each review consists of a plain text file (.txt) and a class label representing the overall user opinion.
The class attribute has only two values: **pos** (positive) or **neg** (negative).


*   4.1 Importing additional libraries

In [None]:
import nltk, random
# Download the movie_reviews corpus so the corpus reader imported on the
# next line has data to read.
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


*   4.2. Movie reviews information

In [None]:
# Quick look at the corpus: number of files, label set, first tokens, first file ids.
n_reviews = len(movie_reviews.fileids())
print(n_reviews)
labels = movie_reviews.categories()
print(labels)
print(movie_reviews.words()[:100])
first_ids = movie_reviews.fileids()[:10]
print(first_ids)

2000
['neg', 'pos']
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]
['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


*   4.3. Create dataset from movie reviews

In [None]:
# Pair every review's token list with its sentiment label, walking the corpus
# category by category.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

# Fixed seed so the shuffled order is the same on every run.
random.seed(123)
random.shuffle(documents)

In [None]:
# Corpus-level summary followed by a peek at the first shuffled review.
n_documents = len(documents)
corpus_size = np.sum([len(tokens) for tokens, _label in documents])
print('Number of Reviews/Documents: {}'.format(n_documents))
print('Corpus Size (words): {}'.format(corpus_size))
print('Sample Text of Doc 1:')
print('-'*30)
first_tokens = documents[0][0]
print(' '.join(first_tokens[:50]))  # first 50 words of the first document

Number of Reviews/Documents: 2000
Corpus Size (words): 1583820
Sample Text of Doc 1:
------------------------------
most movies seem to release a third movie just so it can be called a trilogy . rocky iii seems to kind of fit in that category , but manages to be slightly unique . the rocky formula of " rocky loses fight / rocky trains / rocky wins fight


In [None]:
# Tally how many reviews carry each sentiment label.
sentiment_distr = Counter(label for _words, label in documents)
print(sentiment_distr)

Counter({'pos': 1000, 'neg': 1000})


*   4.4. Train test split

In [None]:
# Hold out 33% of the shuffled reviews for testing (fixed seed for a repeatable split).
train, test = train_test_split(
    documents,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Sentiment distribution for the train and test partitions
for partition in (train, test):
    print(Counter(label for _words, label in partition))

Counter({'neg': 674, 'pos': 666})
Counter({'pos': 334, 'neg': 326})


In [None]:
# Separate each (tokens, label) pair into inputs and targets; token lists are
# re-joined into single space-separated strings.
X_train = [' '.join(tokens) for tokens, _label in train]
y_train = [sentiment for _tokens, sentiment in train]
X_test = [' '.join(tokens) for tokens, _label in test]
y_test = [sentiment for _tokens, sentiment in test]

*   4.5. Text Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF features over purely alphabetic tokens, using min_df=10 to drop rare terms.
# The vocabulary is learned from the training reviews only and then reused
# unchanged for the test reviews.
tfidf_vec = TfidfVectorizer(
    token_pattern=r'[a-zA-Z]+',
    min_df=10,
)
X_train_bow = tfidf_vec.fit_transform(X_train)
X_test_bow = tfidf_vec.transform(X_test)

*   4.6. Apply **SVM** with **GridSearchCV**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from prettytable import PrettyTable

# Tune on 80% of the vectorized training reviews; the other part of this
# split is discarded.
X_train_small, _, y_train_small, _ = train_test_split(X_train_bow, y_train, test_size=0.2, random_state=42)

# `gamma` only influences the RBF kernel, so it is searched for 'rbf' alone;
# pairing it with 'linear' would refit the same linear model once per gamma
# value (9 RBF + 3 linear = 12 candidates instead of 18).
param_grid = [
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01]},
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
]

svm_grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring='accuracy',
    refit=True,
    cv=5,
    return_train_score=True,
    n_jobs=-1
)

svm_grid_search.fit(X_train_small, y_train_small)

# Macro-averaged metrics of the refitted best model on the test reviews.
y_pred = svm_grid_search.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

best_params = svm_grid_search.best_params_
best_score = svm_grid_search.best_score_

print("Best parameters for SVM:", best_params)

table = PrettyTable(["Classifier", "Best_Score", "Accuracy", "Precision", "Recall", "F1_Score"])
table.add_row(["SVM with GridSearchCV", best_score, accuracy, precision, recall, f1])
print(table)


Best parameters for SVM: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
+-----------------------+--------------------+--------------------+-------------------+--------------------+--------------------+
|       Classifier      |     Best_Score     |      Accuracy      |     Precision     |       Recall       |      F1_Score      |
+-----------------------+--------------------+--------------------+-------------------+--------------------+--------------------+
| SVM with GridSearchCV | 0.8563573136274722 | 0.8287878787878787 | 0.829531320539161 | 0.8284871973843724 | 0.8285797030829449 |
+-----------------------+--------------------+--------------------+-------------------+--------------------+--------------------+


*   4.7. Apply **RandomForest** with **GridSearchCV**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from prettytable import PrettyTable

# Tune on 80% of the vectorized training reviews; the other part of this
# split is discarded.
X_train_small, _, y_train_small, _ = train_test_split(X_train_bow, y_train, test_size=0.2, random_state=42)

# Random Forest candidates (4 x 3 x 3 x 3 = 108 combinations).
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9]
}

rf_grid_search = GridSearchCV(
    # Fixed seed: tree construction is randomized, so an unseeded forest can
    # select different hyperparameters and report different scores on each run.
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    refit=True,
    cv=5,
    return_train_score=True,
    n_jobs=-1
)

rf_grid_search.fit(X_train_small, y_train_small)

# Macro-averaged metrics of the refitted best model on the test reviews.
y_pred = rf_grid_search.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

best_params = rf_grid_search.best_params_
best_score = rf_grid_search.best_score_

print("Best parameters for RandomForest:", best_params)

table = PrettyTable(["Classifier", "Best_Score", "Accuracy", "Precision", "Recall", "F1_Score"])
table.add_row(["RandomForest with GridSearchCV", best_score, accuracy, precision, recall, f1])
print(table)


Best parameters for RandomForest: {'max_depth': 6, 'max_features': 'sqrt', 'max_leaf_nodes': 9, 'n_estimators': 150}
+--------------------------------+--------------------+--------------------+-------------------+-------------------+--------------------+
|           Classifier           |     Best_Score     |      Accuracy      |     Precision     |       Recall      |      F1_Score      |
+--------------------------------+--------------------+--------------------+-------------------+-------------------+--------------------+
| RandomForest with GridSearchCV | 0.8031514888067811 | 0.7712121212121212 | 0.773339455157637 | 0.770645824914588 | 0.7704908217054978 |
+--------------------------------+--------------------+--------------------+-------------------+-------------------+--------------------+


*   4.8. Apply **kNN** with **GridSearchCV**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from prettytable import PrettyTable

# Tune on 90% of the vectorized training reviews; the other part of this
# split is discarded.
X_train_small, _, y_train_small, _ = train_test_split(
    X_train_bow, y_train, test_size=0.1, random_state=42
)

# kNN candidates (6 x 2 x 3 = 36 combinations).
param_grid = dict(
    n_neighbors=[5, 7, 9, 11, 13, 15],
    weights=['uniform', 'distance'],
    metric=['minkowski', 'euclidean', 'manhattan'],
)

knn_grid_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=5,
    scoring='accuracy',
    refit=True,
    return_train_score=True,
    n_jobs=-1,
)
knn_grid_search.fit(X_train_small, y_train_small)

best_params = knn_grid_search.best_params_
best_score = knn_grid_search.best_score_

# Macro-averaged metrics of the refitted best model on the test reviews.
y_pred = knn_grid_search.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred)
macro_scores = [
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
]
precision, recall, f1 = macro_scores

print("Best parameters for kNN:", best_params)

table = PrettyTable(["Classifier", "Best_Score", "Accuracy", "Precision", "Recall", "F1_Score"])
table.add_row(["kNN with GridSearchCV", best_score, accuracy, precision, recall, f1])
print(table)


*   4.9. Apply **LogisticRegression** with **GridSearchCV**

In [None]:
#code
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from prettytable import PrettyTable

# Tune on 90% of the vectorized training reviews; the other part of this
# split is discarded.
X_train_small, _, y_train_small, _ = train_test_split(X_train_bow, y_train, test_size=0.1, random_state=42)

# Regularization strengths and solvers to compare (4 x 5 = 20 candidates).
param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
}

# Create a GridSearchCV object
log_reg_grid_search = GridSearchCV(
    # max_iter is raised above the default of 100 so the iterative solvers in
    # the grid get room to converge, and random_state is fixed because
    # 'liblinear', 'sag' and 'saga' shuffle the data; without a seed the
    # winning solver and its scores can change between runs.
    estimator=LogisticRegression(max_iter=1000, random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    refit=True,
    cv=5,
    return_train_score=True,
    n_jobs=-1
)

log_reg_grid_search.fit(X_train_small, y_train_small)

# Macro-averaged metrics of the refitted best model on the test reviews.
y_pred = log_reg_grid_search.predict(X_test_bow)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

best_params = log_reg_grid_search.best_params_
best_score = log_reg_grid_search.best_score_

print("Best parameters for Logistic Regression:", best_params)

table = PrettyTable(["Classifier", "Best_Score", "Accuracy", "Precision", "Recall", "F1_Score"])
table.add_row(["Logistic Regression with GridSearchCV", best_score, accuracy, precision, recall, f1])
print(table)


Best parameters for Logistic Regression: {'C': 100, 'solver': 'sag'}
+---------------------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|               Classifier              |     Best_Score     |      Accuracy      |     Precision      |       Recall       |      F1_Score      |
+---------------------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| Logistic Regression with GridSearchCV | 0.8582284558142727 | 0.8212121212121212 | 0.8224770134042317 | 0.8208184857279306 | 0.8208897556667648 |
+---------------------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+




*   4.10. Compare the best obtained results among classification algorithms (use PrettyTable to display the results)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from prettytable import PrettyTable
from sklearn.svm import SVC

# The per-model cells above all store their metrics in the same generic names
# (accuracy, precision, ...), so nothing model-specific survives to this cell.
# The fitted GridSearchCV objects do survive, so each row is recomputed from
# them instead of relying on per-model variables that were never defined.
fitted_searches = {
    "SVM": svm_grid_search,
    "RandomForest": rf_grid_search,
    "kNN": knn_grid_search,
    "Logistic Regression": log_reg_grid_search,
}

table = PrettyTable(["Classifier", "Best_Score", "Accuracy", "Precision", "Recall", "F1_Score"])

for model_name, search in fitted_searches.items():
    # Predict with the refitted best estimator on the shared test reviews and
    # report the same macro-averaged metrics as the individual cells.
    test_pred = search.predict(X_test_bow)
    table.add_row([
        model_name,
        search.best_score_,
        accuracy_score(y_test, test_pred),
        precision_score(y_test, test_pred, average='macro'),
        recall_score(y_test, test_pred, average='macro'),
        f1_score(y_test, test_pred, average='macro'),
    ])

print(table)


# Finally,
Save a copy in your GitHub repository. Remember to rename the notebook.