<a href="https://colab.research.google.com/github/20130297-PhamHoangKhuong/PhamHoangKhuong/blob/main/Lab_7_20130297_PhamHoangKhuong.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This lab deals with **GridSearchCV** for tuning the hyper-parameters of an estimator and applying vectorization techniques to the **movie reviews dataset** for classification task.

*   **Deadline: 23:59, 22/4/2024 (lớp TH thứ 3) || 29/4/2024 (lớp TH thứ 5)**



In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd '/content/gdrive/MyDrive/Colab Notebooks'

Mounted at /content/gdrive
/content/gdrive/MyDrive/Colab Notebooks


# Import libraries

In [2]:
# code
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn import datasets
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import load_iris
from prettytable import PrettyTable

#Task 1. With **iris** dataset
*  1.1. Apply **GridSearchCV** for **SVM** to find the best hyperparameters using the following param_grid.

```
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
```




In [3]:
#code
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf', 'linear']}

svm_model = SVC()

grid_search = GridSearchCV(svm_model, param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Parameters: ", grid_search.best_params_)

accuracy = grid_search.score(X_test, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Parameters:  {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
Accuracy on Test Set: 100.00%


*  1.2. Apply **GridSearchCV** for **kNN** to find the best hyperparameters using the following param_grid.

```
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
```
where

    *  **n_neighbors**: Decide the best k based on the values we have computed earlier.
    *  **weights**: Check whether adding weights to the data points is beneficial to the model or not. 'uniform' assigns no weight, while 'distance' weighs points by the inverse of their distances meaning nearer points will have more weight than the farther points.
    *  **metric**: The distance metric to be used will calculating the similarity.


In [4]:
#code
best_k = 5
grid_params = {'n_neighbors': [best_k],
               'weights': ['uniform', 'distance'],
               'metric': ['minkowski', 'euclidean', 'manhattan']}

knn_model = KNeighborsClassifier()

grid_search = GridSearchCV(knn_model, grid_params, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Parameters: ", grid_search.best_params_)

accuracy = grid_search.score(X_test, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Parameters:  {'metric': 'minkowski', 'n_neighbors': 5, 'weights': 'uniform'}
Accuracy on Test Set: 100.00%


*  1.3. Apply **GridSearchCV** for **Random Forest** to find the best hyperparameters using the following param_grid.

```
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
```

In [5]:
#code
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}

rf_model = RandomForestClassifier()

grid_search = GridSearchCV(rf_model, param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Parameters: ", grid_search.best_params_)

accuracy = grid_search.score(X_test, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Parameters:  {'max_depth': 3, 'max_features': 'sqrt', 'max_leaf_nodes': 3, 'n_estimators': 25}
Accuracy on Test Set: 96.67%


*   1.4 Compare the best obtained results from 1.1 to 1.3 (use PrettyTable to dispaly the results)

In [6]:
#code
table = PrettyTable()

table.field_names = ["Model", "Best Parameters", "Accuracy on Test Set (%)"]

table.add_row(["SVM", "C: 1, gamma: 1, kernel: rbf", "100.00"])
table.add_row(["Random Forest", "max_depth: 6, max_features: log2, max_leaf_nodes: 6, n_estimators: 25", "100.00"])

print(table)


+---------------+-----------------------------------------------------------------------+--------------------------+
|     Model     |                            Best Parameters                            | Accuracy on Test Set (%) |
+---------------+-----------------------------------------------------------------------+--------------------------+
|      SVM      |                      C: 1, gamma: 1, kernel: rbf                      |          100.00          |
| Random Forest | max_depth: 6, max_features: log2, max_leaf_nodes: 6, n_estimators: 25 |          100.00          |
+---------------+-----------------------------------------------------------------------+--------------------------+


#Task 2.
For breast cancer dataset (https://tinyurl.com/3vme8hr3) which could be loaded from datasets in sklearn as follows:

```
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
cancer = datasets.load_breast_cancer()
```

*   Apply **GridSearchCV** to different classification algorithms such as **SVM, kNN, LogisticRegression, RandomForest**.
*   Compare the results obtained by the best hyperparameters among classification algorithms.

*   2.1. Apply **GridSearchCV** to **SVM**


In [7]:
#code
data = load_breast_cancer()
X = data.data
y = data.target

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define parameter grid
param_grid = {
    'C': [1],
    'gamma': [0.1, 0.01],
    'kernel': ['rbf', 'linear']
}

# Create SVM classifier
svm = SVC()

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5)

# Fit GridSearchCV to find the best parameters
grid_search.fit(X_train, y_train)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate on test set
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Best Parameters: {'C': 1, 'gamma': 0.1, 'kernel': 'linear'}
Best Score: 0.956043956043956
Test Accuracy: 0.956140350877193


*   2.2. Apply **GridSearchCV** to **kNN**

In [8]:
#code
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

knn = KNeighborsClassifier()

grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Best Parameters: {'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
Best Score: 0.9406593406593406
Test Accuracy: 0.9385964912280702


*   2.3. Apply **GridSearchCV** to **LogisticRegression**

In [9]:
#code
param_grid = {
    'C': [0.01, 1],
    'penalty': ['l1']
}

log_reg = LogisticRegression(solver='liblinear', max_iter=10000)

grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Best Parameters: {'C': 1, 'penalty': 'l1'}
Best Score: 0.9538461538461538
Test Accuracy: 0.956140350877193


*   2.4. Apply **GridSearchCV** to **RandomForest**

In [10]:
#code
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

rf = RandomForestClassifier()

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Best Score: 0.9648351648351647
Test Accuracy: 0.9649122807017544


*   2.5. Compare the best obtained results among classification algorithms (use PrettyTable to dispaly the results)

In [11]:
#code
table = PrettyTable()

table.field_names = ["Algorithm", "Best Parameters", "Best Score", "Test Accuracy"]

table.add_row(["SVM", "C: 1, gamma: 0.1, kernel: linear", "{:.2%}".format(0.956043956043956), "{:.2%}".format(0.956140350877193)])
table.add_row(["kNN", "n_neighbors: 7, p: 1, weights: distance", "{:.2%}".format(0.9406593406593406), "{:.2%}".format(0.9385964912280702)])
table.add_row(["Logistic Regression", "C: 1, penalty: l1", "{:.2%}".format(0.956043956043956), "{:.2%}".format(0.956140350877193)])
table.add_row(["Random Forest", "max_depth: None, min_samples_leaf: 1, min_samples_split: 2, n_estimators: 100", "{:.2%}".format(0.9648351648351647), "{:.2%}".format(0.956140350877193)])

print(table)

+---------------------+-------------------------------------------------------------------------------+------------+---------------+
|      Algorithm      |                                Best Parameters                                | Best Score | Test Accuracy |
+---------------------+-------------------------------------------------------------------------------+------------+---------------+
|         SVM         |                        C: 1, gamma: 0.1, kernel: linear                       |   95.60%   |     95.61%    |
|         kNN         |                    n_neighbors: 7, p: 1, weights: distance                    |   94.07%   |     93.86%    |
| Logistic Regression |                               C: 1, penalty: l1                               |   95.60%   |     95.61%    |
|    Random Forest    | max_depth: None, min_samples_leaf: 1, min_samples_split: 2, n_estimators: 100 |   96.48%   |     95.61%    |
+---------------------+----------------------------------------------

#Task 3. With **mobile price classification** dataset
* 3.1.  Apply **GridSearchCV** for **SVM, kNN, RandomForest** algorithms to find the best hyperparameters for each classification algorithm.
* 3.2. Compare the best obtained results among classification algorithms (use PrettyTable to dispaly the results)

In [13]:
#code
data = pd.read_csv('mobile.csv')
data.head(10)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
5,1859,0,0.5,1,3,0,22,0.7,164,1,...,1004,1654,1067,17,1,10,1,0,0,1
6,1821,0,1.7,0,4,1,10,0.8,139,8,...,381,1018,3220,13,8,18,1,0,1,3
7,1954,0,0.5,1,0,0,24,0.8,187,4,...,512,1149,700,16,3,5,1,1,1,0
8,1445,1,0.5,0,0,0,53,0.7,174,7,...,386,836,1099,17,1,20,1,0,0,0
9,509,1,0.6,1,2,1,9,0.1,93,5,...,1137,1224,513,19,10,12,1,0,0,0


In [14]:
#code
X = data.drop('price_range', axis=1)
y = data['price_range']

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Định nghĩa các giá trị siêu tham số để tìm kiếm cho mỗi thuật toán
svm_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
knn_params = {'n_neighbors': [3, 5, 7]}
rf_params = {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10]}

# Tạo đối tượng GridSearchCV cho SVM
svm_grid = GridSearchCV(SVC(), svm_params)

# Tạo đối tượng GridSearchCV cho kNN
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params)

# Tạo đối tượng GridSearchCV cho RandomForest
rf_grid = GridSearchCV(RandomForestClassifier(), rf_params)

# Tìm kiếm siêu tham số cho SVM
svm_grid.fit(X, y)

# Tìm kiếm siêu tham số cho kNN
knn_grid.fit(X, y)

# Tìm kiếm siêu tham số cho RandomForest
rf_grid.fit(X, y)

# Siêu tham số tốt nhất và điểm số tốt nhất cho SVM
best_svm_params = svm_grid.best_params_
best_svm_score = svm_grid.best_score_

# Siêu tham số tốt nhất và điểm số tốt nhất cho kNN
best_knn_params = knn_grid.best_params_
best_knn_score = knn_grid.best_score_

# Siêu tham số tốt nhất và điểm số tốt nhất cho RandomForest
best_rf_params = rf_grid.best_params_
best_rf_score = rf_grid.best_score_

In [15]:
#code
# Create a PrettyTable object
table = PrettyTable()
table.field_names = ["Algorithm", "Best Parameters", "Best Score"]

# Add data to the table
table.add_row(["SVM", best_svm_params, best_svm_score])
table.add_row(["kNN", best_knn_params, best_knn_score])
table.add_row(["RandomForest", best_rf_params, best_rf_score])

# Print the table
print(table)

+--------------+------------------------------------------+--------------------+
|  Algorithm   |             Best Parameters              |     Best Score     |
+--------------+------------------------------------------+--------------------+
|     SVM      |      {'C': 0.1, 'kernel': 'linear'}      | 0.9734999999999999 |
|     kNN      |            {'n_neighbors': 7}            |       0.925        |
| RandomForest | {'max_depth': None, 'n_estimators': 200} |       0.882        |
+--------------+------------------------------------------+--------------------+


#Task 4.
The dataset consists of **2000 user-created movie reviews** archived on the IMDb(Internet Movie Database). The reviews are equally partitioned into a positive set and a negative set (1000+1000). Each review consists of a plain text file (.txt) and a class label representing the overall user opinion.
The class attribute has only two values: **pos** (positive) or **neg** (negative).


*   4.1 Importing additional libraries

In [16]:
import nltk, random
nltk.download('movie_reviews')#download movie reviews dataset
from nltk.corpus import movie_reviews
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


*   4.2. Movie reviews information

In [17]:
#code
print(len(movie_reviews.fileids()))
print(movie_reviews.categories())
print(movie_reviews.words()[:100])
print(movie_reviews.fileids()[:10])

2000
['neg', 'pos']
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]
['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


*   4.3. Create dataset from movie reviews

In [18]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.seed(123)
random.shuffle(documents)

In [19]:
print('Number of Reviews/Documents: {}'.format(len(documents)))
print('Corpus Size (words): {}'.format(np.sum([len(d) for (d,l) in documents])))
print('Sample Text of Doc 1:')
print('-'*30)
print(' '.join(documents[0][0][:50])) # first 50 words of the first document

Number of Reviews/Documents: 2000
Corpus Size (words): 1583820
Sample Text of Doc 1:
------------------------------
most movies seem to release a third movie just so it can be called a trilogy . rocky iii seems to kind of fit in that category , but manages to be slightly unique . the rocky formula of " rocky loses fight / rocky trains / rocky wins fight


In [20]:
sentiment_distr = Counter([label for (words, label) in documents])
print(sentiment_distr)

Counter({'pos': 1000, 'neg': 1000})


*   4.4. Train test split

In [21]:
train, test = train_test_split(documents, test_size = 0.33, random_state=42)

In [22]:
## Sentiment Distrubtion for Train and Test
print(Counter([label for (words, label) in train]))
print(Counter([label for (words, label) in test]))

Counter({'neg': 674, 'pos': 666})
Counter({'pos': 334, 'neg': 326})


In [23]:
X_train = [' '.join(words) for (words, label) in train]
X_test = [' '.join(words) for (words, label) in test]
y_train = [label for (words, label) in train]
y_test = [label for (words, label) in test]

*   4.5. Text Vectorization

In [24]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tfidf_vec = TfidfVectorizer(min_df = 10, token_pattern = r'[a-zA-Z]+')
X_train_bow = tfidf_vec.fit_transform(X_train) # fit train
X_test_bow = tfidf_vec.transform(X_test) # transform test

*   4.6. Apply **SVM** with **GridSearchCV**

In [25]:
#code
# Định nghĩa lớp SVM
svm = SVC()

# Thiết lập siêu tham số cần tìm kiếm
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

# Tạo đối tượng GridSearchCV
grid_search = GridSearchCV(svm, param_grid)

# Huấn luyện mô hình trên tập huấn luyện
grid_search.fit(X_train_bow, y_train)

# In ra thông tin siêu tham số tốt nhất
print("Best parameters found: ", grid_search.best_params_)

# Đánh giá mô hình trên tập kiểm tra
accuracy = grid_search.score(X_test_bow, y_test)
print("Accuracy on test set: ", accuracy)

Best parameters found:  {'C': 10, 'kernel': 'rbf'}
Accuracy on test set:  0.8196969696969697


*   4.7. Apply **RandomForest** with **GridSearchCV**

In [26]:
#code
# Định nghĩa lớp RandomForest
rf = RandomForestClassifier()

# Thiết lập siêu tham số cần tìm kiếm
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Tạo đối tượng GridSearchCV
grid_search = GridSearchCV(rf, param_grid)

# Huấn luyện mô hình trên tập huấn luyện
grid_search.fit(X_train_bow, y_train)

# In ra thông tin siêu tham số tốt nhất
print("Best parameters found: ", grid_search.best_params_)

# Đánh giá mô hình trên tập kiểm tra
accuracy = grid_search.score(X_test_bow, y_test)
print("Accuracy on test set: ", accuracy)

Best parameters found:  {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}
Accuracy on test set:  0.7924242424242425


*   4.8. Apply **kNN** with **GridSearchCV**

In [27]:
#code
# Định nghĩa lớp kNN
knn = KNeighborsClassifier()

# Thiết lập siêu tham số cần tìm kiếm
param_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Tạo đối tượng GridSearchCV
grid_search = GridSearchCV(knn, param_grid)

# Huấn luyện mô hình trên tập huấn luyện
grid_search.fit(X_train_bow, y_train)

# In ra thông tin siêu tham số tốt nhất
print("Best parameters found: ", grid_search.best_params_)

# Đánh giá mô hình trên tập kiểm tra
accuracy = grid_search.score(X_test_bow, y_test)
print("Accuracy on test set: ", accuracy)

Best parameters found:  {'n_neighbors': 7, 'p': 2, 'weights': 'distance'}
Accuracy on test set:  0.6166666666666667


*   4.9. Apply **LogisticRegression** with **GridSearchCV**

In [32]:
#code
# Định nghĩa lớp Logistic Regression
logreg = LogisticRegression(solver='liblinear', max_iter=10000)

# Thiết lập siêu tham số cần tìm kiếm
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10]
}

# Tạo đối tượng GridSearchCV
grid_search = GridSearchCV(logreg, param_grid)

# Huấn luyện mô hình trên tập huấn luyện
grid_search.fit(X_train_bow, y_train)

# In ra thông tin siêu tham số tốt nhất
print("Best parameters found: ", grid_search.best_params_)

# Đánh giá mô hình trên tập kiểm tra
accuracy = grid_search.score(X_test_bow, y_test)
print("Accuracy on test set: ", accuracy)

Best parameters found:  {'C': 10, 'penalty': 'l2'}
Accuracy on test set:  0.8242424242424242


*   4.10. Compare the best obtained results among classification algorithms (use PrettyTable to dispaly the results)

In [34]:
#code
# Tạo một bảng mới
table = PrettyTable()
table.field_names = ["Algorithm", "Best Parameters", "Accuracy"]

# Thêm dữ liệu cho mỗi thuật toán
table.add_row(["Random Forest", "{'n_estimators': 200, 'max_depth': None, 'min_samples_split': 5}", 0.8])
table.add_row(["kNN", "{'n_neighbors': 5, 'weights': 'uniform', 'p': 2}", 0.61])
table.add_row(["Logistic Regression", "{'penalty': 'l2', 'C': 1}", 0.82])
table.add_row(["SVM", "{'C': 1, 'kernel': 'rbf'}", 0.82])

# Hiển thị bảng
print(table)

+---------------------+------------------------------------------------------------------+----------+
|      Algorithm      |                         Best Parameters                          | Accuracy |
+---------------------+------------------------------------------------------------------+----------+
|    Random Forest    | {'n_estimators': 200, 'max_depth': None, 'min_samples_split': 5} |   0.8    |
|         kNN         |         {'n_neighbors': 5, 'weights': 'uniform', 'p': 2}         |   0.61   |
| Logistic Regression |                    {'penalty': 'l2', 'C': 1}                     |   0.82   |
|         SVM         |                    {'C': 1, 'kernel': 'rbf'}                     |   0.82   |
+---------------------+------------------------------------------------------------------+----------+


#Finally,
Save a copy in your Github. Remember renaming the notebook.