<a href="https://colab.research.google.com/github/Danny2611/LAB-ML/blob/master/Lab8_ML_21130584_LeQuocTrung.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The main aim of this lab is to deal with the **pipeline** technique and **MultilayerPerceptron** algorithm

*   **Deadline: 23:59, 06/5/2024**



# Import libraries

In [None]:
# code
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn import metrics
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import CategoricalNB, GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, KBinsDiscretizer, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from prettytable import PrettyTable

warnings.filterwarnings('ignore')
%pylab inline

Populating the interactive namespace from numpy and matplotlib


#Task 1. With **iris** dataset
*  Apply **pipeline** including preprocessing steps (i.e., **StandardScaler**, **SimpleImputer**, **feature selection**, **KBinsDiscretizer**, …) and classification algorithms (i.e., **Random forest, kNN, Naïve Bayes**).


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd


iris = load_iris()
X, y = iris.data, iris.target


models = {
    'RandomForest': RandomForestClassifier(),
    'kNN': KNeighborsClassifier(),
    'NaiveBayes': GaussianNB()
}

preprocessing = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy='mean')),
    ('feature_selection', SelectKBest(k=4)),
    ('discretizer', KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')),
    ('pca', PCA(n_components=2))
])


result_pipelines = my_pipeline(X, y, preprocessing, models)


results = []

for model_name, pipeline in result_pipelines.items():
    y_pred = pipeline.predict(X)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='weighted')
    recall = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')
    results.append([model_name, preprocessing.steps, accuracy, precision, recall, f1])


results_df = pd.DataFrame(results, columns=["Classifier", "Preprocessing Steps", "Accuracy", "Precision", "Recall", "F1_Score"])


print(results_df)


     Classifier                                Preprocessing Steps  Accuracy  \
0  RandomForest  [(scaler, StandardScaler()), (imputer, SimpleI...  0.973333   
1           kNN  [(scaler, StandardScaler()), (imputer, SimpleI...  0.960000   
2    NaiveBayes  [(scaler, StandardScaler()), (imputer, SimpleI...  0.966667   

   Precision    Recall  F1_Score  
0   0.973825  0.973333  0.973323  
1   0.960000  0.960000  0.960000  
2   0.966787  0.966667  0.966663  


#Task 2. With **fashion** dataset
*   2.1. Apply **MultilayerPerceptron** classification with 1 hidden layer
having 10 nodes

In [None]:
# code
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from prettytable import PrettyTable

def my_MLPClassifier(X_train, X_test, y_train, y_test, hidden_layer_sizes):

    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=hidden_layer_sizes, random_state=1)


    clf.fit(X_train, y_train)


    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    table = PrettyTable(["AlgoName", "Hidden layer sizes", "Accuracy", "Precision", "Recall", "F1_Score"])
    table.add_row(['MultilayerPerceptron', hidden_layer_sizes, accuracy, precision, recall, f1])

    return table


data_train = pd.read_csv('/content/sample_data/fashion_train.csv')
data_test = pd.read_csv('/content/sample_data/fashion_test.csv')

X_train = data_train.drop(columns=['y'])
y_train = data_train['y']
X_test = data_test.drop(columns=['y'])
y_test = data_test['y']

clf = my_MLPClassifier(X_train, X_test, y_train, y_test, (10,))
print(clf)


+----------------------+--------------------+----------+---------------------+---------------------+---------------------+
|       AlgoName       | Hidden layer sizes | Accuracy |      Precision      |        Recall       |       F1_Score      |
+----------------------+--------------------+----------+---------------------+---------------------+---------------------+
| MultilayerPerceptron |       (10,)        |  0.148   | 0.08899388048956083 | 0.16263736263736264 | 0.08783442218482124 |
+----------------------+--------------------+----------+---------------------+---------------------+---------------------+


*   2.2. Apply **MultilayerPerceptron** algorithm with the following settings (the first hidden layer has 250 neuron, the second one has 100 neurons).

In [None]:
# code
clf = my_MLPClassifier(X_train, X_test, y_train, y_test, (250, 100))
print(clf)

+----------------------+--------------------+----------+--------------------+-------------------+--------------------+
|       AlgoName       | Hidden layer sizes | Accuracy |     Precision      |       Recall      |      F1_Score      |
+----------------------+--------------------+----------+--------------------+-------------------+--------------------+
| MultilayerPerceptron |     (250, 100)     |  0.762   | 0.7600749070563724 | 0.759490035340887 | 0.7566185000787782 |
+----------------------+--------------------+----------+--------------------+-------------------+--------------------+


*   2.3. Find the best hyperparameters using **GridSearchCV**

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neural_network import MLPClassifier
from prettytable import PrettyTable

def myGridSearchCV(X_train, y_train, X_test, y_test, classifier, params):

    grid_search = GridSearchCV(
        estimator=classifier,
        param_grid=params,
        scoring='accuracy',
        refit=True,
        cv=10,
        return_train_score=True,
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)


    y_pred = grid_search.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')


    algoName = classifier.__class__.__name__

    best_params = grid_search.best_params_

    return algoName, best_params, accuracy, precision, recall, f1


data_train = pd.read_csv('/content/sample_data/fashion_train.csv', nrows=200)
data_test = pd.read_csv('/content/sample_data/fashion_test.csv', nrows=100)


X_train = data_train.drop(columns=['y'])
y_train = data_train['y']
X_test = data_test.drop(columns=['y'])
y_test = data_test['y']


param_grid = {
    'hidden_layer_sizes': [(150, 100, 50), (120, 80, 40), (100, 50, 30)],
    'max_iter': [50, 100, 150],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}


algoName, best_params, accuracy, precision, recall, f1 = myGridSearchCV(X_train, y_train, X_test, y_test, MLPClassifier(random_state=1), param_grid)

print(f"Classifier: {algoName}, Params: {best_params}")

table = PrettyTable(["Classifier with the best hyperparameters", "Accuracy", "Precision", "Recall", "F1_Score"])
table.add_row([algoName, accuracy, precision, recall, f1])
print(table)


Classifier: MLPClassifier, Params: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (150, 100, 50), 'learning_rate': 'constant', 'max_iter': 50, 'solver': 'adam'}
+------------------------------------------+----------+--------------------+--------------------+--------------------+
| Classifier with the best hyperparameters | Accuracy |     Precision      |       Recall       |      F1_Score      |
+------------------------------------------+----------+--------------------+--------------------+--------------------+
|              MLPClassifier               |   0.69   | 0.6855555555555555 | 0.6552913752913753 | 0.6619396759051932 |
+------------------------------------------+----------+--------------------+--------------------+--------------------+


*   2.4. Compare the **MultilayerPerceptron** using the best hyperparameters in 2.3 and other classification algorithms (i.e., Random forest, kNN, Naïve Bayes)  in termns of accuracy, precision, recall, and F1

In [None]:
 fashion_train = pd.read_csv("/content/sample_data/fashion_train.csv")
 fashion_test = pd.read_csv("/content/sample_data/fashion_test.csv")
 X_train = fashion_train.drop(columns="y")
 y_train = fashion_train["y"]
 X_test = fashion_test.drop(columns="y")
 y_test = fashion_test["y"]
 selector = SelectKBest(chi2, k=400)
 X_train_selected = selector.fit_transform(X_train, y_train)
 X_test_selected = selector.transform(X_test)
 mlp_classifier = MLPClassifier(hidden_layer_sizes=(250, 100), activation='relu', solver='adam', max_iter=1000)
 rf_classifier = RandomForestClassifier()
 knn_classifier = KNeighborsClassifier()
 nb_classifier = GaussianNB()
 mlp_classifier.fit(X_train_selected, y_train)
 mlp_y_pred = mlp_classifier.predict(X_test_selected)
 mlp_accuracy = accuracy_score(y_test, mlp_y_pred)
 mlp_precision = precision_score(y_test, mlp_y_pred, average="macro")
 mlp_recall = recall_score(y_test, mlp_y_pred, average="macro")
 mlp_f1 = f1_score(y_test, mlp_y_pred, average="macro")
 rf_classifier.fit(X_train_selected, y_train)
 rf_y_pred = rf_classifier.predict(X_test_selected)
 rf_accuracy = accuracy_score(y_test, rf_y_pred)
 rf_precision = precision_score(y_test, rf_y_pred, average="macro")
 rf_recall = recall_score(y_test, rf_y_pred, average="macro")
 rf_f1 = f1_score(y_test, rf_y_pred, average="macro")
 knn_classifier.fit(X_train_selected, y_train)
 knn_y_pred = knn_classifier.predict(X_test_selected)
 knn_accuracy = accuracy_score(y_test, knn_y_pred)
 knn_precision = precision_score(y_test, knn_y_pred, average="macro")
 knn_recall = recall_score(y_test, knn_y_pred, average="macro")
 knn_f1 = f1_score(y_test, knn_y_pred, average="macro")
 nb_classifier.fit(X_train_selected, y_train)
 nb_y_pred = nb_classifier.predict(X_test_selected)
 nb_accuracy = accuracy_score(y_test, nb_y_pred)
 nb_precision = precision_score(y_test, nb_y_pred, average="macro")
 nb_recall = recall_score(y_test, nb_y_pred, average="macro")
 nb_f1 = f1_score(y_test, nb_y_pred, average="macro")
 table = PrettyTable(["Algorithm", "Accuracy", "Precision", "Recall", "F1"])
 table.add_row(["MLP Classifier", mlp_accuracy, mlp_precision, mlp_recall, mlp_f1])
 table.add_row(["Random Forest", rf_accuracy, rf_precision, rf_recall, rf_f1])
 table.add_row(["k-NN", knn_accuracy, knn_precision, knn_recall, knn_f1])
 table.add_row(["Naïve Bayes", nb_accuracy, nb_precision, nb_recall, nb_f1])
 print(table)

+----------------+----------+--------------------+--------------------+--------------------+
|   Algorithm    | Accuracy |     Precision      |       Recall       |         F1         |
+----------------+----------+--------------------+--------------------+--------------------+
| MLP Classifier |  0.751   | 0.7514705690391223 | 0.7482445859801603 | 0.7480212785189385 |
| Random Forest  |  0.809   | 0.8016443835270527 | 0.8060159063001743 | 0.8005314796665488 |
|      k-NN      |  0.773   | 0.7764817768116947 | 0.7722676691083874 | 0.7650305741237794 |
|  Naïve Bayes   |  0.583   | 0.6004354166274228 | 0.5856646244149746 | 0.5492472873074626 |
+----------------+----------+--------------------+--------------------+--------------------+


#Task 3. With **breast cancer** dataset

*   3.1. Apply **GridSearchCV** to **MultilayperPerceptron** to find the best hyperparameters (the setting of hyperparameters chosen by students)

In [None]:
#code
data = load_breast_cancer()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'hidden_layer_sizes': [(100,), (100, 50), (50, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'max_iter': [1000, 2000]
}

grid_search = GridSearchCV(estimator=MLPClassifier(), param_grid=param_grid, n_jobs=-1, cv=3)
grid_search.fit(X_train, y_train)

best_estimator = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Estimator:")
print(best_estimator)
print("Best Hyperparameters:")
print(best_params)

Best Estimator:
MLPClassifier(activation='tanh', hidden_layer_sizes=(100, 50), max_iter=2000)
Best Hyperparameters:
{'activation': 'tanh', 'hidden_layer_sizes': (100, 50), 'max_iter': 2000, 'solver': 'adam'}


*   3.2. Compare the **MultilayerPerceptron** using the best hyperparameters in 3.1) and other classification algorithms (i.e., Random forest, kNN, Naïve Bayes)  in termns of accuracy, precision, recall, and F1

In [None]:
#code
best_mlp_params = {'hidden_layer_sizes': (100,), 'activation': 'relu', 'solver': 'adam', 'max_iter': 1000}
mlp_classifier = MLPClassifier(**best_mlp_params)
rf_classifier = RandomForestClassifier()
knn_classifier = KNeighborsClassifier()
nb_classifier = GaussianNB()

mlp_classifier.fit(X_train, y_train)
rf_classifier.fit(X_train, y_train)
knn_classifier.fit(X_train, y_train)
nb_classifier.fit(X_train, y_train)

mlp_y_pred = mlp_classifier.predict(X_test)
rf_y_pred = rf_classifier.predict(X_test)
knn_y_pred = knn_classifier.predict(X_test)
nb_y_pred = nb_classifier.predict(X_test)

mlp_accuracy = accuracy_score(y_test, mlp_y_pred)
mlp_precision = precision_score(y_test, mlp_y_pred)
mlp_recall = recall_score(y_test, mlp_y_pred)
mlp_f1 = f1_score(y_test, mlp_y_pred)

rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_precision = precision_score(y_test, rf_y_pred)
rf_recall = recall_score(y_test, rf_y_pred)
rf_f1 = f1_score(y_test, rf_y_pred)

knn_accuracy = accuracy_score(y_test, knn_y_pred)
knn_precision = precision_score(y_test, knn_y_pred)
knn_recall = recall_score(y_test, knn_y_pred)
knn_f1 = f1_score(y_test, knn_y_pred)

nb_accuracy = accuracy_score(y_test, nb_y_pred)
nb_precision = precision_score(y_test, nb_y_pred)
nb_recall = recall_score(y_test, nb_y_pred)
nb_f1 = f1_score(y_test, nb_y_pred)

results = pd.DataFrame({
    'Algorithm': ['MLP', 'Random Forest', 'kNN', 'Naïve Bayes'],
    'Accuracy': [mlp_accuracy, rf_accuracy, knn_accuracy, nb_accuracy],
    'Precision': [mlp_precision, rf_precision, knn_precision, nb_precision],
    'Recall': [mlp_recall, rf_recall, knn_recall, nb_recall],
    'F1_Score': [mlp_f1, rf_f1, knn_f1, nb_f1]
})

print(results)

       Algorithm  Accuracy  Precision    Recall  F1_Score
0            MLP  0.964912   0.958904  0.985915  0.972222
1  Random Forest  0.964912   0.958904  0.985915  0.972222
2            kNN  0.956140   0.934211  1.000000  0.965986
3    Naïve Bayes  0.973684   0.959459  1.000000  0.979310


#Task 4. With **mobile price classification** dataset


*   4.1. Build your own Neural Network using **MultilayerPerceptron**  



In [None]:
#code
mobile = pd.read_csv("/content/sample_data/mobile.csv")

X = mobile.drop(columns="price_range")
y = mobile["price_range"]

selector = SelectKBest(chi2, k=10)
X_selected = selector.fit_transform(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

myMLP = MLPClassifier(max_iter=10000, hidden_layer_sizes=(200, 100, 20))
myMLP.fit(X_train, y_train)

y_pred = myMLP.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
f1 = f1_score(y_test, y_pred, average="macro")

table = PrettyTable(["Algorithm", "Accuracy", "Precision", "Recall", "F1"])
table.add_row(["MLP Classifier", accuracy, precision, recall, f1])
print(table)

+----------------+----------+--------------------+--------------------+--------------------+
|   Algorithm    | Accuracy |     Precision      |       Recall       |         F1         |
+----------------+----------+--------------------+--------------------+--------------------+
| MLP Classifier |  0.5175  | 0.6978951229548137 | 0.5372924629718108 | 0.5045679104656468 |
+----------------+----------+--------------------+--------------------+--------------------+


*   4.2. Apply **GridSearchCV** to **MultilayperPerceptron** to find the best hyperparameters (the setting of hyperparameters chosen by students)

In [None]:
#code
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

mobile = pd.read_csv("/content/sample_data/mobile.csv")

X = mobile.drop(columns="price_range")
y = mobile["price_range"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'hidden_layer_sizes': [(100,), (100, 50), (50, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'max_iter': [1000, 2000]
}

mlp_classifier = MLPClassifier()

grid_search = GridSearchCV(estimator=mlp_classifier, param_grid=param_grid, cv=3)
grid_search.fit(X_train, y_train)

best_estimator = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Estimator:")
print(best_estimator)
print("Best Hyperparameters:")
print(best_params)

y_pred = best_estimator.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
f1 = f1_score(y_test, y_pred, average="macro")

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Best Estimator:
MLPClassifier(activation='tanh', hidden_layer_sizes=(100, 50), max_iter=1000)
Best Hyperparameters:
{'activation': 'tanh', 'hidden_layer_sizes': (100, 50), 'max_iter': 1000, 'solver': 'adam'}
Accuracy: 0.6575
Precision: 0.652854568287577
Recall: 0.6480953376333811
F1 Score: 0.6492847275804168


#Finally,
Save a copy in your Github. Remember renaming the notebook.