# AI Capstone: Project 1
## *with a self-made dataset*
Author: 0816066 官澔恩

Data Source: [BBC News](https://www.bbc.com/news)

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, Markdown

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, ParameterGrid
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Data Preprocessing

In [2]:
data_dir = '../input/collection-of-bbc-news'
cats = ['business', 'entertainment-arts', 'science-environment', 'technology']

data = []
labels = []

for cat_idx, cat in enumerate(cats):
    cat_dir = f'{ data_dir }/{ cat }'
    for file_name in os.listdir(cat_dir):
        with open(f'{ cat_dir }/{ file_name }') as file:
            article = file.read()
            data.append(article)
            labels.append(cat_idx)

In [3]:
test_sizes = [0.2, 0.3]

# split data into training set and testing set based on the proportion of testing set
datasets = [ train_test_split(data, np.array(labels), test_size=test_size)
             for test_size in test_sizes ]

In [4]:
vectorizer = TfidfVectorizer(stop_words='english')

for idx, dataset in enumerate(datasets):
    X_train, X_test, y_train, y_test = dataset
    X_train = vectorizer.fit_transform(X_train).toarray()
    X_test = vectorizer.transform(X_test).toarray()
    datasets[idx] = X_train, X_test, y_train, y_test

# Models

In [5]:
# make a specified model with desired parameters
def get_model(model_type, params):
    if model_type == 'knn':
        return KNeighborsClassifier(**params)
    elif model_type == 'rf':
        return RandomForestClassifier(criterion='gini', **params)
    elif model_type == 'svm':
        return SVC(kernel='rbf', **params)
    elif model_type == 'mlp':
        return MLPClassifier(**params)
    else:
        return None

# Validation & Results

In [6]:
# display a confusion matrix and the classification report
def show_performance(y_true, y_pred):
    c_matrix = confusion_matrix(y_true, y_pred)
    c_table = pd.DataFrame(c_matrix)
    c_table.columns.name = 'truth\pred'
    display(c_table)
    
    report = classification_report(y_true, y_pred)
    print(report)

In [7]:
def show_cross_validate_report(res):
    report = pd.DataFrame({
        'fit_time': res['fit_time'],
        'score_time': res['score_time'],
        'test_score': res['test_score'],
    })
    display(report)

In [8]:
# train a model with 5-fold cross validation and validate the best model with the testing set
def train_model(model_type, param_grid, datasets):
    for test_size, dataset in zip(test_sizes, datasets):
        display(Markdown(f'### Test size: { test_size }'))
        X_train, X_test, y_train, y_test = dataset

        for params in param_grid:
            display(Markdown(f'#### { params }'))
            model = get_model(model_type, params)
            res = cross_validate(model, X_train, y_train, return_estimator=True)

            display(Markdown('#### Training Performance:'))
            show_cross_validate_report(res)

            best_model = res['estimator'][res['test_score'].argmax()]
            y_pred = best_model.predict(X_test)

            display(Markdown('#### Testing Performance:'))
            show_performance(y_test, y_pred)

## KNN

In [9]:
param_grid = ParameterGrid({
    'n_neighbors': [5, 10, 15]
})
train_model('knn', param_grid, datasets)

### Test size: 0.2

#### {'n_neighbors': 5}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.010232,0.01431,0.84375
1,0.012789,0.010397,0.78125
2,0.009912,0.010636,0.8125
3,0.011699,0.010509,0.90625
4,0.012049,0.010665,0.75


#### Testing Performance:

truth\pred,0,1,2,3
0,7,1,0,2
1,2,9,0,0
2,0,0,9,0
3,1,0,1,8


              precision    recall  f1-score   support

           0       0.70      0.70      0.70        10
           1       0.90      0.82      0.86        11
           2       0.90      1.00      0.95         9
           3       0.80      0.80      0.80        10

    accuracy                           0.82        40
   macro avg       0.82      0.83      0.83        40
weighted avg       0.82      0.82      0.82        40



#### {'n_neighbors': 10}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.011684,0.01079,0.84375
1,0.009975,0.010558,0.78125
2,0.012353,0.011264,0.75
3,0.012121,0.0108,0.875
4,0.011892,0.010786,0.78125


#### Testing Performance:

truth\pred,0,1,2,3
0,7,1,0,2
1,3,8,0,0
2,0,0,9,0
3,3,0,1,6


              precision    recall  f1-score   support

           0       0.54      0.70      0.61        10
           1       0.89      0.73      0.80        11
           2       0.90      1.00      0.95         9
           3       0.75      0.60      0.67        10

    accuracy                           0.75        40
   macro avg       0.77      0.76      0.76        40
weighted avg       0.77      0.75      0.75        40



#### {'n_neighbors': 15}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.012788,0.011695,0.84375
1,0.01163,0.011082,0.71875
2,0.011158,0.010861,0.75
3,0.011327,0.010649,0.8125
4,0.012255,0.010653,0.8125


#### Testing Performance:

truth\pred,0,1,2,3
0,9,1,0,0
1,3,8,0,0
2,0,0,9,0
3,1,0,2,7


              precision    recall  f1-score   support

           0       0.69      0.90      0.78        10
           1       0.89      0.73      0.80        11
           2       0.82      1.00      0.90         9
           3       1.00      0.70      0.82        10

    accuracy                           0.82        40
   macro avg       0.85      0.83      0.83        40
weighted avg       0.85      0.82      0.82        40



### Test size: 0.3

#### {'n_neighbors': 5}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.010688,0.009434,0.821429
1,0.009589,0.009248,0.75
2,0.00961,0.009314,0.785714
3,0.009642,0.009324,0.75
4,0.009639,0.009023,0.821429


#### Testing Performance:

truth\pred,0,1,2,3
0,19,0,0,3
1,0,16,0,0
2,0,0,9,0
3,5,0,1,7


              precision    recall  f1-score   support

           0       0.79      0.86      0.83        22
           1       1.00      1.00      1.00        16
           2       0.90      1.00      0.95         9
           3       0.70      0.54      0.61        13

    accuracy                           0.85        60
   macro avg       0.85      0.85      0.85        60
weighted avg       0.84      0.85      0.84        60



#### {'n_neighbors': 10}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.005807,0.00947,0.75
1,0.005712,0.009547,0.821429
2,0.005584,0.009292,0.75
3,0.005955,0.009424,0.75
4,0.00545,0.00978,0.75


#### Testing Performance:

truth\pred,0,1,2,3
0,19,0,1,2
1,0,16,0,0
2,0,0,9,0
3,3,0,1,9


              precision    recall  f1-score   support

           0       0.86      0.86      0.86        22
           1       1.00      1.00      1.00        16
           2       0.82      1.00      0.90         9
           3       0.82      0.69      0.75        13

    accuracy                           0.88        60
   macro avg       0.88      0.89      0.88        60
weighted avg       0.88      0.88      0.88        60



#### {'n_neighbors': 15}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.00974,0.008871,0.714286
1,0.009498,0.009081,0.821429
2,0.009432,0.008934,0.785714
3,0.009443,0.009038,0.714286
4,0.009315,0.008951,0.785714


#### Testing Performance:

truth\pred,0,1,2,3
0,19,0,0,3
1,0,16,0,0
2,0,0,9,0
3,3,1,1,8


              precision    recall  f1-score   support

           0       0.86      0.86      0.86        22
           1       0.94      1.00      0.97        16
           2       0.90      1.00      0.95         9
           3       0.73      0.62      0.67        13

    accuracy                           0.87        60
   macro avg       0.86      0.87      0.86        60
weighted avg       0.86      0.87      0.86        60



## Random Forest

In [10]:
param_grid = ParameterGrid({
    'min_samples_leaf': [1, 5, 10]
})
train_model('rf', param_grid, datasets)

### Test size: 0.2

#### {'min_samples_leaf': 1}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.276639,0.012397,0.78125
1,0.259322,0.011573,0.75
2,0.2604,0.0118,0.9375
3,0.262405,0.011333,0.9375
4,0.265674,0.011716,0.875


#### Testing Performance:

truth\pred,0,1,2,3
0,6,1,0,3
1,1,10,0,0
2,0,0,9,0
3,0,0,0,10


              precision    recall  f1-score   support

           0       0.86      0.60      0.71        10
           1       0.91      0.91      0.91        11
           2       1.00      1.00      1.00         9
           3       0.77      1.00      0.87        10

    accuracy                           0.88        40
   macro avg       0.88      0.88      0.87        40
weighted avg       0.88      0.88      0.87        40



#### {'min_samples_leaf': 5}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.220069,0.011112,0.84375
1,0.21719,0.011283,0.8125
2,0.21475,0.011189,0.84375
3,0.224544,0.011607,0.96875
4,0.222323,0.011112,0.75


#### Testing Performance:

truth\pred,0,1,2,3
0,5,2,0,3
1,0,11,0,0
2,0,0,9,0
3,1,2,1,6


              precision    recall  f1-score   support

           0       0.83      0.50      0.62        10
           1       0.73      1.00      0.85        11
           2       0.90      1.00      0.95         9
           3       0.67      0.60      0.63        10

    accuracy                           0.78        40
   macro avg       0.78      0.78      0.76        40
weighted avg       0.78      0.78      0.76        40



#### {'min_samples_leaf': 10}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.205915,0.010879,0.6875
1,0.214118,0.011292,0.65625
2,0.211463,0.011906,0.6875
3,0.198364,0.011231,0.78125
4,0.198464,0.011458,0.8125


#### Testing Performance:

truth\pred,0,1,2,3
0,4,3,0,3
1,0,10,0,1
2,0,0,9,0
3,0,3,0,7


              precision    recall  f1-score   support

           0       1.00      0.40      0.57        10
           1       0.62      0.91      0.74        11
           2       1.00      1.00      1.00         9
           3       0.64      0.70      0.67        10

    accuracy                           0.75        40
   macro avg       0.82      0.75      0.74        40
weighted avg       0.81      0.75      0.74        40



### Test size: 0.3

#### {'min_samples_leaf': 1}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.237412,0.011259,0.785714
1,0.238068,0.011496,0.892857
2,0.240523,0.012182,0.821429
3,0.241796,0.011094,0.821429
4,0.233787,0.010994,0.892857


#### Testing Performance:

truth\pred,0,1,2,3
0,11,0,1,10
1,0,14,1,1
2,0,0,9,0
3,0,0,1,12


              precision    recall  f1-score   support

           0       1.00      0.50      0.67        22
           1       1.00      0.88      0.93        16
           2       0.75      1.00      0.86         9
           3       0.52      0.92      0.67        13

    accuracy                           0.77        60
   macro avg       0.82      0.82      0.78        60
weighted avg       0.86      0.77      0.77        60



#### {'min_samples_leaf': 5}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.209666,0.010989,0.785714
1,0.209574,0.011204,0.857143
2,0.220256,0.013679,0.785714
3,0.217029,0.010942,0.642857
4,0.2054,0.010777,0.892857


#### Testing Performance:

truth\pred,0,1,2,3
0,13,2,1,6
1,0,12,2,2
2,0,0,9,0
3,0,0,3,10


              precision    recall  f1-score   support

           0       1.00      0.59      0.74        22
           1       0.86      0.75      0.80        16
           2       0.60      1.00      0.75         9
           3       0.56      0.77      0.65        13

    accuracy                           0.73        60
   macro avg       0.75      0.78      0.73        60
weighted avg       0.81      0.73      0.74        60



#### {'min_samples_leaf': 10}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.19289,0.011374,0.5
1,0.190844,0.010893,0.642857
2,0.188658,0.010741,0.428571
3,0.190086,0.010827,0.642857
4,0.19384,0.012113,0.714286


#### Testing Performance:

truth\pred,0,1,2,3
0,0,7,3,12
1,0,13,2,1
2,0,0,9,0
3,0,2,4,7


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.59      0.81      0.68        16
           2       0.50      1.00      0.67         9
           3       0.35      0.54      0.42        13

    accuracy                           0.48        60
   macro avg       0.36      0.59      0.44        60
weighted avg       0.31      0.48      0.37        60



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM

In [11]:
param_grid = ParameterGrid({
    'C': [1, 5, 10]
})
train_model('svm', param_grid, datasets)

### Test size: 0.2

#### {'C': 1}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.132141,0.079984,0.875
1,0.132724,0.078518,0.8125
2,0.135332,0.078091,0.84375
3,0.131445,0.083116,0.875
4,0.135578,0.084623,0.75


#### Testing Performance:

truth\pred,0,1,2,3
0,4,2,0,4
1,0,11,0,0
2,0,0,9,0
3,0,1,2,7


              precision    recall  f1-score   support

           0       1.00      0.40      0.57        10
           1       0.79      1.00      0.88        11
           2       0.82      1.00      0.90         9
           3       0.64      0.70      0.67        10

    accuracy                           0.78        40
   macro avg       0.81      0.77      0.75        40
weighted avg       0.81      0.78      0.75        40



#### {'C': 5}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.132529,0.079252,0.875
1,0.134012,0.07888,0.84375
2,0.133669,0.080154,0.84375
3,0.133782,0.076736,0.96875
4,0.134137,0.080883,0.875


#### Testing Performance:

truth\pred,0,1,2,3
0,7,1,0,2
1,0,11,0,0
2,0,0,9,0
3,0,0,1,9


              precision    recall  f1-score   support

           0       1.00      0.70      0.82        10
           1       0.92      1.00      0.96        11
           2       0.90      1.00      0.95         9
           3       0.82      0.90      0.86        10

    accuracy                           0.90        40
   macro avg       0.91      0.90      0.90        40
weighted avg       0.91      0.90      0.90        40



#### {'C': 10}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.130995,0.079677,0.875
1,0.133357,0.07622,0.84375
2,0.131062,0.074838,0.84375
3,0.12868,0.081359,0.96875
4,0.131736,0.082345,0.875


#### Testing Performance:

truth\pred,0,1,2,3
0,7,1,0,2
1,0,11,0,0
2,0,0,9,0
3,0,0,1,9


              precision    recall  f1-score   support

           0       1.00      0.70      0.82        10
           1       0.92      1.00      0.96        11
           2       0.90      1.00      0.95         9
           3       0.82      0.90      0.86        10

    accuracy                           0.90        40
   macro avg       0.91      0.90      0.90        40
weighted avg       0.91      0.90      0.90        40



### Test size: 0.3

#### {'C': 1}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.112598,0.027415,0.607143
1,0.088784,0.027185,0.857143
2,0.086754,0.027898,0.75
3,0.087862,0.027182,0.678571
4,0.088337,0.028805,0.785714


#### Testing Performance:

truth\pred,0,1,2,3
0,8,1,5,8
1,0,15,0,1
2,0,0,9,0
3,0,1,3,9


              precision    recall  f1-score   support

           0       1.00      0.36      0.53        22
           1       0.88      0.94      0.91        16
           2       0.53      1.00      0.69         9
           3       0.50      0.69      0.58        13

    accuracy                           0.68        60
   macro avg       0.73      0.75      0.68        60
weighted avg       0.79      0.68      0.67        60



#### {'C': 5}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.087439,0.02757,0.75
1,0.0857,0.027109,0.857143
2,0.0841,0.027051,0.75
3,0.08465,0.027746,0.714286
4,0.084424,0.027408,0.821429


#### Testing Performance:

truth\pred,0,1,2,3
0,14,1,1,6
1,0,16,0,0
2,0,0,9,0
3,0,1,2,10


              precision    recall  f1-score   support

           0       1.00      0.64      0.78        22
           1       0.89      1.00      0.94        16
           2       0.75      1.00      0.86         9
           3       0.62      0.77      0.69        13

    accuracy                           0.82        60
   macro avg       0.82      0.85      0.82        60
weighted avg       0.85      0.82      0.81        60



#### {'C': 10}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,0.086067,0.027058,0.75
1,0.088209,0.028067,0.857143
2,0.084602,0.027769,0.75
3,0.085048,0.026573,0.714286
4,0.084978,0.027117,0.821429


#### Testing Performance:

truth\pred,0,1,2,3
0,14,1,1,6
1,0,16,0,0
2,0,0,9,0
3,0,1,2,10


              precision    recall  f1-score   support

           0       1.00      0.64      0.78        22
           1       0.89      1.00      0.94        16
           2       0.75      1.00      0.86         9
           3       0.62      0.77      0.69        13

    accuracy                           0.82        60
   macro avg       0.82      0.85      0.82        60
weighted avg       0.85      0.82      0.81        60



## MLP

In [12]:
param_grid = ParameterGrid({
    'hidden_layer_sizes': [256, 512, 1024]
})
train_model('mlp', param_grid, datasets)

### Test size: 0.2

#### {'hidden_layer_sizes': 256}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,11.662338,0.007427,0.875
1,12.324114,0.007385,0.9375
2,11.536946,0.007498,0.875
3,11.541674,0.007352,0.90625
4,11.544497,0.007532,0.90625


#### Testing Performance:

truth\pred,0,1,2,3
0,7,1,0,2
1,0,11,0,0
2,0,0,9,0
3,1,0,0,9


              precision    recall  f1-score   support

           0       0.88      0.70      0.78        10
           1       0.92      1.00      0.96        11
           2       1.00      1.00      1.00         9
           3       0.82      0.90      0.86        10

    accuracy                           0.90        40
   macro avg       0.90      0.90      0.90        40
weighted avg       0.90      0.90      0.90        40



#### {'hidden_layer_sizes': 512}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,17.432833,0.012781,0.875
1,17.573163,0.013076,0.9375
2,17.539508,0.012541,0.875
3,18.089976,0.012536,0.90625
4,18.028255,0.012611,0.90625


#### Testing Performance:

truth\pred,0,1,2,3
0,7,1,0,2
1,0,11,0,0
2,0,0,9,0
3,1,2,1,6


              precision    recall  f1-score   support

           0       0.88      0.70      0.78        10
           1       0.79      1.00      0.88        11
           2       0.90      1.00      0.95         9
           3       0.75      0.60      0.67        10

    accuracy                           0.82        40
   macro avg       0.83      0.83      0.82        40
weighted avg       0.82      0.82      0.82        40



#### {'hidden_layer_sizes': 1024}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,22.945317,0.02606,0.875
1,24.498969,0.02594,0.9375
2,24.189779,0.025495,0.875
3,24.035858,0.025267,0.90625
4,24.491957,0.030793,0.90625


#### Testing Performance:

truth\pred,0,1,2,3
0,7,1,0,2
1,0,11,0,0
2,0,0,9,0
3,1,2,1,6


              precision    recall  f1-score   support

           0       0.88      0.70      0.78        10
           1       0.79      1.00      0.88        11
           2       0.90      1.00      0.95         9
           3       0.75      0.60      0.67        10

    accuracy                           0.82        40
   macro avg       0.83      0.83      0.82        40
weighted avg       0.82      0.82      0.82        40



### Test size: 0.3

#### {'hidden_layer_sizes': 256}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,9.918322,0.007482,0.857143
1,10.641932,0.007289,0.857143
2,9.663203,0.007389,0.75
3,9.906511,0.007456,0.892857
4,10.249137,0.007164,0.964286


#### Testing Performance:

truth\pred,0,1,2,3
0,16,1,1,4
1,0,16,0,0
2,0,0,9,0
3,0,1,2,10


              precision    recall  f1-score   support

           0       1.00      0.73      0.84        22
           1       0.89      1.00      0.94        16
           2       0.75      1.00      0.86         9
           3       0.71      0.77      0.74        13

    accuracy                           0.85        60
   macro avg       0.84      0.87      0.85        60
weighted avg       0.87      0.85      0.85        60



#### {'hidden_layer_sizes': 512}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,12.874886,0.012899,0.857143
1,15.657757,0.012748,0.857143
2,16.025382,0.012609,0.75
3,16.812493,0.012414,0.892857
4,16.300771,0.012671,0.964286


#### Testing Performance:

truth\pred,0,1,2,3
0,16,1,1,4
1,0,16,0,0
2,0,0,9,0
3,0,1,1,11


              precision    recall  f1-score   support

           0       1.00      0.73      0.84        22
           1       0.89      1.00      0.94        16
           2       0.82      1.00      0.90         9
           3       0.73      0.85      0.79        13

    accuracy                           0.87        60
   macro avg       0.86      0.89      0.87        60
weighted avg       0.89      0.87      0.86        60



#### {'hidden_layer_sizes': 1024}

#### Training Performance:

Unnamed: 0,fit_time,score_time,test_score
0,20.549937,0.025915,0.857143
1,22.10515,0.026341,0.857143
2,22.933676,0.026634,0.75
3,22.570739,0.026642,0.892857
4,22.303413,0.049539,0.964286


#### Testing Performance:

truth\pred,0,1,2,3
0,16,1,1,4
1,0,16,0,0
2,0,0,9,0
3,0,1,1,11


              precision    recall  f1-score   support

           0       1.00      0.73      0.84        22
           1       0.89      1.00      0.94        16
           2       0.82      1.00      0.90         9
           3       0.73      0.85      0.79        13

    accuracy                           0.87        60
   macro avg       0.86      0.89      0.87        60
weighted avg       0.89      0.87      0.86        60

