In [47]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, classification_report
from sklearn.svm import SVC 
from imblearn.over_sampling import RandomOverSampler

In [48]:
#Define File Path
vec20avg_path = "./vec_data/vec20_avg.npz"
vec25avg_path = "./vec_data/vec25_avg.npz"
vec30avg_path = "./vec_data/vec30_avg.npz"
vec35avg_path = "./vec_data/vec35_avg.npz"
vec20sum_path = "./vec_data/vec20_sum.npz"
vec20sum_path = "./vec_data/vec20_sum.npz"
vec25sum_path = "./vec_data/vec25_sum.npz"
vec30sum_path = "./vec_data/vec30_sum.npz"
vec35sum_path = "./vec_data/vec35_sum.npz"
# freq_stance_labels = "./vec_data/freq_stance_labels.npz"
# oh_stance_labels = "./vec_data/oh_stance_labels.npz"
le_stance_labels = "./vec_data/le_stance_labels.npz"

In [49]:
def load_npz_file(filepath):
    # Load the numpy array from the .npz file
    with np.load(filepath, allow_pickle=True) as data:
        for key in data.keys():
            arr = data[key]
            break
    return arr

In [50]:
vec20avg = load_npz_file(vec20avg_path)
vec25avg = load_npz_file(vec25avg_path)
vec30avg = load_npz_file(vec30avg_path)
vec35avg = load_npz_file(vec35avg_path)
vec20sum = load_npz_file(vec20sum_path)
vec20sum = load_npz_file(vec20sum_path)
vec25sum = load_npz_file(vec25sum_path)
vec30sum = load_npz_file(vec30sum_path)
vec35sum = load_npz_file(vec35sum_path)
# freq_label = load_npz_file(freq_stance_labels)
# oh_label = load_npz_file(oh_stance_labels)
le_label = load_npz_file(le_stance_labels)

In [51]:
#指派實際要使用的Data與Label
# data = vec30sum
# data = vec30avg
# data = vec20sum
data = vec20avg
# data = vec25sum
# data = vec25avg
# data = vec35sum
# data = vec35avg
# label = np.argmax(oh_label, axis=1)
# label = oh_label
# label = freq_label
label = le_label

In [52]:
print(data.shape)
print(label.shape)
print(np.unique(label))
print(np.unique(label, return_counts=True))

(445, 1, 300)
(445,)
[0 1 2 3]
(array([0, 1, 2, 3]), array([ 11, 201,  74, 159], dtype=int64))


In [53]:
# Data Augmentation using Oversampling
# Define the oversampling ratio for each class
# ratio0 = {0: 11, 1: 201, 2: 74, 3: 159}
# ratio1 = {0: 24, 1: 201, 2: 78, 3: 168}
# ratio2 = {0: 53, 1: 201, 2: 89, 3: 191}
# ratio3 = {0: 86, 1: 201, 2: 115, 3: 172}
# ratio4 = {0: 144, 1: 201, 2: 179, 3: 194}
oversample_ratio = {0: 150, 1: 201, 2: 175, 3: 183}
# Initialize the oversampler
oversampler = RandomOverSampler(sampling_strategy=oversample_ratio)
# Reshape your data to a 2D matrix of shape (n_samples, n_features)
X = data.reshape(-1, 300)
# Apply oversampling to X and y
X_resampled, y_resampled = oversampler.fit_resample(X, label)
# Reshape X back to its original shape
# X_resampled = X_resampled.reshape(-1, 1, 300)

In [54]:
# ratio = list(oversample_ratio.values())
# ratio_scale = [round(ratio[0]/sum(ratio),2), round(ratio[1]/sum(ratio),2), round(ratio[2]/sum(ratio),2), round(ratio[3]/sum(ratio),2)]
# print(ratio_scale)

In [55]:
# print(X_resampled.shape)
# print(X_resampled)

In [56]:
# print(y_resampled.shape)
# print(y_resampled)

In [57]:
# Define the number of folds for k-fold cross-validation
num_folds = 5
# Initialize the k-fold cross-validator
skf = StratifiedKFold(n_splits=num_folds, shuffle=True)

# SVC parameters
kernels = list(['linear', 'rbf', 'poly', 'sigmoid'])
c_optins = list([1, 5, 20, 25, 30, 35, 40, 45, 50])
gammas = list([0.01, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])

# Metrics reocrds
model_ac_list = []
model_pc_list = []
model_rc_list = []
model_f1_list = []

In [58]:
for fold, (train_idx, val_idx) in enumerate(skf.split(X_resampled, y_resampled)):
    print(f'Fold {fold + 1}')
    X_train_fold = X_resampled[train_idx]
    y_train_fold = y_resampled[train_idx]
    X_val_fold = X_resampled[val_idx]
    y_val_fold = y_resampled[val_idx]

    grid_clf = SVC()
    param_grid = dict(kernel=kernels, C=c_optins, gamma=gammas)
    grid = GridSearchCV(grid_clf, param_grid, cv=10, n_jobs=-1)
    grid.fit(X_train_fold, y_train_fold)
    best = grid.best_params_
    print(best)

    clf = SVC(C=best['C'], gamma=best['gamma'], kernel=best['kernel']) 
    clf.fit(X_train_fold, y_train_fold)
    y_pred = clf.predict(X_val_fold)

    ac = accuracy_score(y_val_fold, y_pred)
    pc = precision_score(y_pred, y_val_fold, average='macro')
    rc = recall_score(y_pred, y_val_fold, average='macro')
    f1 = f1_score(y_pred, y_val_fold, average='macro')
    model_ac_list.append(ac)
    model_pc_list.append(pc)
    model_rc_list.append(rc)
    model_f1_list.append(f1)

Fold 1
{'C': 5, 'gamma': 0.8, 'kernel': 'rbf'}
Fold 2
{'C': 5, 'gamma': 0.8, 'kernel': 'rbf'}
Fold 3
{'C': 1, 'gamma': 0.7, 'kernel': 'rbf'}
Fold 4
{'C': 5, 'gamma': 0.8, 'kernel': 'rbf'}
Fold 5
{'C': 20, 'gamma': 0.4, 'kernel': 'rbf'}


In [59]:
# print(model_f1_list)
# print(model_ac_list)
# print(model_pc_list)
# print(model_rc_list)

In [60]:
Avg_ac = sum(model_ac_list)/len(model_ac_list)
Avg_pc = sum(model_pc_list)/len(model_pc_list)
Avg_rc = sum(model_rc_list)/len(model_rc_list)
Avg_f1 = sum(model_f1_list)/len(model_f1_list)

In [61]:
print('Avg Model Accuracy: {:.2%}'.format(Avg_ac))
print('Avg Model Precision: {:.2%}'.format(Avg_pc))
print('Avg Model Recall: {:.2%}'.format(Avg_rc))
print('Avg Model F1-score: {:.2%}'.format(Avg_f1))

Avg Model Accuracy: 93.09%
Avg Model Precision: 93.54%
Avg Model Recall: 93.76%
Avg Model F1-score: 93.52%
