In [None]:
import pickle
import TextEmbedding 
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer



In [None]:
def _load_pickle(filename):
    """Load one pickled object from the ``FinalData`` directory.

    The path is built with a forward slash, which ``open`` accepts on both
    Windows and POSIX. The previous literals (``'FinalData\\X_train.pkl'``)
    used a backslash separator and relied on invalid escape sequences
    (``\\X``, ``\\y``) being passed through unchanged.
    """
    # The context manager closes the file handle; the bare open(...) calls
    # used before never closed theirs.
    with open('FinalData/' + filename, 'rb') as handle:
        # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
        return pickle.load(handle)


X_train = _load_pickle('X_train.pkl')
X_test = _load_pickle('X_test.pkl')
y_train = _load_pickle('y_train.pkl')
y_test = _load_pickle('y_test.pkl')

X_train_w2v = _load_pickle('X_train_w2v.pkl')
X_test_w2v = _load_pickle('X_test_w2v.pkl')
y_train_w2v = _load_pickle('y_train_w2v.pkl')
y_test_w2v = _load_pickle('y_test_w2v.pkl')

# Build the three text representations from the same train/test split.
tfidf_embedding_train, tfidf_embedding_test = TextEmbedding.tfidf(X_train, X_test)
BOW_embedding_train, BOW_embedding_test = TextEmbedding.BagOfWord(X_train, X_test)
# NOTE(review): X_train_w2v / X_test_w2v are loaded above but Word2Vector is fed
# X_train / X_test here -- confirm which inputs the Word2Vec embedding should use.
w2v_embedding_train, w2v_embedding_test = TextEmbedding.Word2Vector(X_train, X_test)


In [None]:
# Topic slug -> integer class id used as the classification target.
list_topic = {
    "thoi-su": 0,
    "kinh-doanh": 1,
    "khoa-hoc": 2,
    "giai-tri": 3,
    "the-thao": 4,
    "phap-luat": 5,
    "giao-duc": 6,
    "suc-khoe": 7,
    "doi-song": 8,
    "du-lich": 9,
}


def _encode_labels(labels, mapping):
    """Replace, in place, every label that is a key of ``mapping`` with its id.

    Labels that are not keys of ``mapping`` (including labels that were
    already converted by an earlier run of this cell) are left untouched,
    so re-running the cell is harmless.
    """
    for idx in range(len(labels)):
        label = labels[idx]
        # Direct dictionary lookup instead of scanning every (slug, id) pair.
        if label in mapping:
            labels[idx] = mapping[label]


_encode_labels(y_train, list_topic)
_encode_labels(y_test, list_topic)

In [None]:
def svm_model(X_train, y_train, X_test, y_test, kernel, C):
    """Fit an SVC on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``SVC.fit``.
    X_test, y_test : held-out features and labels used for scoring.
    kernel : kernel name forwarded to ``SVC``.
    C : regularisation strength forwarded to ``SVC``.

    Returns
    -------
    tuple
        ``(accuracy, recall, f1)`` on the test split; recall and F1 are
        macro-averaged.
    """
    fitted = SVC(C=C, degree=3, kernel=kernel).fit(X_train, y_train)
    predicted = fitted.predict(X_test)

    return (
        accuracy_score(y_test, predicted),
        recall_score(y_test, predicted, average='macro'),
        f1_score(y_test, predicted, average='macro'),
    )

GET BEST SET OF PARAMETERS (USING BOW)

In [None]:
# Compare the four SVC kernels on the Bag-of-Words embedding with C fixed at 1.
values = ['rbf', 'linear', 'poly', 'sigmoid']
accuracy_values = []
recall_values = []
f1_values = []

for value in values:
    # svm_model() trains the classifier and returns (accuracy, recall, f1).
    # The previous code called the SVC class itself with these positional
    # arguments, which builds (or fails to build) an estimator and can never
    # be unpacked into the three metrics.
    accuracy, recall, f1 = svm_model(BOW_embedding_train, y_train, BOW_embedding_test, y_test, value, 1)
    accuracy_values.append(accuracy)
    recall_values.append(recall)
    f1_values.append(f1)

print(accuracy_values)
print(recall_values)
print(f1_values)

# Grouped bar chart: one group per kernel, one bar per metric.
bar_width = 0.2
bar_positions = np.arange(len(values))

plt.bar(bar_positions, accuracy_values, width=bar_width, color='red', label='Accuracy')
plt.bar(bar_positions + bar_width, recall_values, width=bar_width, color='blue', label='Recall')
plt.bar(bar_positions + 2*bar_width, f1_values, width=bar_width, color='green', label='F1 Score')

plt.xlabel('kernel')
# The axis carries accuracy, recall and F1, so it is labelled generically.
plt.ylabel('Score')
plt.title('SVM performance with C = 1')
plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.22), ncol=3)

# Centre each tick under the middle bar of its group.
plt.xticks(bar_positions + bar_width, values)
plt.yticks(np.arange(0, 1.1, 0.1))

plt.show()



In [None]:
def plot_grid_search(param, list_params, result,
                     title='SVM Performance Accuracy with kernel = sigmoid'):
    """Draw a bar chart of one score per grid-search candidate.

    Parameters
    ----------
    param : str
        Name of the hyper-parameter to show on the x axis; it is used both to
        pick the tick values out of ``list_params`` and as the axis label.
    list_params : list of dict
        One parameter dict per candidate (e.g. ``cv_results_['params']``).
    result : sequence of float
        One score per candidate, in the same order as ``list_params``.
    title : str, optional
        Figure title. Defaults to the title this helper always used before.

    Raises
    ------
    KeyError
        If a dict in ``list_params`` has no ``param`` entry.
    """
    values = [params[param] for params in list_params]
    bar_positions = np.arange(len(values))

    plt.bar(bar_positions, result, width=0.2, color='red', label='Accuracy')

    # Label the axis with the parameter actually plotted instead of a fixed 'C'.
    plt.xlabel(param)
    plt.ylabel('Accuracy')
    plt.title(title)
    plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.22), ncol=3)

    plt.xticks(bar_positions, values)
    plt.yticks(np.arange(0, 1.1, 0.1))

    plt.show()

In [None]:
def GridSearchCV_SVM(X=None, y=None):
    """Grid-search ``C`` for a sigmoid-kernel SVC with 5-fold CV and plot the scores.

    Parameters
    ----------
    X : optional
        Training features. Defaults to the notebook-level ``BOW_embedding_train``.
    y : optional
        Training labels. Defaults to the notebook-level ``y_train``.

    Prints the mean cross-validated test score and the parameter dict of every
    candidate, then hands both to ``plot_grid_search``. Returns ``None``.
    """
    if X is None:
        X = BOW_embedding_train
    if y is None:
        y = y_train

    param_grid = {'C': [0.01, 0.1, 1, 2, 5, 10, 100]}

    # Named `estimator` so the local does not shadow the notebook's
    # svm_model() helper function.
    estimator = SVC(kernel='sigmoid')
    # No `scoring` is given, so the estimator's own score method is used.
    # To optimise another metric pass e.g.
    # scoring=make_scorer(f1_score, average='micro').
    grid_search = GridSearchCV(estimator, param_grid, cv=5)
    grid_search.fit(X, y)

    mean_scores = grid_search.cv_results_['mean_test_score']
    tried_params = grid_search.cv_results_['params']
    print(mean_scores)
    print(tried_params)
    plot_grid_search('C', tried_params, mean_scores)


GridSearchCV_SVM()
    

GET BEST SET OF PARAMETERS (USING TFIDF)

In [None]:
# Compare the four SVC kernels on the TF-IDF embedding with C fixed at 1.
values = ['rbf', 'linear', 'poly', 'sigmoid']
accuracy_values = []
recall_values = []
f1_values = []

for value in values:
    # svm_model() trains the classifier and returns (accuracy, recall, f1).
    # The previous code called the SVC class itself with these positional
    # arguments, which builds (or fails to build) an estimator and can never
    # be unpacked into the three metrics.
    accuracy, recall, f1 = svm_model(tfidf_embedding_train, y_train, tfidf_embedding_test, y_test, value, 1)
    accuracy_values.append(accuracy)
    recall_values.append(recall)
    f1_values.append(f1)

print(accuracy_values)
print(recall_values)
print(f1_values)

# Grouped bar chart: one group per kernel, one bar per metric.
bar_width = 0.2
bar_positions = np.arange(len(values))

plt.bar(bar_positions, accuracy_values, width=bar_width, color='red', label='Accuracy')
plt.bar(bar_positions + bar_width, recall_values, width=bar_width, color='blue', label='Recall')
plt.bar(bar_positions + 2*bar_width, f1_values, width=bar_width, color='green', label='F1 Score')

plt.xlabel('kernel')
# The axis carries accuracy, recall and F1, so it is labelled generically.
plt.ylabel('Score')
plt.title('SVM performance with C = 1')
plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.22), ncol=3)

# Centre each tick under the middle bar of its group.
plt.xticks(bar_positions + bar_width, values)
plt.yticks(np.arange(0, 1.1, 0.1))

plt.show()



In [None]:
def plot_grid_search(param, list_params, result,
                     title='SVM Performance Accuracy with kernel = sigmoid'):
    """Draw a bar chart of one score per grid-search candidate.

    Parameters
    ----------
    param : str
        Name of the hyper-parameter to show on the x axis; it is used both to
        pick the tick values out of ``list_params`` and as the axis label.
    list_params : list of dict
        One parameter dict per candidate (e.g. ``cv_results_['params']``).
    result : sequence of float
        One score per candidate, in the same order as ``list_params``.
    title : str, optional
        Figure title. Defaults to the title this helper always used before.

    Raises
    ------
    KeyError
        If a dict in ``list_params`` has no ``param`` entry.
    """
    values = [params[param] for params in list_params]
    bar_positions = np.arange(len(values))

    plt.bar(bar_positions, result, width=0.2, color='red', label='Accuracy')

    # Label the axis with the parameter actually plotted instead of a fixed 'C'.
    plt.xlabel(param)
    plt.ylabel('Accuracy')
    plt.title(title)
    plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.22), ncol=3)

    plt.xticks(bar_positions, values)
    plt.yticks(np.arange(0, 1.1, 0.1))

    plt.show()

In [None]:
def GridSearchCV_SVM(X=None, y=None):
    """Grid-search ``C`` for a sigmoid-kernel SVC with 5-fold CV and plot the scores.

    Parameters
    ----------
    X : optional
        Training features. Defaults to the notebook-level ``tfidf_embedding_train``.
    y : optional
        Training labels. Defaults to the notebook-level ``y_train``.

    Prints the mean cross-validated test score and the parameter dict of every
    candidate, then hands both to ``plot_grid_search``. Returns ``None``.
    """
    if X is None:
        X = tfidf_embedding_train
    if y is None:
        y = y_train

    param_grid = {'C': [0.01, 0.1, 1, 2, 5, 10, 100]}

    # Named `estimator` so the local does not shadow the notebook's
    # svm_model() helper function.
    estimator = SVC(kernel='sigmoid')
    # No `scoring` is given, so the estimator's own score method is used.
    # To optimise another metric pass e.g.
    # scoring=make_scorer(f1_score, average='micro').
    grid_search = GridSearchCV(estimator, param_grid, cv=5)
    grid_search.fit(X, y)

    mean_scores = grid_search.cv_results_['mean_test_score']
    tried_params = grid_search.cv_results_['params']
    print(mean_scores)
    print(tried_params)
    plot_grid_search('C', tried_params, mean_scores)


GridSearchCV_SVM()
    

GET BEST SET OF PARAMETERS (USING WORD2VEC)

In [None]:
# Compare the four SVC kernels on the Word2Vec embedding with C fixed at 1.
values = ['rbf', 'linear', 'poly', 'sigmoid']
accuracy_values = []
recall_values = []
f1_values = []

for value in values:
    # svm_model() trains the classifier and returns (accuracy, recall, f1).
    # The previous code called the SVC class itself with these positional
    # arguments, which builds (or fails to build) an estimator and can never
    # be unpacked into the three metrics.
    accuracy, recall, f1 = svm_model(w2v_embedding_train, y_train, w2v_embedding_test, y_test, value, 1)
    accuracy_values.append(accuracy)
    recall_values.append(recall)
    f1_values.append(f1)

print(accuracy_values)
print(recall_values)
print(f1_values)

# Grouped bar chart: one group per kernel, one bar per metric.
bar_width = 0.2
bar_positions = np.arange(len(values))

plt.bar(bar_positions, accuracy_values, width=bar_width, color='red', label='Accuracy')
plt.bar(bar_positions + bar_width, recall_values, width=bar_width, color='blue', label='Recall')
plt.bar(bar_positions + 2*bar_width, f1_values, width=bar_width, color='green', label='F1 Score')

plt.xlabel('kernel')
# The axis carries accuracy, recall and F1, so it is labelled generically.
plt.ylabel('Score')
plt.title('SVM performance with C = 1')
plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.22), ncol=3)

# Centre each tick under the middle bar of its group.
plt.xticks(bar_positions + bar_width, values)
plt.yticks(np.arange(0, 1.1, 0.1))

plt.show()



In [None]:
def plot_grid_search(param, list_params, result,
                     title='SVM Performance Accuracy with kernel = sigmoid'):
    """Draw a bar chart of one score per grid-search candidate.

    Parameters
    ----------
    param : str
        Name of the hyper-parameter to show on the x axis; it is used both to
        pick the tick values out of ``list_params`` and as the axis label.
    list_params : list of dict
        One parameter dict per candidate (e.g. ``cv_results_['params']``).
    result : sequence of float
        One score per candidate, in the same order as ``list_params``.
    title : str, optional
        Figure title. Defaults to the title this helper always used before.

    Raises
    ------
    KeyError
        If a dict in ``list_params`` has no ``param`` entry.
    """
    values = [params[param] for params in list_params]
    bar_positions = np.arange(len(values))

    plt.bar(bar_positions, result, width=0.2, color='red', label='Accuracy')

    # Label the axis with the parameter actually plotted instead of a fixed 'C'.
    plt.xlabel(param)
    plt.ylabel('Accuracy')
    plt.title(title)
    plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.22), ncol=3)

    plt.xticks(bar_positions, values)
    plt.yticks(np.arange(0, 1.1, 0.1))

    plt.show()

In [None]:
def GridSearchCV_SVM(X=None, y=None):
    """Grid-search ``C`` for a sigmoid-kernel SVC with 5-fold CV and plot the scores.

    Parameters
    ----------
    X : optional
        Training features. Defaults to the notebook-level ``w2v_embedding_train``.
    y : optional
        Training labels. Defaults to the notebook-level ``y_train``.

    Prints the mean cross-validated test score and the parameter dict of every
    candidate, then hands both to ``plot_grid_search``. Returns ``None``.
    """
    if X is None:
        # This cell belongs to the WORD2VEC section, so search over the
        # Word2Vec embedding; it previously re-fit on the TF-IDF matrix.
        X = w2v_embedding_train
    if y is None:
        y = y_train

    param_grid = {'C': [0.01, 0.1, 1, 2, 5, 10, 100]}

    # Named `estimator` so the local does not shadow the notebook's
    # svm_model() helper function.
    estimator = SVC(kernel='sigmoid')
    # No `scoring` is given, so the estimator's own score method is used.
    # To optimise another metric pass e.g.
    # scoring=make_scorer(f1_score, average='micro').
    grid_search = GridSearchCV(estimator, param_grid, cv=5)
    grid_search.fit(X, y)

    mean_scores = grid_search.cv_results_['mean_test_score']
    tried_params = grid_search.cv_results_['params']
    print(mean_scores)
    print(tried_params)
    plot_grid_search('C', tried_params, mean_scores)


GridSearchCV_SVM()
    

COMPARE BOW, TFIDF, AND WORD2VEC WITH THE BEST SET OF PARAMETERS