In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion

In [3]:
def evaluate_svm(model, X_test, Y_test):
    """Score a fitted model against held-out labels.

    Calls ``model.predict`` on ``X_test`` and compares the predictions with
    ``Y_test`` using scikit-learn's ``classification_report``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features in the form ``model.predict`` accepts.
        Y_test: True labels for ``X_test``.

    Returns:
        The classification report as text, printed with four decimal places.
    """
    # zero_division=True is passed through as-is; scikit-learn treats it as 1,
    # so a metric with a zero denominator is reported as 1.0 without a warning.
    return classification_report(
        Y_test,
        model.predict(X_test),
        digits=4,
        zero_division=True,
    )

In [4]:
# Load the CSV into a pandas DataFrame; later cells read its 'token' and
# 'semtag' columns.
data = pd.read_csv('sem-pmb_4_0_0-gold.csv')

In [5]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    data['token'],
    data['semtag'],
    test_size=0.20,
    random_state=1234,
)

In [6]:
# Train the model. The result is indexed by the evaluation cell as
# SVM[0] (passed as the model) and SVM[1] (passed as the test features).
# NOTE(review): `train_svm` is not defined in the cells visible here —
# confirm the cell defining it is present and runs before this one.
SVM = train_svm(
    X_train,
    Y_train,
    X_test,
)

In [7]:
# Evaluate on the held-out labels and print the classification report.
print(
    evaluate_svm(
        SVM[0],  # used as the model
        SVM[1],  # used as the test features
        Y_test,
    )
)

              precision    recall  f1-score   support

         ALT     0.8750    0.9825    0.9256        57
         AND     0.9706    0.5690    0.7174       116
         APX     0.7273    0.4444    0.5517        18
         ART     0.5000    0.1765    0.2609        17
         BOT     1.0000    1.0000    1.0000         2
         BUT     0.9444    0.8500    0.8947        20
         CLO     0.8400    0.6364    0.7241        33
         COL     0.8788    0.9062    0.8923        32
         CON     0.9414    0.8244    0.8790      1794
         COO     1.0000    0.0000    0.0000        28
         CTC     1.0000    0.7500    0.8571         4
         DEF     0.9943    0.4302    0.6005      1611
         DEG     0.7119    0.7778    0.7434        54
         DIS     0.9806    0.1036    0.1874       975
         DOM     1.0000    0.1111    0.2000         9
         DOW     1.0000    1.0000    1.0000         2
         DST     0.5882    1.0000    0.7407        40
         EFS     0.4000    

# Grid search for finding the best parameters

In [20]:
def gridsearch(Xtrain, Ytrain, Xdev, Ydev):
    """Grid-search the SVM ``C`` parameter for several kernels.

    Word-level TF-IDF features are learned from ``Xtrain`` and applied to
    both splits. For each kernel, a ``GridSearchCV`` scored by macro F1 is
    run over ``C`` on the training features; the refitted best model then
    predicts the dev split. The run counter, the best parameters and a
    classification report are printed for each kernel.

    Args:
        Xtrain: Raw training texts, as accepted by ``TfidfVectorizer``.
        Ytrain: Training labels aligned with ``Xtrain``.
        Xdev: Raw development texts.
        Ydev: Development labels aligned with ``Xdev``.

    Returns:
        None. All results are printed.
    """
    parameters = {'C': [1, 10]}
    kernel = ['linear', 'rbf', 'poly', 'sigmoid']

    # Word features. The vectoriser does not depend on the kernel, so it is
    # fitted once, on the training texts only, and reused for every run.
    # (The original passed Xdev as the ignored `y` argument of `fit` and
    # repeated the vectorisation inside the loop.)
    vec = TfidfVectorizer(analyzer='word')
    x_train = vec.fit_transform(Xtrain)
    x_dev = vec.transform(Xdev)

    print("Running grid search....")
    for run, k in enumerate(kernel, start=1):
        # GridSearchCV clones and fits the estimator itself, so the SVC is
        # passed unfitted; fitting it beforehand only wasted training time.
        grid_svm = GridSearchCV(svm.SVC(kernel=k),
                                param_grid=parameters,
                                scoring='f1_macro',
                                n_jobs=-1)

        grid_svm.fit(x_train, Ytrain)
        # With the default refit=True, predict uses the best estimator
        # retrained on the full training features.
        pred = grid_svm.predict(x_dev)
        # "/" replaces the invalid "\{" escape sequence of the original.
        print("Run {}/{}".format(run, len(kernel)))
        print("Best model ran with kernel: {} and parameter: {}\n".format(k, grid_svm.best_params_))
        print(classification_report(Ydev, pred, digits=4), '\n')

In [None]:
# NOTE(review): the held-out test split is passed as the dev set here.
gridsearch(
    Xtrain=X_train,
    Ytrain=Y_train,
    Xdev=X_test,
    Ydev=Y_test,
)