# Predicate-based classifiers

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import svm

In [2]:
# Load the OHSUMED 1-gram table (one row per document) and preview the first rows.
df1 = pd.read_csv(filepath_or_buffer='./data/ohsumed_C14_C23_1grams.csv')
df1.head(n=3)

Unnamed: 0.1,Unnamed: 0,file_name,C14,C23,Y,tokens
0,0,42408,0,0,0,evaluation 99tcm technegas ventilation scintig...
1,1,45847,0,0,0,cardiorespiratory effect pressure controlled v...
2,2,43982,0,1,0,lung function school-age child mild lower resp...


## Predicate C14: Cardiovascular Diseases

In [3]:
def get_data(predicate, df=None):
    """Build a class-balanced text dataset for one predicate column.

    All rows where ``df[predicate] == 1`` are taken as positives, followed by
    at most the same number of rows where ``df[predicate] == 0`` (the first
    ones in frame order) as negatives.

    Parameters
    ----------
    predicate : str
        Name of the 0/1 label column to select on.
    df : pandas.DataFrame, optional
        Frame holding the ``predicate`` column and a ``'tokens'`` column.
        Defaults to the notebook-level ``df1``.

    Returns
    -------
    X : numpy.ndarray
        Token strings, positives first and then negatives.
    y : numpy.ndarray
        Float labels aligned with ``X`` (1.0 for positives, 0.0 for negatives).
    """
    if df is None:
        df = df1  # fall back to the frame loaded at the top of the notebook

    # Single .loc call with a (row mask, column) pair avoids chained indexing.
    X_pos = df.loc[df[predicate] == 1, 'tokens'].values
    X_neg = df.loc[df[predicate] == 0, 'tokens'].values[:X_pos.size]

    X = np.concatenate([X_pos, X_neg])
    # Size the labels from the arrays actually selected: when there are fewer
    # negatives than positives the slice above is shorter than X_pos, and
    # labels must still line up with X.
    y = np.concatenate([np.ones(X_pos.size), np.zeros(X_neg.size)])

    return X, y

In [None]:
import matplotlib.pyplot as plt

# Label column to classify on, and the number of cross-validation folds.
predicate = 'C14'
k = 10

# Token strings and 0/1 labels for the chosen predicate.
X_train, y_train = get_data(predicate)

# Raw token counts -> TF-IDF weighting -> SVM classifier.
# (Alternative considered for the last step: LinearSVC(random_state=123).)
pipeline = Pipeline(
    steps=[
        ('vect', CountVectorizer(lowercase=False)),
        ('tfidf', TfidfTransformer()),
        ('clf', svm.SVC(random_state=123)),
    ]
)

# Hyper-parameters to search; keys follow the '<step name>__<parameter>' convention.
param_grid = dict(
    vect__max_features=[500, 1000, 2000, 6000],
    vect__ngram_range=[(1, 1)],
    clf__C=[0.01, 0.1, 1, 10],
    clf__gamma=[0.001, 0.01, 0.1, 1],
)

# Exhaustive search over the grid, scored with micro-averaged F1 on k folds.
grid = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=k,
    scoring='f1_micro',
)
grid.fit(X_train, y_train)

In [None]:
# Summarize results: best cross-validated score and the parameters that achieved it.
print(
    "Best: %f using %s"
    % (grid.best_score_, grid.best_params_)
)

In [None]:
# Report every grid candidate: mean test score, its standard deviation across
# folds, and the parameter combination that produced it.
results = grid.cv_results_
means, stds, params = (
    results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.937398 using {'clf__C': 0.1, 'vect__max_features': 6000, 'vect__ngram_range': (1, 1)}