In [29]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC

import models

In [30]:
# Load the dataset: `text` is the model input; term1..term3 hold the labels of
# each row, with missing labels stored as nulls.
data = pd.read_csv("data.csv", quotechar='"')
X = data.text
y = data[["term1", "term2", "term3"]]

# `.values` replaces DataFrame.as_matrix(), which was deprecated and later
# removed from pandas.
yvalues = y.values

# Keep only the non-null labels of every row. Rows may end up with different
# lengths, so the result stays a plain list of 1-D arrays: wrapping ragged rows
# in np.array() raises on recent NumPy, and MultiLabelBinarizer only needs an
# iterable of iterables.
final = [row[~pd.isnull(row)] for row in yvalues]

In [31]:
# Turn the per-row label collections in `final` into a binary indicator matrix
# Y (one column per distinct label). `mlb` is kept so its learned label set
# remains available after fitting.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(final)

In [32]:
# Bag-of-words vectorizer that drops scikit-learn's built-in English stop words.
cv = CountVectorizer(stop_words="english")

In [33]:
# Hold out 10% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=5,
)

In [34]:
# Learn the vocabulary from the training texts only, then encode the test
# texts with that same vocabulary so no test information reaches the fit.
X_train_dtm = cv.fit_transform(X_train)
X_test_dtm = cv.transform(X_test)

In [35]:
# Fit the project-local UniformOVA classifier on the training matrix and
# predict label sets for the held-out rows.
# NOTE(review): the meaning of c, t1 and t2 is defined in the `models` module,
# which is not visible here -- confirm there before changing them.
model = models.UniformOVA(c=1, t1=0.3, t2=0.1)
model.fit(X_train_dtm, Y_train)
Y_pred_class = model.predict(X_test_dtm)

# average='samples': F1 is computed per test row and then averaged over rows.
score = metrics.f1_score(y_true=Y_test, y_pred=Y_pred_class, average='samples')
print(score)

0.288372093023


In [36]:
from sklearn.model_selection import GridSearchCV
t1_range = np.arange(0.1, 1.1, 0.1)
t2_range = np.arange(0.1, 1.1, 0.1)
param_grid = dict(t1=t1_range, t2=t2_range)
model = models.UniformOVA()
grid = GridSearchCV(model, param_grid, cv=10, scoring="f1_samples")
%time grid.fit(X_train_dtm, Y_train)

In [28]:
# Report the best mean cross-validated score found by the grid search,
# followed by the parameter combination that achieved it.
print(grid.best_score_, grid.best_params_, sep="\n")

0.362236823513
{'t1': 0.30000000000000004, 't2': 0.10000000000000001}
