In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

In [2]:
def prepare_data(filename, classes=None):
    """Load a tab-separated dataset and binarize its subject labels.

    The file must contain a ``text`` column and a ``subj`` column whose
    values are label codes separated by a backslash.

    Parameters
    ----------
    filename : str or path-like
        Path of the tab-separated file to read.
    classes : sequence of str, optional
        Fixed label vocabulary, passed to ``MultiLabelBinarizer(classes=...)``.
        When given, the indicator columns follow exactly this order, which
        lets several files share one column layout. When omitted (default),
        the vocabulary is derived from the labels found in this file.

    Returns
    -------
    data : pandas.DataFrame
        The ``text`` column joined with one indicator column per label.
    labels : numpy.ndarray
        Label names in the same order as the indicator columns.
    """
    # .copy() gives an independent frame, so later column operations do not
    # act on a slice of the frame returned by read_csv (SettingWithCopyWarning).
    data = pd.read_csv(filename, sep="\t")[['text', 'subj']].copy()

    # Each row's label string becomes a list of label codes.
    label_lists = data.pop('subj').apply(lambda subj: subj.split('\\'))

    mlb = MultiLabelBinarizer(classes=classes)
    encoded_subjects = pd.DataFrame(
        mlb.fit_transform(label_lists),
        columns=mlb.classes_,
        index=data.index,
    )
    return data.join(encoded_subjects), mlb.classes_

In [3]:
# Load both splits. `categories` holds the label names found in learn.txt.
# NOTE(review): prepare_data builds a new MultiLabelBinarizer on every call,
# so the indicator columns of `test` follow the labels present in test.txt,
# which are not necessarily the same as those of `train`.
train, categories = prepare_data('learn.txt')
test, _ = prepare_data('test.txt')
print('Categoreis: {}'.format(categories))


Categoreis: ['00' 'e1' 'e2' 'e3' 'e4' 'e5' 'e7' 'e8' 'e9' 'f1' 'f2' 'f3' 'f4' 'f5'
 'f7' 'f8' 'f9' 'z7']


In [4]:
# Remove the indicator columns of the labels '00' and 'z7' from the training frame.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts; errors='ignore' keeps the cell re-runnable once `train` has already
# been overwritten without these columns.
train = train.drop(columns=['00', 'z7'], errors='ignore')

In [5]:
# Keep every label except '00' and 'z7'. Filtering by name (rather than
# slicing off the first and last entries) does not depend on where those labels
# sort, and re-running the cell cannot strip additional categories.
EXCLUDED_LABELS = ('00', 'z7')
categories = np.array(
    [label for label in categories if label not in EXCLUDED_LABELS],
    dtype=object,
)

In [6]:
# Features are the raw text; targets are the per-category indicator columns.
X_train = train['text']
X_test = test['text']
Y_train = train[categories]
# The test frame may lack a column for a category that never occurs in its
# file; reindex creates such columns filled with 0 instead of raising KeyError,
# and also puts the columns in the same order as Y_train.
Y_test = test.reindex(columns=categories, fill_value=0)

In [7]:
# Fixed seed so that LinearSVC's internal shuffling, and therefore the fitted
# model, is reproducible between runs.
SEED = 42

# Text classification pipeline:
#   vect  - unigram + bigram counts, ignoring terms that occur in more than
#           20% of the documents (max_df=0.2).
#   tfidf - L2-normalised term frequencies; IDF weighting is switched off.
#   clf   - one calibrated linear SVM per category (one-vs-rest), trained in
#           up to 6 parallel jobs.
# memory='cache' stores the fitted transformers in the ./cache directory.
pipeline = Pipeline([
    ('vect', CountVectorizer(max_df=0.2, ngram_range=(1, 2), max_features=None)),
    ('tfidf', TfidfTransformer(norm='l2', use_idf=False)),
    ('clf', OneVsRestClassifier(
        CalibratedClassifierCV(LinearSVC(random_state=SEED)),
        n_jobs=6,
    )),
], memory='cache')

In [None]:
# Fit every pipeline step on the training text and its label indicator matrix.
pipeline.fit(X_train, Y_train)


In [None]:
# Persist the fitted pipeline to $RESULT_DIR/mynewclass.pkl.
# The `with` block guarantees the file is flushed and closed even if
# pickling fails part-way through.
model_path = os.path.join(os.environ["RESULT_DIR"], "mynewclass.pkl")
with open(model_path, 'wb') as model_file:
    pickle.dump(pipeline, model_file)