In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

In [2]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set(np.where(y_true[i])[0])
        set_pred = set(np.where(y_pred[i])[0])
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred)) / float(len(set_true.union(set_pred)))
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [3]:
def prepare_data(filename):
    data = pd.read_csv(filename, sep="\t")
    data = data[['text', 'subj']]
    data['subj'] = data['subj'].apply(lambda subj: subj.split('\\'))
    mlb = MultiLabelBinarizer()
    encoded_subjects = pd.DataFrame(mlb.fit_transform(data.pop('subj')), columns=mlb.classes_, index=data.index)
    data = data.join(encoded_subjects)
    return data, mlb.classes_

In [4]:
train, categories = prepare_data('learn.txt')
test, _ = prepare_data('test.txt')
print('Categoreis: {}'.format(categories))
train.head()

Categoreis: ['00' 'e1' 'e2' 'e3' 'e4' 'e5' 'e7' 'e8' 'e9' 'f1' 'f2' 'f3' 'f4' 'f5'
 'f7' 'f8' 'f9' 'z7']


Unnamed: 0,text,00,e1,e2,e3,e4,e5,e7,e8,e9,f1,f2,f3,f4,f5,f7,f8,f9,z7
0,investigating chemistry oceans using flow inje...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
1,new generation pna analogues effective cell pe...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,solvent effects new spin probes talk deals dev...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,novel aryne induced reactions applications fun...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,direct arylation polymerization sustainable sy...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [5]:
subjects = pd.read_csv('subjects.txt', sep="\t", header=None, names=['code', 'desc_rus', 'description'])[['code', 'description']]
subjects.head()

Unnamed: 0,code,description
0,e1,COMPUTERS; ELECTRONICS
1,e2,ASTRONOMY
2,e3,BIOLOGY; MEDICAL SCIENCES
3,e4,GEOGRAPHY; GEOPHYSICS
4,e5,GEOLOGY; EARTH SCIENCES; MINES AND MINING INDU...


In [6]:
#train_sample = train.sample(100000)
#test_sample = test.sample(5000)

In [7]:
X_train = train.text
X_test = test.text
Y_train = train[categories]
Y_test = test[categories]

In [8]:
pipeline = Pipeline([
    ('vect', CountVectorizer(max_df=0.2, ngram_range=(1, 2), max_features=None)),
    ('tfidf', TfidfTransformer(norm='l2', use_idf=False)),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=-1)),
], memory='cache')

In [9]:
pipeline.fit(X_train, Y_train)

Pipeline(memory='cache',
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.2, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip...ti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=-1))])

In [16]:
pipeline.score(X_test, Y_test)

0.6906091533474549

In [17]:
predictions = pipeline.predict(X_test)
successful = 0
total = len(predictions)

for i in range(total):
    actual = np.array(Y_test.iloc[i])
    
    if (i % 10000 == 0):
        print(i)
    
    if (abs(np.sum(predictions[i] - actual)) <= 1):
        successful += 1

0
10000
20000
30000
40000
50000
60000
70000
80000
90000


In [18]:
successful / total

0.985675161007339

In [19]:
hamming_score(np.array(Y_test), predictions)

0.7663699709723203

In [20]:
from sklearn.metrics import classification_report
print(classification_report(np.array(Y_test), predictions, target_names=categories))

             precision    recall  f1-score   support

         00       0.00      0.00      0.00       694
         e1       0.83      0.67      0.74      8904
         e2       0.85      0.76      0.80      1742
         e3       0.93      0.87      0.90     13456
         e4       0.74      0.66      0.70      2658
         e5       0.83      0.75      0.79      3198
         e7       0.73      0.48      0.58       119
         e8       0.66      0.71      0.68      2749
         e9       0.67      0.24      0.36      1883
         f1       0.75      0.57      0.65      3272
         f2       0.68      0.43      0.52      1643
         f3       0.70      0.67      0.68      5879
         f4       0.90      0.74      0.81      1154
         f5       0.79      0.82      0.81     19744
         f7       0.91      0.88      0.89     36871
         f8       0.84      0.09      0.16       302
         f9       0.79      0.66      0.72      5184
         z7       1.00      0.40      0.57   

In [21]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(np.array(Y_test), predictions, average='weighted')

(0.8341074107181181, 0.7772458591044885, 0.8007902074354197, None)