In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

In [2]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set(np.where(y_true[i])[0])
        set_pred = set(np.where(y_pred[i])[0])
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred)) / float(len(set_true.union(set_pred)))
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [3]:
def prepare_data(filename):
    data = pd.read_csv(filename, sep="\t")
    data = data[['text', 'subj','ipv']]
    data['subj'] = data['subj'].apply(lambda subj: subj.split('\\'))
    data['ipv'] = data['ipv'].apply(lambda ipv: str(ipv).split('\\'))
    
    mlb = MultiLabelBinarizer()
    encoded_subjects = pd.DataFrame(mlb.fit_transform(data.pop('subj')), columns=mlb.classes_, index=data.index)
    mlb_ipv = MultiLabelBinarizer()
    encoded_ipv = pd.DataFrame(mlb_ipv.fit_transform(data.pop('ipv')), columns=mlb_ipv.classes_, index=data.index)
    data = data.join(encoded_subjects).join(encoded_ipv)
    
    return data, mlb.classes_, mlb_ipv.classes_

In [4]:
train, categories, ipvs = prepare_data('learn.txt')
test, _, _= prepare_data('test.txt')
print('Categoreis: {}'.format(categories))
print('Categoreis: {}'.format(ipvs))
train.head()

Categoreis: ['00' 'e1' 'e2' 'e3' 'e4' 'e5' 'e7' 'e8' 'e9' 'f1' 'f2' 'f3' 'f4' 'f5'
 'f7' 'f8' 'f9' 'z7']
Categoreis: ['01А' '01Г' '02А' '02Б' '02В' '03' '04А1' '04А2' '04А3' '04А4' '04Б1'
 '04Б2' '04Б3' '04Б4' '04В1' '04В2' '04В3' '04В4' '04В5' '04В7' '04В8'
 '04В9' '04Д2' '04Д3' '04Д6' '04Д7' '04И1' '04И11' '04И4' '04И8' '04И9'
 '04К1' '04М1' '04М2' '04М3' '04М6' '04М7' '04М8' '04Н4' '04П1' '04Р1'
 '04Т4' '04Т5' '04Т6' '04Я2' '04Я3' '04Я4' '04Я7' '05А' '05Б' '06А' '06Б'
 '07Д' '07П' '07Р' '08А' '08Б' '08В' '08Е' '08К' '08М' '08Н' '08П' '09А'
 '09Б' '09Г' '09И' '10АВ' '10Г' '10Д' '11Б' '11Е' '11Ж' '12' '13А' '13Б'
 '13В' '13Д' '14А' '14Б' '14Д' '15А' '15Б' '15В' '15Г' '15Д' '15Е' '15И'
 '16А' '16Б' '16В' '16Г' '16Д' '17А' '17Г' '18А' '18Б' '18В' '18Г' '18Е'
 '18Ж' '18И' '18К' '18Л' '18П' '18Т' '18У' '18Ф' '19А' '19Б1' '19Б2'
 '19Б3' '19Б4' '19В' '19ГД' '19Е' '19Ж' '19И' '19Л' '19М' '19Н' '19О'
 '19П' '19Р1' '19С' '19Т' '19У' '19Ф' '20А' '20Б' '20Г' '20Д' '20И' '20К'
 '20М' '20П' '20Р' 

Unnamed: 0,text,00,e1,e2,e3,e4,e5,e7,e8,e9,...,74,81,83,84,85,86,90,98,nan,ЕЕ
0,investigating chemistry oceans using flow inje...,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,new generation pna analogues effective cell pe...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,solvent effects new spin probes talk deals dev...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,novel aryne induced reactions applications fun...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,direct arylation polymerization sustainable sy...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
train_sample = train.sample(100000)
test_sample = test.sample(5000)

In [17]:
grouped_by_categories = {}

for category in categories:
    grouped_by_categories[category] = train[train[category] == 1]

In [18]:
grouped_by_categories['e1']

Unnamed: 0,text,00,e1,e2,e3,e4,e5,e7,e8,e9,...,74,81,83,84,85,86,90,98,nan,ЕЕ
78,transparent conducting properties anatase ti n...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
79,electrical photovoltaic properties si chitosan...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
85,owa weighted based clustering method classific...,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
86,strategic analysis successful brain korea init...,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
87,three phase integrated model product configura...,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
89,consumer credit scoring models limited data pa...,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
90,causality hourly price volume relationship emp...,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
172,systematic method construct hirota transformat...,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
336,reliability analysis new technology based tran...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
342,high speed current conveyor based current comp...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
X_train = train.text
X_test = test.text
Y_train = train[categories]
Y_test = test[categories]

KeyError: "['01А' '01Г' '02А' '02Б' '02В' '03' '04А1' '04А2' '04А3' '04А4' '04Б1'\n '04Б2' '04Б3' '04Б4' '04В1' '04В2' '04В3' '04В4' '04В5' '04В7' '04В8'\n '04В9' '04Д2' '04Д3' '04Д6' '04Д7' '04И1' '04И11' '04И4' '04И8' '04И9'\n '04К1' '04М1' '04М2' '04М3' '04М6' '04М7' '04М8' '04Н4' '04П1' '04Р1'\n '04Т4' '04Т5' '04Т6' '04Я2' '04Я3' '04Я4' '04Я7' '05А' '05Б' '06А' '06Б'\n '07Д' '07П' '07Р' '08А' '08Б' '08В' '08Е' '08К' '08М' '08Н' '08П' '09А'\n '09Б' '09Г' '09И' '10АВ' '10Г' '10Д' '11Б' '11Е' '11Ж' '12' '13А' '13Б'\n '13В' '13Д' '14А' '14Б' '14Д' '15А' '15Б' '15В' '15Г' '15Д' '15Е' '15И'\n '16А' '16Б' '16В' '16Г' '16Д' '17А' '17Г' '18А' '18Б' '18В' '18Г' '18Е'\n '18Ж' '18И' '18К' '18Л' '18П' '18Т' '18У' '18Ф' '19А' '19Б1' '19Б2'\n '19Б3' '19Б4' '19В' '19ГД' '19Е' '19Ж' '19И' '19Л' '19М' '19Н' '19О'\n '19П' '19Р1' '19С' '19Т' '19У' '19Ф' '20А' '20Б' '20Г' '20Д' '20И' '20К'\n '20М' '20П' '20Р' '20С' '20Т' '20У' '20Ф' '21АН' '21Б' '21В' '21И' '21К'\n '21Л' '21Э' '21Ю' '22Д' '22Е' '22Ж' '22Р' '22С' '22Т' '22У' '22Ф' '22Ш'\n '23АБ' '23Г' '23Д' '24А' '24В' '24Е' '27' '29А' '29Б' '29В' '31' '32'\n '33' '34' '36' '37' '38' '39' '41' '43' '44' '45' '47' '48' '49' '51'\n '52' '59' '61' '62' '63' '66' '67Б' '69' '71' '72' '73' '74' '81' '83'\n '84' '85' '86' '90' '98' 'nan' 'ЕЕ'] not in index"

In [48]:

ipv = pd.read_csv('kody.txt', sep="\s+", header=None, names=[1, 2, 3, 4, 5])[[1, 2]]
ipv.head()

Unnamed: 0,1,2
0,01А,e1
1,01Г,e1
2,02А,e9
3,02Б,f4
4,02В,f4


In [51]:
ipvs_grouped_by_categories = {}

for category in categories:
    ipvs_grouped_by_categories[category] = ipv[ipv[2] == category]

In [92]:
classifiers = {}

array(['00', 'e1', 'e2', 'e3', 'e4', 'e5', 'e7', 'e8', 'e9', 'f1', 'f2',
       'f3', 'f4', 'f5', 'f7', 'f8', 'f9', 'z7'], dtype=object)

In [94]:
test.sample(1).text

88934    stability analysis total power control loop la...
Name: text, dtype: object

In [93]:
for category in categories[-2:]:
    print(category)
    train_for_category = grouped_by_categories[category]
    X_train = train_for_category.text
    ipvs_for_category = ipvs_grouped_by_categories[category][[1]].values.reshape((1, -1)).tolist()[0]
    Y_train = train_for_category[ipvs_for_category]
    classifiers[category] = create_model()
    classifiers[category].fit(X_train, Y_train)

f9
z7


ValueError: Unknown label type: (Empty DataFrame
Columns: []
Index: [219993, 223919, 242650, 254671, 262658, 264058],)

In [87]:
def create_model():
    pipeline = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 2), max_features=None)),
        ('tfidf', TfidfTransformer(norm='l2', use_idf=False)),
        ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=-1)),
    ])
    return pipeline

In [95]:
pipeline = create_model()

In [100]:
some_sample = train.sample(10000)

In [101]:
X_sample = some_sample.text
Y_sample = some_sample[categories]

In [None]:
train_sample.head()

In [102]:
pipeline.fit(X_sample, Y_sample)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip...ti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=-1))])

In [110]:
dude_sample = train.sample(1)

In [112]:
dude_sample.

Unnamed: 0,text,00,e1,e2,e3,e4,e5,e7,e8,e9,...,74,81,83,84,85,86,90,98,nan,ЕЕ
156408,apex angle dependent resonances triangular spl...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [111]:
pipeline.predict(dude_sample.text)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]])

In [115]:
classifiers['f5'].predict(dude_sample.text)

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [118]:
dude_sample[ipvs_grouped_by_categories['f5'][[1]].values.reshape((1, -1)).tolist()[0]]

Unnamed: 0,18Б,18В,18Г,18Е,18Ж,18И,18К,18Л,18П,18Т,18У,18Ф
156408,0,0,0,0,0,0,0,0,1,0,0,0


In [116]:
ipvs_grouped_by_categories['f5']

Unnamed: 0,1,2
86,18Б,f5
87,18В,f5
88,18Г,f5
89,18Е,f5
90,18Ж,f5
91,18И,f5
92,18К,f5
93,18Л,f5
94,18П,f5
95,18Т,f5


In [114]:
dude_sample[categories]

Unnamed: 0,00,e1,e2,e3,e4,e5,e7,e8,e9,f1,f2,f3,f4,f5,f7,f8,f9,z7
156408,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [113]:
dude_sample.text.values[0]

'apex angle dependent resonances triangular split ring resonators other frequency selective structures pendry et al ieee trans microw theory tech circles squares triangular split ring resonators tsrrs only allow frequencies center resonant frequency propagate further tsrrs attractive due their small surface area vidhyalakshmi et al stopband characteristics complementary triangular split ring resonator loaded microstrip line comparatively large quality factors previously investigated gay balmaz et al appl phys work examine effects varying apex angle resonant frequency factor phase shift imparted tsrr element within ghz frequency regime'

In [9]:
pipeline.fit(X_train, Y_train)

Pipeline(memory='cache',
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.2, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip...ti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=-1))])

In [16]:
pipeline.score(X_test, Y_test)

0.6906091533474549

In [17]:
predictions = pipeline.predict(X_test)
successful = 0
total = len(predictions)

for i in range(total):
    actual = np.array(Y_test.iloc[i])
    
    if (i % 10000 == 0):
        print(i)
    
    if (abs(np.sum(predictions[i] - actual)) <= 1):
        successful += 1

0
10000
20000
30000
40000
50000
60000
70000
80000
90000


In [18]:
successful / total

0.985675161007339

In [19]:
hamming_score(np.array(Y_test), predictions)

0.7663699709723203

In [20]:
from sklearn.metrics import classification_report
print(classification_report(np.array(Y_test), predictions, target_names=categories))

             precision    recall  f1-score   support

         00       0.00      0.00      0.00       694
         e1       0.83      0.67      0.74      8904
         e2       0.85      0.76      0.80      1742
         e3       0.93      0.87      0.90     13456
         e4       0.74      0.66      0.70      2658
         e5       0.83      0.75      0.79      3198
         e7       0.73      0.48      0.58       119
         e8       0.66      0.71      0.68      2749
         e9       0.67      0.24      0.36      1883
         f1       0.75      0.57      0.65      3272
         f2       0.68      0.43      0.52      1643
         f3       0.70      0.67      0.68      5879
         f4       0.90      0.74      0.81      1154
         f5       0.79      0.82      0.81     19744
         f7       0.91      0.88      0.89     36871
         f8       0.84      0.09      0.16       302
         f9       0.79      0.66      0.72      5184
         z7       1.00      0.40      0.57   

In [21]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(np.array(Y_test), predictions, average='weighted')

(0.8341074107181181, 0.7772458591044885, 0.8007902074354197, None)