In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV


In [2]:
def prepare_data(filename):
    data = pd.read_csv(filename, sep="\t")
    data = data.sample(10000)
    data = data[['text', 'subj','ipv']]
    data['subj'] = data['subj'].apply(lambda subj: subj.split('\\'))
    data['ipv'] = data['ipv'].apply(lambda ipv: str(ipv).split('\\'))
    
    mlb = MultiLabelBinarizer()
    encoded_subjects = pd.DataFrame(mlb.fit_transform(data.pop('subj')), columns=mlb.classes_, index=data.index)
    mlb_ipv = MultiLabelBinarizer()
    encoded_ipv = pd.DataFrame(mlb_ipv.fit_transform(data.pop('ipv')), columns=mlb_ipv.classes_, index=data.index)
    data = data.join(encoded_subjects).join(encoded_ipv)
    
    return data, mlb.classes_, mlb_ipv.classes_

In [3]:
train, categories, ipvs = prepare_data('learn_cleaned.txt')
test, _, _= prepare_data('test_cleaned.txt')
print('Categoreis: {}'.format(categories))
print('Categoreis: {}'.format(ipvs))
train.head()

Categoreis: ['00' 'e1' 'e2' 'e3' 'e4' 'e5' 'e7' 'e8' 'e9' 'f1' 'f2' 'f3' 'f4' 'f5'
 'f7' 'f8' 'f9']
Categoreis: ['01A' '01G' '02A' '02B' '02V' '03' '04A1' '04A2' '04A3' '04A4' '04B1'
 '04B2' '04B3' '04B4' '04D3' '04D6' '04D7' '04I1' '04I11' '04I4' '04I8'
 '04I9' '04Ja2' '04Ja3' '04Ja4' '04Ja7' '04K1' '04M1' '04M2' '04M3' '04M6'
 '04M7' '04M8' '04N4' '04P1' '04R1' '04T4' '04T5' '04T6' '04V1' '04V2'
 '04V3' '04V4' '04V5' '04V7' '04V8' '04V9' '05A' '05B' '06A' '06B' '07D'
 '07P' '07R' '08A' '08E' '08K' '08M' '08N' '08P' '08V' '09A' '09B' '09G'
 '09I' '10AV' '10D' '10G' '11B' '11E' '11Zh' '12' '13A' '13B' '13D' '13V'
 '14A' '14B' '14D' '15A' '15B' '15D' '15E' '15G' '15I' '15V' '16A' '16B'
 '16D' '16G' '16V' '17A' '17G' '18A' '18B' '18E' '18F' '18G' '18I' '18K'
 '18L' '18P' '18T' '18U' '18V' '18Zh' '19A' '19B1' '19B2' '19B3' '19B4'
 '19E' '19F' '19GD' '19I' '19L' '19M' '19N' '19O' '19P' '19R1' '19S' '19T'
 '19U' '19V' '19Zh' '20B' '20D' '20I' '20M' '20R' '21AN' '21B' '21E' '21I'
 '21Ju' '21

Unnamed: 0,text,00,e1,e2,e3,e4,e5,e7,e8,e9,...,74,81,83,84,85,86,90,98,EE,nan
83720,alkenes alcohols cobalt catalyzed hydroformyla...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
253187,surface geodesic pattern deformable texture ma...,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
70171,viability grid connected solar pv energy syste...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
165401,situating queer migration within national welf...,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
56909,influence unsaturated hydrocarbon ligands stab...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
train = train.drop('00', 1)
train = train.drop('nan', 1)

In [5]:
categories = categories[1:]

In [6]:
categories

array(['e1', 'e2', 'e3', 'e4', 'e5', 'e7', 'e8', 'e9', 'f1', 'f2', 'f3',
       'f4', 'f5', 'f7', 'f8', 'f9'], dtype=object)

In [7]:
ipvs

array(['01A', '01G', '02A', '02B', '02V', '03', '04A1', '04A2', '04A3',
       '04A4', '04B1', '04B2', '04B3', '04B4', '04D3', '04D6', '04D7',
       '04I1', '04I11', '04I4', '04I8', '04I9', '04Ja2', '04Ja3', '04Ja4',
       '04Ja7', '04K1', '04M1', '04M2', '04M3', '04M6', '04M7', '04M8',
       '04N4', '04P1', '04R1', '04T4', '04T5', '04T6', '04V1', '04V2',
       '04V3', '04V4', '04V5', '04V7', '04V8', '04V9', '05A', '05B',
       '06A', '06B', '07D', '07P', '07R', '08A', '08E', '08K', '08M',
       '08N', '08P', '08V', '09A', '09B', '09G', '09I', '10AV', '10D',
       '10G', '11B', '11E', '11Zh', '12', '13A', '13B', '13D', '13V',
       '14A', '14B', '14D', '15A', '15B', '15D', '15E', '15G', '15I',
       '15V', '16A', '16B', '16D', '16G', '16V', '17A', '17G', '18A',
       '18B', '18E', '18F', '18G', '18I', '18K', '18L', '18P', '18T',
       '18U', '18V', '18Zh', '19A', '19B1', '19B2', '19B3', '19B4', '19E',
       '19F', '19GD', '19I', '19L', '19M', '19N', '19O', '19P', '19R1',
  

In [8]:
ipvs= ipvs[:-1]

In [9]:
ipvs

array(['01A', '01G', '02A', '02B', '02V', '03', '04A1', '04A2', '04A3',
       '04A4', '04B1', '04B2', '04B3', '04B4', '04D3', '04D6', '04D7',
       '04I1', '04I11', '04I4', '04I8', '04I9', '04Ja2', '04Ja3', '04Ja4',
       '04Ja7', '04K1', '04M1', '04M2', '04M3', '04M6', '04M7', '04M8',
       '04N4', '04P1', '04R1', '04T4', '04T5', '04T6', '04V1', '04V2',
       '04V3', '04V4', '04V5', '04V7', '04V8', '04V9', '05A', '05B',
       '06A', '06B', '07D', '07P', '07R', '08A', '08E', '08K', '08M',
       '08N', '08P', '08V', '09A', '09B', '09G', '09I', '10AV', '10D',
       '10G', '11B', '11E', '11Zh', '12', '13A', '13B', '13D', '13V',
       '14A', '14B', '14D', '15A', '15B', '15D', '15E', '15G', '15I',
       '15V', '16A', '16B', '16D', '16G', '16V', '17A', '17G', '18A',
       '18B', '18E', '18F', '18G', '18I', '18K', '18L', '18P', '18T',
       '18U', '18V', '18Zh', '19A', '19B1', '19B2', '19B3', '19B4', '19E',
       '19F', '19GD', '19I', '19L', '19M', '19N', '19O', '19P', '19R1',
  

In [10]:
grouped_by_categories = {}

for category in categories:
    grouped_by_categories[category] = train[train[category] == 1]

In [11]:
print('Categoreis: {}'.format(categories))


Categoreis: ['e1' 'e2' 'e3' 'e4' 'e5' 'e7' 'e8' 'e9' 'f1' 'f2' 'f3' 'f4' 'f5' 'f7'
 'f8' 'f9']


In [12]:
X_train = train.text
X_test = test.text
Y_train = train[categories]
Y_test = test[categories]

In [13]:
ipv = pd.read_csv('Kody.txt', sep="\s+", header=None, names=[1, 2, 3, 4, 5])[[1, 2]]
ipv.head()

Unnamed: 0,1,2
0,01A,e1
1,01G,e1
2,02A,e9
3,02B,f4
4,02V,f4


In [14]:
ipvs_grouped_by_categories = {}

for category in categories:
    ipvs_grouped_by_categories[category] = ipv[ipv[2] == category]

In [15]:
classifiers = {}

In [51]:
def create_model():
    pipeline = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 2), max_features=None)),
        ('tfidf', TfidfTransformer(norm='l2', use_idf=False)),
        ('clf', OneVsRestClassifier(CalibratedClassifierCV(LinearSVC(), cv=10), n_jobs=-1))
    ])
    return pipeline

In [19]:
for category in categories:
    print(category)
    train_for_category = grouped_by_categories[category]
    X_train = train_for_category.text
    print('X_train')
    print(X_train)
    ipvs_for_category = ipvs_grouped_by_categories[category][[1]].values.reshape((1, -1)).tolist()[0]
    Y_train = train_for_category[ipvs_for_category]
    print('ipvs_for_category')
    print(ipvs_for_category)
    print('Y_train')
    print(Y_train)
    classifiers[category] = create_model()
    classifiers[category].fit(X_train, Y_train)

e1
X_train
70366     smart places multi agent based smart mobile vi...
205209    comparison current ratio mobility between sige...
39860     iterative refinement hierarchical tt meshes ba...
26531     study semi supervised fcm algorithm most varia...
127033    reduction effect electron relaxation behavior ...
28911     methodology characterization flow conductivity...
199358    exploiting significance computations energy co...
85818     building consistent framework executable syste...
234133    micc tool computing short distances curve comp...
97828     using entropy based mean shift filter modified...
7954      design simulation hybrid cmos set circuits sin...
122840    modeling passive uhf rfid multipath channel ta...
135182    fourier domain mode delay measurement multimod...
239674    enhanced photovoltaic performance inverted pol...
118616    quantified abstract configurations distributed...
267697    size reduction composite right left handed tra...
260110    plasmon resonance e

e2
X_train
238058    automatic cloud detection high resolution sate...
234811    supersymmetric sneutrino higgs inflation abstr...
8911      astrium spaceplane scientific missions years n...
133360    revised terrain correction method forest canop...
113180    structural knowledge learning maps supervised ...
77452     main belt comets pan starrs perspective analyz...
182459    groundwater flow induced collapse flooding noc...
93288     seismic model mars effects hydration arguments...
248295    excess alanine amino acids synthesized plasma ...
94530     regional atmospheric influence chandler wobble...
62919     quantum hoop conjecture black hole formation p...
257624    intensity based stochastic model terrestrial l...
14161     high resolution terahertz spectroscopy quantum...
35660     modulation auroras pc pulsations dawn sector a...
270764    nonminimally coupled massless scalar hair sphe...
133245    determination age oil palm crown projection ar...
46741     improved dark objec

e4
X_train
98853     evolution topography post devonian scandinavia...
92475     reservoir induced seismicity high seismicity r...
98829     influence aerosols life cycle radiation fog ev...
194649    reconstructing evolution eroded miocene calder...
171078    variations macrobenthic community structure re...
92273     decomposition allocation energy related carbon...
253468    cooperation competition climate change policie...
231887    role forest fire severity vegetation recovery ...
62507     lake connectivity fish species richness southe...
113180    structural knowledge learning maps supervised ...
264388    gis based reconstruction late weichselian prog...
64176     constraining input output fluxes southern cent...
42497     spatiotemporal characteristics regional drough...
261025    analysis demonstration investment implementati...
42799     how big how bad how often extreme events accou...
91874     prediction heavy rainfall events rangamati ban...
88376     hotspot swells revi

e7
X_train
86833     novel harmony search based algorithms part of ...
195669    linguistic construal disciplinarity data minin...
69306     organizational knowledge management related an...
138759    serials worth their weight knowledge value stu...
75442     empirical study factors influencing user perce...
119793    linguistically motivated taxonomy machine tran...
94552     relationship library assessment student retent...
190895    text matrix tool increase cohesion extensive t...
114794    supervised topic models word order structure d...
171439    driven adaptation grounded theory study licens...
80753     answering keyword queries cached subqueries be...
120436    finding parents orphan works using genealogica...
11386     recent trends information organization researc...
169395    bibliometric network analysis field computatio...
95522     assisting digital interoperability preservatio...
254585    which type citation analysis generates most ac...
224695    user behaviours cri

KeyError: "['33'] not in index"

In [52]:
pipeline = create_model()

In [53]:
some_sample = train.sample(10000)

In [54]:
X_sample = some_sample.text
Y_sample = some_sample[categories]

In [55]:
some_sample.head()

Unnamed: 0,text,e1,e2,e3,e4,e5,e7,e8,e9,f1,...,73,74,81,83,84,85,86,90,98,EE
41584,additive modelling reveals spatiotemporal pcbs...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46719,assessment satellite ocean color products meri...,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
22239,uplc versus hplc drug analysis advantageous ap...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
251993,static arrangement capillary porous system cps...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47266,controlled growth sinps plasma synthesis quant...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
pipeline.fit(X_sample, Y_sample)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip...tate=None, tol=0.0001,
     verbose=0),
            cv=10, method='sigmoid'),
          n_jobs=-1))])

In [57]:
test = pipeline.predict(X_test)

In [60]:
test

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])

In [170]:
probs = pipeline.predict_proba(X_test)

In [171]:
probs

array([[1.26261438e-02, 1.13832855e-03, 9.44408654e-01, ...,
        5.52761095e-02, 1.12587962e-03, 2.05641418e-02],
       [7.28064839e-02, 2.43053995e-02, 2.33380468e-02, ...,
        2.51134089e-02, 1.16962261e-03, 4.61963110e-02],
       [1.27298268e-02, 7.03842539e-03, 2.65388386e-03, ...,
        5.46420010e-03, 2.15174378e-03, 5.51437236e-02],
       ...,
       [8.84849899e-03, 1.41877865e-03, 2.51821560e-03, ...,
        9.97046376e-01, 4.35514661e-04, 6.93922817e-02],
       [1.36680512e-02, 7.47791372e-03, 1.90445469e-03, ...,
        6.78980299e-03, 8.30052906e-04, 5.80869879e-02],
       [5.60190480e-03, 5.17157319e-04, 4.72557443e-04, ...,
        5.55100455e-01, 6.41978847e-04, 3.66694061e-02]])

In [172]:
proba = pipeline.predict_proba([X_test[33146]])

In [173]:
proba

array([[0.01262614, 0.00113833, 0.94440865, 0.00307145, 0.00363376,
        0.00134895, 0.00448919, 0.00299574, 0.00430893, 0.00342805,
        0.01602851, 0.0053756 , 0.06187566, 0.05527611, 0.00112588,
        0.02056414]])

In [70]:
for i in range(len(proba)):
            for j in proba[i]:
                if j == 1:
                    output = categories[proba[i] == 1]
                    print(output)

In [72]:
categories

array(['e1', 'e2', 'e3', 'e4', 'e5', 'e7', 'e8', 'e9', 'f1', 'f2', 'f3',
       'f4', 'f5', 'f7', 'f8', 'f9'], dtype=object)

In [71]:
categories[proba[i] == 1]

array([], dtype=object)

In [103]:
prediction

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [112]:
prediction[0][5] = 1
prediction.append(np.array([0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]))

AttributeError: 'numpy.ndarray' object has no attribute 'append'

In [104]:
proba

array([[0.01262614, 0.00113833, 0.94440865, 0.00307145, 0.00363376,
        0.00134895, 0.00448919, 0.00299574, 0.00430893, 0.00342805,
        0.01602851, 0.0053756 , 0.06187566, 0.05527611, 0.00112588,
        0.02056414]])

In [123]:
X_test

33146    annexin nuclear translocation induces retinal ...
53415    analysis primary bleed feed strategies selecte...
88689    nonlinear magnetoelectric effects composite fe...
14963    brittle to ductile transition ti tib metal mat...
6865     microstructural analysis orientation dependent...
85098    ethyl cellulose cellulose acetate carboxymethy...
40302    generalized derivatives hybrid systems establi...
16058    descriptors regions interest fusion in cross d...
72639    traumatic vertebral artery dissection high sch...
65155    exploring rapidio technology within daq system...
1931     role structural compositional details atomisti...
886      thk pet assessing neurofibrillary pathology al...
74108    reactivity synthetic applications functionaliz...
84422    stimuli responsive ion gels based polysacchari...
77066    vitamin saudi arabia prevalence distribution d...
40291    distributed finite element kalman filter field...
32981    performance probiotic fermented sheep milk ice.

In [126]:
prediction = pipeline.predict(X_test)
#proba = pipeline.predict_proba(["annexin nuclear translocation induces retinal ganglion cell apoptosis ischemia reperfusion injury il pathway abstract degeneration retinal ganglion cells rgcs been identified major problem glaucoma previous studies indicated association between annexin anxa neuronal cell apoptosis rgcs apoptosis acute ischemia reperfusion attributed increased production il found expression nuclear translocation anxa upregulated models acute ischemia reperfusion rgcs vivo anxa found promoting effect expression il primary cultured rgcs which could inhibited treatment anxa shrna inhibitor bay anxa interacted recruited nucleus chromatin immunoprecipitation assay revealed anxa accumulated il gene promoter reduction nuclear translocation using membrane permeable anxa peptide containing ser ala mutation led decrease expression il acute ischemia reperfusion induced rgcs apoptosis vivo results indicate rgcs anxa increases il expression recruiting nucleus which induces cell apoptosis obtained results may help development novel treatment strategy rgcs apoptosis acute ischemia reperfusion injury annexin nuclear translocation retinal ganglion cell il"])          

In [136]:
type(prediction)

numpy.ndarray

In [153]:
output = np.array([])

In [154]:
import numpy as np

In [169]:
prediction = pipeline.predict([X_test[33146]])
for i in range(len(prediction)):
    for j in prediction[i]:
        
        if j == 1:
            
            output = categories[prediction[i] == 1]
            output2 = proba[prediction == 1]
            print(output, output2)

['e3'] [0.94440865]


In [159]:
proba = pipeline.predict_proba([X_test[33146]])

In [160]:
proba

array([[0.01262614, 0.00113833, 0.94440865, 0.00307145, 0.00363376,
        0.00134895, 0.00448919, 0.00299574, 0.00430893, 0.00342805,
        0.01602851, 0.0053756 , 0.06187566, 0.05527611, 0.00112588,
        0.02056414]])

In [156]:
np.array([categories])[prediction == 1]

IndexError: boolean index did not match indexed array along dimension 0; dimension is 1 but corresponding boolean dimension is 10000

In [168]:
proba[prediction == 1]

array([0.94440865])

In [127]:
output

['f5', 'f7']

In [162]:

for i in range(len(output)):
    for j in range(len(categories)):
        if categories[j] == output[i]:
            print(proba[i][j])

In [61]:
import pickle

In [62]:
pickle.dump(pipeline, open('model-svm.pkl','wb'))

In [59]:
probs

array([[1.26261438e-02, 1.13832855e-03, 9.44408654e-01, ...,
        5.52761095e-02, 1.12587962e-03, 2.05641418e-02],
       [7.28064839e-02, 2.43053995e-02, 2.33380468e-02, ...,
        2.51134089e-02, 1.16962261e-03, 4.61963110e-02],
       [1.27298268e-02, 7.03842539e-03, 2.65388386e-03, ...,
        5.46420010e-03, 2.15174378e-03, 5.51437236e-02],
       ...,
       [8.84849899e-03, 1.41877865e-03, 2.51821560e-03, ...,
        9.97046376e-01, 4.35514661e-04, 6.93922817e-02],
       [1.36680512e-02, 7.47791372e-03, 1.90445469e-03, ...,
        6.78980299e-03, 8.30052906e-04, 5.80869879e-02],
       [5.60190480e-03, 5.17157319e-04, 4.72557443e-04, ...,
        5.55100455e-01, 6.41978847e-04, 3.66694061e-02]])

In [51]:
test

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])

In [52]:
df = pd.DataFrame(test, columns=categories)

In [53]:
df

Unnamed: 0,e1,e2,e3,e4,e5,e7,e8,e9,f1,f2,f3,f4,f5,f7,f8,f9
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
9,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [40]:
X_test

82193    fusing disparate object signatures salient obj...
69545    understanding polychalcogenides building block...
14827    palmprint recognition local micro structure te...
42938    numerical prediction fretting fatigue crack tr...
68381    effects si al ti corrosion ni cr ni cr alloys ...
13023    monitoring emerging pollutants guadiamar river...
69004    synthesis anti oxidant activity evaluation ana...
53605    ni ceo based catalysts oxygen vectors chemical...
28119    catalytic deoxygen reaction landfill gas lfg i...
42978    pseudo model hydraulic fracture growth layered...
5278     relaxor properties bi tio batio ceramics study...
68882    functional groups docking pvdf membranes novel...
1101     morphological compositional characteristics bi...
66827    pozzolanic reaction lightweight fine aggregate...
37443    manycast routing modulation level spectrum ass...
71696    influence lifestyle airborne particle surface ...
71768    effect temperature partial molar volumes parti.

In [31]:
dude_sample

Unnamed: 0,text,e1,e2,e3,e4,e5,e7,e8,e9,f1,...,73,74,81,83,84,85,86,90,98,EE
224359,non volatile resistive memory device fabricate...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
pipeline.predict(dude_sample.text)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [33]:
classifiers['e3'].predict(dude_sample.text)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [118]:
dude_sample[ipvs_grouped_by_categories['f5'][[1]].values.reshape((1, -1)).tolist()[0]]

Unnamed: 0,18Б,18В,18Г,18Е,18Ж,18И,18К,18Л,18П,18Т,18У,18Ф
156408,0,0,0,0,0,0,0,0,1,0,0,0


In [116]:
ipvs_grouped_by_categories['f5']

Unnamed: 0,1,2
86,18Б,f5
87,18В,f5
88,18Г,f5
89,18Е,f5
90,18Ж,f5
91,18И,f5
92,18К,f5
93,18Л,f5
94,18П,f5
95,18Т,f5


In [114]:
dude_sample[categories]

Unnamed: 0,00,e1,e2,e3,e4,e5,e7,e8,e9,f1,f2,f3,f4,f5,f7,f8,f9,z7
156408,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [113]:
dude_sample.text.values[0]

'apex angle dependent resonances triangular split ring resonators other frequency selective structures pendry et al ieee trans microw theory tech circles squares triangular split ring resonators tsrrs only allow frequencies center resonant frequency propagate further tsrrs attractive due their small surface area vidhyalakshmi et al stopband characteristics complementary triangular split ring resonator loaded microstrip line comparatively large quality factors previously investigated gay balmaz et al appl phys work examine effects varying apex angle resonant frequency factor phase shift imparted tsrr element within ghz frequency regime'

In [9]:
pipeline.fit(X_train, Y_train)

Pipeline(memory='cache',
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.2, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip...ti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=-1))])

In [16]:
pipeline.score(X_test, Y_test)

0.6906091533474549

In [17]:
predictions = pipeline.predict(X_test)
successful = 0
total = len(predictions)

for i in range(total):
    actual = np.array(Y_test.iloc[i])
    
    if (i % 10000 == 0):
        print(i)
    
    if (abs(np.sum(predictions[i] - actual)) <= 1):
        successful += 1

0
10000
20000
30000
40000
50000
60000
70000
80000
90000


In [18]:
successful / total

0.985675161007339

In [19]:
hamming_score(np.array(Y_test), predictions)

0.7663699709723203

In [20]:
from sklearn.metrics import classification_report
print(classification_report(np.array(Y_test), predictions, target_names=categories))

             precision    recall  f1-score   support

         00       0.00      0.00      0.00       694
         e1       0.83      0.67      0.74      8904
         e2       0.85      0.76      0.80      1742
         e3       0.93      0.87      0.90     13456
         e4       0.74      0.66      0.70      2658
         e5       0.83      0.75      0.79      3198
         e7       0.73      0.48      0.58       119
         e8       0.66      0.71      0.68      2749
         e9       0.67      0.24      0.36      1883
         f1       0.75      0.57      0.65      3272
         f2       0.68      0.43      0.52      1643
         f3       0.70      0.67      0.68      5879
         f4       0.90      0.74      0.81      1154
         f5       0.79      0.82      0.81     19744
         f7       0.91      0.88      0.89     36871
         f8       0.84      0.09      0.16       302
         f9       0.79      0.66      0.72      5184
         z7       1.00      0.40      0.57   

In [21]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(np.array(Y_test), predictions, average='weighted')

(0.8341074107181181, 0.7772458591044885, 0.8007902074354197, None)