In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report


class SentenceGetter(object):
    """Group a one-token-per-row corpus frame into sentences.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``SENT_NO`` (sentence id), ``WORD``,
        ``POS`` and ``CAT``.

    Attributes
    ----------
    data : pandas.DataFrame
        The frame that was passed in (not copied).
    grouped :
        Result of ``data.groupby("SENT_NO").apply(...)``; for each sentence
        id it holds a list of ``(word, pos, cat)`` tuples.
    sentences : list
        The per-sentence lists from ``grouped``, in ``grouped``'s order.
    n_sent : int
        Counter consumed by :meth:`get_next`; starts at 0.
    empty : bool
        Initialised to ``False`` and never modified by this class.
    """

    def __init__(self, data):
        self.n_sent = 0
        self.data = data
        self.empty = False

        def agg_func(s):
            # Only the word is coerced with str(); POS and CAT are passed
            # through exactly as stored in the frame.
            return [(str(w), p, t) for w, p, t in zip(s["WORD"].values.tolist(),
                                                      s["POS"].values.tolist(),
                                                      s["CAT"].values.tolist())]

        self.grouped = self.data.groupby("SENT_NO").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence keyed by ``str(self.n_sent)`` and advance the counter.

        The lookup key is a string, so this assumes the ``SENT_NO`` values
        are strings (e.g. the frame was loaded with ``dtype='str'``).

        Returns
        -------
        list of (word, pos, cat) tuples, or ``None`` when no sentence with
        that id exists (the counter is left unchanged in that case).
        """
        try:
            s = self.grouped[str(self.n_sent)]
        except KeyError:
            # Only a missing sentence id means "nothing left"; any other
            # error is a genuine problem and must not be silenced.
            return None
        self.n_sent += 1
        return s

def word2features(sent, i):
    """Return the baseline feature dict for the token at index ``i`` of ``sent``.

    ``sent`` is a sequence of token records whose first item is the word and
    whose second item is its POS tag. This baseline uses only those two raw
    values: no casing, suffix, or neighbouring-token features are emitted.
    """
    token, pos_tag = sent[i][0], sent[i][1]
    return {'word': token, 'postag': pos_tag}
def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts
def sent2labels(sent):
    """Return the third field (the label) of every 3-item token record in ``sent``."""
    labels = []
    for _word, _pos, tag in sent:
        labels.append(tag)
    return labels
def sent2tokens(sent):
    """Return the first field (the word) of every 3-item token record in ``sent``."""
    words = []
    for word, _pos, _tag in sent:
        words.append(word)
    return words


# Load the token-level corpus. The file is resolved relative to the working
# directory instead of a machine-specific absolute path, so the notebook can
# run on any checkout. Every column is read as a string (dtype='str').
corpus = pd.read_csv('NCBI_corpus_trainset.csv', index_col=0, dtype='str')
getter = SentenceGetter(corpus)
sentences = getter.sentences

# One list of feature dicts / one list of labels per sentence.
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

from sklearn_crfsuite import CRF
# L-BFGS training; c1 and c2 are the L1 and L2 regularisation coefficients.
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

# 5-fold cross-validated predictions, scored per token against the gold labels.
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

# Refit on the full data so the learned weights can be inspected.
crf.fit(X, y)
import IPython
from eli5 import show_weights
show_weights(crf)

# After adding more features, including features of the preceding and following words, both precision and recall improve.


                  precision    recall  f1-score   support

               0       0.94      0.98      0.96    114771
CompositeMention       0.40      0.26      0.32       665
    DiseaseClass       0.56      0.29      0.39      2406
        Modifier       0.41      0.22      0.29      2356
 SpecificDisease       0.60      0.41      0.49      7929

        accuracy                           0.92    128127
       macro avg       0.58      0.44      0.49    128127
    weighted avg       0.90      0.92      0.91    128127



From \ To,0,CompositeMention,DiseaseClass,Modifier,SpecificDisease
0,2.44,0.549,0.364,1.038,0.801
CompositeMention,-3.376,4.139,-3.327,-3.322,-2.921
DiseaseClass,-1.371,0.0,3.557,-2.282,-2.111
Modifier,-0.799,0.0,-2.393,3.766,-1.616
SpecificDisease,-0.82,-1.201,-1.622,-0.446,3.584

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+5.052,word:patients..,,,
+4.750,word:suppressor,,,
+4.674,word:phenotype..,,,
+4.665,word:and/or,,,
+4.647,word:characteristic,,,
+4.197,word:families..,,,
+4.184,word:due,,,
+4.050,word:comprises,,,
+4.024,word:carrier,,,
+3.969,word:protein..,,,

Weight?,Feature
+5.052,word:patients..
+4.750,word:suppressor
+4.674,word:phenotype..
+4.665,word:and/or
+4.647,word:characteristic
+4.197,word:families..
+4.184,word:due
+4.050,word:comprises
+4.024,word:carrier
+3.969,word:protein..

Weight?,Feature
+6.156,word:and/or
+5.047,word:Duchenne/Becker
+4.141,word:Becker
+4.123,word:ovarian
+3.522,word:bilateral
+3.459,word:epididymis
+3.255,word:predisposition
+3.241,word:cysts
+3.142,word:BRCA-linked
+3.067,word:endometrial

Weight?,Feature
+7.274,word:lipomas
+7.203,word:hypopigmentation
+6.299,word:enzymopathy
+5.965,word:arthralgias
+5.932,word:myopia
+5.894,word:chondrodysplasia
+5.867,word:hyperphenylalaninemia
+5.798,word:neurodegeneration
+5.601,word:insufficiency
+5.566,word:adenomas

Weight?,Feature
+7.935,word:HD
+7.559,word:FAP
+7.529,word:TSD
+7.072,word:CHM
+6.922,word:FMF
+6.649,word:VWS
+6.464,word:choroideremia
+6.432,word:AKU
+6.345,word:Tay-Sachs
+6.309,word:MLD

Weight?,Feature
+7.421,word:AS
+6.950,word:adrenomyeloneuropathy
+6.828,word:HD
+6.554,word:AMN
+6.431,word:PMD
+6.292,word:malaria
+6.173,word:CP
+6.094,word:FMF
+6.036,word:phenylketonuria
+5.992,word:FAP


In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report


class SentenceGetter(object):
    """Group a one-token-per-row corpus frame into sentences.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``SENT_NO`` (sentence id), ``WORD``,
        ``POS`` and ``CAT``.

    Attributes
    ----------
    data : pandas.DataFrame
        The frame that was passed in (not copied).
    grouped :
        Result of ``data.groupby("SENT_NO").apply(...)``; for each sentence
        id it holds a list of ``(word, pos, cat)`` tuples.
    sentences : list
        The per-sentence lists from ``grouped``, in ``grouped``'s order.
    n_sent : int
        Counter consumed by :meth:`get_next`; starts at 0.
    empty : bool
        Initialised to ``False`` and never modified by this class.
    """

    def __init__(self, data):
        self.n_sent = 0
        self.data = data
        self.empty = False

        def agg_func(s):
            # Only the word is coerced with str(); POS and CAT are passed
            # through exactly as stored in the frame.
            return [(str(w), p, t) for w, p, t in zip(s["WORD"].values.tolist(),
                                                      s["POS"].values.tolist(),
                                                      s["CAT"].values.tolist())]

        self.grouped = self.data.groupby("SENT_NO").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence keyed by ``str(self.n_sent)`` and advance the counter.

        The lookup key is a string, so this assumes the ``SENT_NO`` values
        are strings (e.g. the frame was loaded with ``dtype='str'``).

        Returns
        -------
        list of (word, pos, cat) tuples, or ``None`` when no sentence with
        that id exists (the counter is left unchanged in that case).
        """
        try:
            s = self.grouped[str(self.n_sent)]
        except KeyError:
            # Only a missing sentence id means "nothing left"; any other
            # error is a genuine problem and must not be silenced.
            return None
        self.n_sent += 1
        return s

def word2features(sent, i):
    """Return the feature dict for the token at index ``i`` of ``sent``.

    ``sent`` is a sequence of token records whose first item is the word and
    whose second item is its POS tag. The result combines:

    * features of the token itself (bias, lower-cased form, 3- and
      2-character suffixes, casing/digit flags, POS tag);
    * features of the previous token, or ``'BOS'`` when ``i`` is not positive;
    * features of the next token, or ``'EOS'`` when ``i`` is the last index
      or beyond.
    """
    token = sent[i][0]
    pos_tag = sent[i][1]

    feats = {}
    feats['bias'] = 1.0
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = pos_tag

    # Left context: flag the sentence start, otherwise describe the previous token.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_pos = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos
        feats['-1:postag[:2]'] = prev_pos[:2]

    # Right context: flag the sentence end, otherwise describe the next token.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_pos = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos

    return feats
def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts
def sent2labels(sent):
    """Return the third field (the label) of every 3-item token record in ``sent``."""
    labels = []
    for _word, _pos, tag in sent:
        labels.append(tag)
    return labels
def sent2tokens(sent):
    """Return the first field (the word) of every 3-item token record in ``sent``."""
    words = []
    for word, _pos, _tag in sent:
        words.append(word)
    return words


# Read the token-level corpus (all columns as strings) and regroup it into sentences.
corpus = pd.read_csv('NCBI_corpus_trainset.csv', index_col=0, dtype='str')
getter = SentenceGetter(corpus)
sentences = getter.sentences

# Per-sentence feature dict sequences and the matching label sequences.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

from sklearn_crfsuite import CRF
# L-BFGS training; c1 and c2 are the L1 and L2 regularisation coefficients.
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False,
)

# 5-fold cross-validated predictions, then a token-level report against the gold labels.
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
report = flat_classification_report(y_true=y, y_pred=pred)
print(report)

# Refit on all sentences so the learned weights can be displayed.
crf.fit(X, y)
import IPython
from eli5 import show_weights
show_weights(crf)

                  precision    recall  f1-score   support

               0       0.95      0.99      0.97    114771
CompositeMention       0.68      0.31      0.43       665
    DiseaseClass       0.61      0.33      0.43      2406
        Modifier       0.79      0.44      0.57      2356
 SpecificDisease       0.71      0.54      0.61      7929

        accuracy                           0.93    128127
       macro avg       0.75      0.52      0.60    128127
    weighted avg       0.92      0.93      0.93    128127



From \ To,0,CompositeMention,DiseaseClass,Modifier,SpecificDisease
0,2.947,-0.706,-1.952,-1.269,-0.814
CompositeMention,-3.057,5.299,-3.184,-2.357,-1.646
DiseaseClass,-1.92,0.0,2.983,-2.538,-1.994
Modifier,-1.121,0.0,-2.713,3.949,-1.611
SpecificDisease,-1.279,-0.457,-2.165,-0.607,4.333

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+4.710,-1:word.lower():dystrophy,,,
+4.164,word.lower():analysis,,,
+4.010,-1:word.lower():nephropathy,,,
+3.821,word.lower():neurofibrosarcoma,,,
+3.757,-1:word.lower():ataxia-telangiectasia,,,
+3.711,-1:word.lower():adrenomyeloneuropathy,,,
+3.674,-1:word.lower():meningococcemia,,,
+3.476,-1:word.lower():syndrome,,,
+3.440,-1:word.lower():aniridia,,,
+3.426,+1:word.lower():corneal,,,

Weight?,Feature
+4.710,-1:word.lower():dystrophy
+4.164,word.lower():analysis
+4.010,-1:word.lower():nephropathy
+3.821,word.lower():neurofibrosarcoma
+3.757,-1:word.lower():ataxia-telangiectasia
+3.711,-1:word.lower():adrenomyeloneuropathy
+3.674,-1:word.lower():meningococcemia
+3.476,-1:word.lower():syndrome
+3.440,-1:word.lower():aniridia
+3.426,+1:word.lower():corneal

Weight?,Feature
+3.175,-1:word.lower():testing
+2.657,word.lower():breast-ovarian
+2.480,word.lower():duchenne/becker
+2.329,-1:word.lower():duchenne/becker
+2.328,word[-3:]:ker
+2.205,+1:word.lower():predisposition
+2.114,+1:word.lower():results
+2.105,+1:word.lower():lip/palate
+2.094,word.lower():predisposition
+2.090,-1:word.lower():becker

Weight?,Feature
+7.701,word.lower():hypopigmentation
+6.056,word.lower():neurodegeneration
+3.826,word.lower():demyelination
+3.750,-1:word.lower():dysplasia
+3.490,word.lower():adenoma
+3.394,-1:word.lower():pathway
+3.310,word.lower():immunodeficiency
+3.182,word.lower():lipomas
+3.029,word.lower():goiter
+3.011,word.lower():leukemia

Weight?,Feature
+5.419,word.lower():aniridia-associated
+5.260,word.lower():c5-deficient
+4.898,word.lower():obese
+4.402,word.lower():hd-affected
+4.000,word.lower():cataract
+3.779,word.lower():neoplastic
+3.712,-1:word.lower():mcf-7
+3.711,word.lower():nonneoplastic
+3.675,word.lower():tumor-specific
+3.601,word.lower():c2-deficient

Weight?,Feature
+6.688,word.lower():cataracts
+5.800,word.lower():obesity
+5.659,word.lower():hypomyelination
+4.444,word.lower():goiter
+4.018,word.lower():hypotonic
+3.909,word.lower():nephritis..
+3.556,word[-2:]:GS
+3.471,word.lower():piebaldism
+3.471,word[-2:]:PS
+3.444,word.lower():deficiencies


In [None]:
# Repeat the experiment with the L1 coefficient c1 raised to 10.
# This cell relies on X, y, CRF, cross_val_predict,
# flat_classification_report and show_weights already being defined by
# earlier cells.
crf = CRF(
    algorithm='lbfgs',
    c1=10,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False,
)
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
report = flat_classification_report(y_true=y, y_pred=pred)
print(report)

# Refit on all sentences so the learned weights can be displayed.
crf.fit(X, y)
show_weights(crf)
