# read data

In [1]:
import csv

import numpy as np
import pandas as pd

#read txt file
def read_txt(filename):
    """Return the lines of ``filename`` as a list of strings.

    Every element keeps its trailing newline exactly as read. The file is
    opened in text mode with the platform default encoding.
    """
    with open(filename, 'r') as handle:
        return list(handle)

# read the training and testing data
traininglist = read_txt('../BERT/data_processed/train(no_BIO).txt')
testinglist = read_txt('../BERT/data_processed/test(no_BIO).txt')

# Split every line on whitespace into its fields (expected: token, label).
# A line with fewer than two fields produces a short row, which pandas pads
# with missing values when the frame is built below.
traininglist = [line.split() for line in traininglist]
testinglist = [line.split() for line in testinglist]

# convert the data into dataframe
testingdataframe = pd.DataFrame(testinglist, columns=['text', 'label'])
trainingdataframe = pd.DataFrame(traininglist, columns=['text', 'label'])

print(testingdataframe.shape)
print(trainingdataframe.shape)

# Forward-fill missing cells from the previous row. `.ffill()` replaces the
# deprecated `fillna(method='ffill')` spelling and gives the same result.
# NOTE(review): rows coming from blank lines therefore repeat the preceding
# token/label pair - confirm this duplication is intended.
df_training = trainingdataframe.ffill()
df_testing = testingdataframe.ffill()

(316727, 2)
(489694, 2)


In [2]:
# Per-label row counts: training table first, testing table second.
training_label_counts = df_training.groupby('label').size().reset_index(name='counts')
testing_label_counts = df_testing.groupby('label').size().reset_index(name='counts')
training_label_counts, testing_label_counts

(            label  counts
 0             CAD    5620
 1        DIABETES    2284
 2     FAMILY_HIST     254
 3  HYPERLIPIDEMIA     590
 4    HYPERTENSION    2188
 5      MEDICATION    6090
 6               O  470029
 7           OBESE     292
 8          SMOKER    2347,
             label  counts
 0             CAD    3951
 1        DIABETES    1676
 2     FAMILY_HIST     214
 3  HYPERLIPIDEMIA     493
 4    HYPERTENSION    1263
 5      MEDICATION    4148
 6               O  303157
 7           OBESE     168
 8          SMOKER    1657)

## Convert the labels into binary tags (CAD vs. O)

In [4]:
# Collapse every label other than 'CAD' to 'O' (CAD-vs-rest tagging).
# The label column is overwritten in place, so the multi-class labels are
# gone after this cell runs.
not_cad_training = df_training.label != 'CAD'
not_cad_testing = df_testing.label != 'CAD'
df_training.loc[not_cad_training, "label"] = 'O'
df_testing.loc[not_cad_testing, "label"] = 'O'

# Label counts after the conversion: testing table first, training table second.
testing_binary_counts = df_testing.groupby('label').size().reset_index(name='counts')
training_binary_counts = df_training.groupby('label').size().reset_index(name='counts')
testing_binary_counts, training_binary_counts

(  label  counts
 0   CAD    3951
 1     O  312776,
   label  counts
 0   CAD    5620
 1     O  484074)

# Vectorize and prepare data for ML input

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

# Separate the token column (features) from the label column (targets).
x_train = df_training.drop('label', axis=1)
y_train = df_training.label.values
x_test = df_testing.drop('label', axis=1)
y_test = df_testing.label.values

# Combine train and test into one frame before vectorizing so both parts end
# up with the same feature dimension. The training row count is captured
# first because x_train is rebound to the sparse matrix below.
n_train = x_train.shape[0]
v = DictVectorizer(sparse=True)
# pd.concat replaces DataFrame.append, which was deprecated and later removed.
tt = pd.concat([x_train, x_test])
ttt = v.fit_transform(tt.to_dict('records'))
x_train = ttt[:n_train]
x_test = ttt[n_train:]

# define labels
classes = np.unique(y_train)
classes

  tt = x_train.append(x_test)


array(['CAD', 'DIABETES', 'FAMILY_HIST', 'HYPERLIPIDEMIA', 'HYPERTENSION',
       'MEDICATION', 'O', 'OBESE', 'SMOKER'], dtype=object)

## Alternative: split only the testing data into train/test (gave similar results)

In [49]:
# Build features and labels from the testing frame only.
traininglabel_with_o = df_testing.label.values

# One dict per row goes into the vectorizer; the result is a sparse matrix.
v = DictVectorizer(sparse=True)
trainingtext_with_o = v.fit_transform(df_testing.drop('label', axis=1).to_dict('records'))

# Unique labels, as a plain list, used as the class set for evaluation.
classes = np.unique(traininglabel_with_o).tolist()

# 70/30 split with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    trainingtext_with_o,
    traininglabel_with_o,
    test_size=0.3,
    random_state=0,
)
# Not the last expression of the cell, so these shapes are not displayed.
x_train.shape, x_test.shape, y_train.shape, y_test.shape
# check new label counts
np.unique(y_train, return_counts=True), np.unique(y_test, return_counts=True)

((array(['CAD', 'O'], dtype=object), array([  2733, 218975], dtype=int64)),
 (array(['CAD', 'O'], dtype=object), array([ 1218, 93801], dtype=int64)))

## upsampling; downsampling

In [50]:
from sklearn.utils import resample
import numpy as np
from scipy.sparse import vstack
import random
random.seed(72)
# downsample() below draws its permutation from NumPy's global RNG, which the
# stdlib seed above does not touch; seed it too so the sample is repeatable.
np.random.seed(72)

def downsample(x_train, y_train, random_state=None, ratio=2):
    """Randomly downsample the majority class "O".

    All rows whose label is not "O" are kept; from the "O" rows a random
    subset of at most ``ratio`` times the number of kept rows is drawn.

    Parameters
    ----------
    x_train : row-indexable matrix (e.g. a scipy sparse matrix)
        Feature rows, aligned with ``y_train``.
    y_train : array-like of labels
        One label per row of ``x_train``.
    random_state : int or None, optional
        Seed for the permutation. ``None`` (default) uses NumPy's global
        random state, as the original implementation did.
    ratio : int, optional
        Majority rows kept per minority row (default 2).

    Returns
    -------
    (x, y) : the minority rows followed by the sampled majority rows, and
        the matching label array in the same order.
    """
    y_train = np.asarray(y_train)
    is_majority = y_train == "O"

    # Features and labels are selected with the SAME mask. The previous code
    # took features with `!= "O"` but labels with `== "CAD"`, which returned
    # x and y of different lengths whenever a third label was present.
    majority_df = x_train[is_majority]
    minority_df = x_train[~is_majority]
    majority_labels = y_train[is_majority]
    minority_labels = y_train[~is_majority]

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    row = rng.permutation(majority_df.shape[0])[:int(ratio * minority_df.shape[0])]
    print(majority_df.shape[0], minority_df.shape[0], row.shape[0])

    # label distribution before sampling
    print(np.unique(minority_labels, return_counts=True))
    print(np.unique(majority_labels, return_counts=True))

    # downsample majority class
    d_majority_df = majority_df[row]

    # combine minority class with downsampled majority class
    x_train = vstack((minority_df, d_majority_df))

    # labels in the same order as the stacked rows
    y_train = np.concatenate((minority_labels, majority_labels[row]))

    return x_train, y_train

# downsample the training data
# NOTE: x_train / y_train are overwritten with the downsampled versions, so
# re-running this cell samples again from the already reduced data.
x_train, y_train = downsample(x_train, y_train)


218975 2733 5466
(array(['CAD'], dtype=object), array([2733], dtype=int64))
(array(['O'], dtype=object), array([218975], dtype=int64))


In [61]:
# Shapes of the current splits (not the last expression, so not displayed).
x_train.shape, y_train.shape, x_test.shape, y_test.shape

# Label distribution of the current train / test targets.
train_label_counts = np.unique(y_train, return_counts=True)
test_label_counts = np.unique(y_test, return_counts=True)
train_label_counts, test_label_counts

((array(['CAD', 'O'], dtype=object), array([  5620, 484074], dtype=int64)),
 (array(['CAD', 'O'], dtype=object), array([  3951, 312776], dtype=int64)))

# ML classifiers!!

### Perceptron

In [4]:
# Perceptron baseline.
# NOTE: partial_fit() performs a single pass over the data, so max_iter=5 has
# no effect here (the captured log shows only "Epoch 1"). Use fit() if
# several epochs are wanted.
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(x_train, y_train, classes=classes)

# Predict once; y_pred is reused for the report here and is also what a later
# cell writes to disk.
y_pred = per.predict(x_test)
print(classification_report(y_pred=y_pred, y_true=y_test, labels=classes))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 9.17, NNZs: 84, Bias: -0.040000, T: 489694, Avg. loss: 0.000246
Total training time: 0.20 seconds.
Norm: 67.58, NNZs: 4567, Bias: 0.010000, T: 489694, Avg. loss: 0.022582
Total training time: 0.20 seconds.


[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:    0.3s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:    0.3s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    0.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.4s finished


Norm: 7.07, NNZs: 50, Bias: -0.080000, T: 489694, Avg. loss: 0.000449
Total training time: 0.24 seconds.
Norm: 19.87, NNZs: 395, Bias: -0.070000, T: 489694, Avg. loss: 0.003240
Total training time: 0.19 seconds.
Norm: 37.46, NNZs: 1403, Bias: -0.010000, T: 489694, Avg. loss: 0.005298
Total training time: 0.27 seconds.
Norm: 24.76, NNZs: 613, Bias: -0.050000, T: 489694, Avg. loss: 0.002597
Total training time: 0.26 seconds.
Norm: 31.29, NNZs: 979, Bias: -0.150000, T: 489694, Avg. loss: 0.008481
Total training time: 0.29 seconds.
Norm: 12.57, NNZs: 158, Bias: -0.080000, T: 489694, Avg. loss: 0.000502
Total training time: 0.21 seconds.
Norm: 30.66, NNZs: 940, Bias: -0.020000, T: 489694, Avg. loss: 0.002235
Total training time: 0.27 seconds.
                precision    recall  f1-score   support

           CAD       0.27      0.16      0.20      3951
      DIABETES       0.43      0.37      0.39      1676
   FAMILY_HIST       0.08      0.01      0.02       214
HYPERLIPIDEMIA       0.54  

In [5]:
# Write one "<token> <true label> <predicted label>" line per test row.
tokens = df_testing['text'].to_numpy()

# The three sequences must describe the same rows; this fails, for example,
# when y_test / y_pred come from a different split than df_testing.
if not (len(tokens) == len(y_test) == len(y_pred)):
    raise ValueError(
        'df_testing, y_test and y_pred have different lengths: '
        + str((len(tokens), len(y_test), len(y_pred)))
    )

# Explicit UTF-8 so the file does not depend on the platform default encoding.
with open('../BERT/perceptron_pred.txt', 'w', encoding='utf-8') as f:
    for token, true_label, predicted_label in zip(tokens, y_test, y_pred):
        f.write(token + ' ' + true_label + ' ' + predicted_label + '\n')

## convert text level annotation into doc level

In [158]:
import re
from os import listdir

# read in the prediction file (space-separated: token, true label, predicted label)
pred = pd.read_csv(
    '../BERT/perceptron_pred.txt',
    sep=' ',
    header=None,
    dtype=str,                # keep numeric-looking tokens such as "4" as text
    quoting=csv.QUOTE_NONE,   # a '"' inside a token is data, not a field quote
    keep_default_na=False,    # tokens such as "NA" or "null" must stay strings
)
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
file_path_3 = "C:/Users/Leste/OneDrive - Johns Hopkins/Desktop/BDD data/extracted/testing-RiskFactors-Complete/"
pred


Unnamed: 0,0,1,2
0,Record,SMOKER,O
1,date:,O,O
2,2069-04-07,O,O
3,Mr.,O,O
4,Villegas,O,O
...,...,...,...
316722,Team,O,O
316723,4,O,O
316724,Beeper,O,O
316725,#07736,O,O


In [None]:

def find_doc_tag (pred, file_path_3, type = 2):
    """Aggregate token-level labels into one label set per document.

    A document starts at every 'Record' token that is immediately followed by
    a token containing 'date:'. For each document the distinct labels other
    than "O" are collected.

    Parameters
    ----------
    pred : pandas.DataFrame
        Token table; column 0 holds the tokens and column ``type`` the labels.
    file_path_3 : str
        Directory containing one ``.xml`` file per document. Only the file
        names are used here.
    type : int, optional
        Label column to aggregate (default 2).

    Returns
    -------
    pandas.DataFrame
        Columns ``file`` (name without ``.xml``) and ``annotation`` (array of
        labels), one row per document.

    Raises
    ------
    ValueError
        If the number of documents found in ``pred`` differs from the number
        of ``.xml`` files in ``file_path_3``.
    """
    # astype(str) turns missing tokens into the text 'nan', so the string
    # tests below always yield plain booleans.
    tokens = pred[0].astype(str)
    label = pred[type]

    # File names without extension. listdir() order is unspecified, so sort.
    # NOTE(review): assumes the documents in `pred` appear in sorted
    # file-name order - confirm against how the token file was built.
    xml_files = sorted(f for f in listdir(file_path_3) if f.endswith('.xml'))
    test_name = [re.sub(r'\.xml', '', x) for x in xml_files]

    # Position of the first token of each document: a 'Record' token whose
    # next token contains 'date:'.
    is_date = tokens.str.contains("date:", regex=False).to_numpy(dtype=bool)
    is_record = (tokens == 'Record').to_numpy(dtype=bool)
    loc = np.flatnonzero(is_record[:-1] & is_date[1:]).tolist()
    print(loc)

    # Distinct non-"O" labels between consecutive document starts; the last
    # document runs to the end of the table.
    bounds = loc + [len(label)]
    tag = [
        np.setdiff1d(label.iloc[start:stop].unique(), "O")
        for start, stop in zip(bounds[:-1], bounds[1:])
    ]
    print(tag)

    if len(tag) != len(test_name):
        raise ValueError(
            'found ' + str(len(tag)) + ' documents in pred but '
            + str(len(test_name)) + ' .xml files in ' + str(file_path_3)
        )

    # One row per document.
    all_df = pd.DataFrame({'file': test_name})
    all_df['annotation'] = tag
    return all_df


# Column 2 of `pred` would give the document-level predicted labels (disabled):
#df_pred = find_doc_tag(pred, file_path_3, type = 2)
# Column 1 of `pred` gives the document-level labels from the true annotations.
df_orig = find_doc_tag(pred, file_path_3, type = 1)


In [169]:
# Display the document-level annotation table currently bound to df_orig.
df_orig

Unnamed: 0,file,annotation
0,110-01,"[HYPERTENSION, MEDICATION, SMOKER]"
1,110-02,"[HYPERTENSION, SMOKER]"
2,110-03,"[CAD, DIABETES, HYPERLIPIDEMIA, HYPERTENSION, ..."
3,110-04,"[CAD, DIABETES, HYPERLIPIDEMIA, HYPERTENSION, ..."
4,111-01,"[DIABETES, HYPERTENSION, MEDICATION, SMOKER]"
...,...,...
509,388-05,"[DIABETES, HYPERLIPIDEMIA, HYPERTENSION, MEDIC..."
510,389-01,[SMOKER]
511,389-02,"[DIABETES, HYPERTENSION, MEDICATION, SMOKER]"
512,389-03,"[DIABETES, MEDICATION, SMOKER]"


## Get document-level annotations from the testing XML files

In [153]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from os import listdir
import re

# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
file_path_3 = "C:/Users/Leste/OneDrive - Johns Hopkins/Desktop/BDD data/extracted/testing-RiskFactors-Complete/"
# listdir() order is unspecified; sort so the row order is deterministic.
test_name = sorted(f for f in listdir(file_path_3) if f.endswith('.xml'))

# Collect one (file, annotation) pair per XML document and build the frame
# once at the end. The previous chained assignment `df_orig[col][i] = ...`
# triggers SettingWithCopy warnings and does not update the frame when pandas
# Copy-on-Write is enabled.
files = []
annotations = []
for xml_name in test_name:
    root = ET.parse(file_path_3 + xml_name).getroot()
    # root[1] is the second child of the document root (presumably the tag
    # container - verify against the XML schema). Keep the tag name of every
    # grandchild element that carries a 'text' attribute.
    tag = [child.tag for annotation in root[1] for child in annotation if 'text' in child.attrib]
    annotations.append(np.unique(np.array(tag)))
    files.append(re.sub(r'\.xml', '', xml_name))

df_orig = pd.DataFrame({'file': files})
df_orig['annotation'] = annotations

### Support Vector Machine

In [57]:
# Support vector machine (SGDClassifier with elastic-net penalty),
# trained with a single partial_fit call.
svm = SGDClassifier(alpha=.00001, max_iter=100, penalty="elasticnet")
svm.partial_fit(x_train, y_train, classes=classes)

# labels=classes: every label in `classes` is included in the report.
svm_pred = svm.predict(x_test)
svm_report = classification_report(y_true=y_test, y_pred=svm_pred, labels=classes)
print(svm_report)

              precision    recall  f1-score   support

         CAD       0.52      0.04      0.08      3951
           O       0.99      1.00      0.99    312776

    accuracy                           0.99    316727
   macro avg       0.75      0.52      0.54    316727
weighted avg       0.98      0.99      0.98    316727



### Naive Bayes

In [58]:
# Multinomial naive Bayes, trained with a single partial_fit call.
nb = MultinomialNB(alpha=.0005)
nb.partial_fit(x_train, y_train, classes=classes)

# labels=classes: every label in `classes` is included in the report.
nb_pred = nb.predict(x_test)
nb_report = classification_report(y_true=y_test, y_pred=nb_pred, labels=classes)
print(nb_report)

              precision    recall  f1-score   support

         CAD       0.45      0.05      0.09      3951
           O       0.99      1.00      0.99    312776

    accuracy                           0.99    316727
   macro avg       0.72      0.53      0.54    316727
weighted avg       0.98      0.99      0.98    316727



In [None]:
# Random forest: 100 trees, depth capped at 20, fixed random_state.
rf = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=553)
rf.fit(x_train, y_train)

# labels=classes: every label in `classes` is included in the report.
rf_pred = rf.predict(x_test)
rf_report = classification_report(y_true=y_test, y_pred=rf_pred, labels=classes)
print(rf_report)

In [None]:
#logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)

# `new_classes` was referenced here without ever being defined, so the cell
# raised NameError. Derive it from `classes`: every label except 'O'.
new_classes = [label for label in classes if label != 'O']

# labels=new_classes: the report covers all labels except 'O'.
lr_pred = lr.predict(x_test)
print(classification_report(y_pred=lr_pred, y_true=y_test, labels=new_classes))

### Below you can train different classifiers

In [None]:
# Perceptron on the *_without_o data, trained with a single partial_fit call.
# NOTE(review): x_without_o_* / y_without_o_* / classes_without_o are not
# defined in the visible cells - they must come from elsewhere.
per_no = Perceptron(verbose=10, n_jobs=-1, max_iter=20)
per_no.partial_fit(x_without_o_train, y_without_o_train, classes=classes_without_o)

per_no_pred = per_no.predict(x_without_o_test)
per_no_report = classification_report(
    y_true=y_without_o_test, y_pred=per_no_pred, labels=classes_without_o
)
print(per_no_report)


In [None]:
#support vector machine
svm_no = SGDClassifier(alpha=.00001, max_iter=100, penalty="elasticnet")
svm_no.partial_fit(x_without_o_train, y_without_o_train, classes=classes_without_o)

# Predict once and reuse the result for every metric (the model was
# previously asked to predict the same data ten times).
svm_no_pred = svm_no.predict(x_without_o_test)
print(classification_report(y_pred=svm_no_pred, y_true=y_without_o_test, labels=classes_without_o))

# Printed in the same order as before: precision, recall, F1 for the
# micro, then macro, then weighted average.
for average in ('micro', 'macro', 'weighted'):
    for metric in (precision_score, recall_score, f1_score):
        print(metric(y_pred=svm_no_pred, y_true=y_without_o_test, labels=classes_without_o, average=average))

In [None]:
#naive bayes
nb_no = MultinomialNB(alpha=0.01)
nb_no.partial_fit(x_without_o_train, y_without_o_train, classes=classes_without_o)

# Predict once and reuse the result for every metric (the model was
# previously asked to predict the same data ten times).
nb_no_pred = nb_no.predict(x_without_o_test)

# Print the classification report. The original comment asked for four
# decimal places, but no `digits` argument is passed, so the default
# precision is used; pass digits=4 to get four decimals.
print(classification_report(y_pred=nb_no_pred, y_true=y_without_o_test, labels=classes_without_o))

# Printed in the same order as before: precision, recall, F1 for the
# micro, then macro, then weighted average.
for average in ('micro', 'macro', 'weighted'):
    for metric in (precision_score, recall_score, f1_score):
        print(metric(y_pred=nb_no_pred, y_true=y_without_o_test, labels=classes_without_o, average=average))


In [None]:
# Random forest on the *_without_o data: 100 trees, depth capped at 30.
# Rebinding `rf` replaces any model stored under that name by an earlier cell.
rf = RandomForestClassifier(n_estimators=100, max_depth=30, random_state=533)
rf.fit(x_without_o_train, y_without_o_train)

rf_no_pred = rf.predict(x_without_o_test)
rf_no_report = classification_report(
    y_true=y_without_o_test, y_pred=rf_no_pred, labels=classes_without_o
)
print(rf_no_report)



In [None]:
# Logistic regression (default settings) on the *_without_o data.
# Rebinding `lr` replaces any model stored under that name by an earlier cell.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_without_o_train, y_without_o_train)

lr_no_pred = lr.predict(x_without_o_test)
lr_no_report = classification_report(
    y_true=y_without_o_test, y_pred=lr_no_pred, labels=classes_without_o
)
print(lr_no_report)

In [None]:
# Decision tree (default settings) on the *_without_o data.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_without_o_train, y_without_o_train)

dt_pred = dt.predict(x_without_o_test)
dt_report = classification_report(
    y_true=y_without_o_test, y_pred=dt_pred, labels=classes_without_o
)
print(dt_report)

In [None]:
# k-nearest neighbours (default settings) on the *_without_o data.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(x_without_o_train, y_without_o_train)

knn_pred = knn.predict(x_without_o_test)
knn_report = classification_report(
    y_true=y_without_o_test, y_pred=knn_pred, labels=classes_without_o
)
print(knn_report)

# stanford NER tagger result calculation
## just for calculating the accuracy of stanford NER tagger, not for training and testing of traditional machine learning classifiers

In [None]:
# Entries of y_test whose label is not 'O'
total = y_test[y_test != 'O']
print(len(total))

# Confusion counts are hard-coded here rather than computed in this cell.
TP, FP, FN = 2690, 1810, 3209
# NOTE(review): TN is derived from the number of non-'O' entries only, and is
# negative whenever that number is below TP + FP + FN - confirm this is the
# intended definition of a true negative.
TN = total.size - (TP + FP + FN)
print(TP, FP, FN, TN)

# accuracy: correct decisions over all counted decisions
accuracy = (TP + TN) / (TP + FP + FN + TN)
print(accuracy)

# precision: share of predicted positives that are correct
precision = TP / (TP + FP)
print(precision)

# recall: share of actual positives that were found
recall = TP / (TP + FN)
print(recall)

# F1: harmonic mean of precision and recall
F1 = 2 * precision * recall / (precision + recall)
print(F1)



In [None]:
#read result.txt
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
path = 'E:/JHU/课程/datadesign/NLP/machine_learning/stanford-ner-4.2.0/stanford-ner-2020-11-17/test'

# Column names are kept exactly as before (including the 'Percision' spelling).
data = pd.read_csv(path + '/result_M40_N0_chris2useLC.txt', sep='\t', header=None, names=['Entity', 'Percision', 'Recall', 'F1', 'TP', 'FP', 'FN'])
#Drop the first row
data = data.drop([0])

print(data)
print('')

#Drop the last row and renumber from 0
data = data.iloc[:-1].reset_index(drop=True)

#convert the data type
data = data.astype({
    'Percision': float,
    'Recall': float,
    'F1': float,
    'TP': int,
    'FP': int,
    'FN': int,
})

#calculate the micro average (pooled counts over all entity types)
TP = data['TP'].sum()
FP = data['FP'].sum()
FN = data['FN'].sum()
micro_precision = TP / (TP + FP)
micro_recall = TP / (TP + FN)
micro_F1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall)

#calculate the macro average (unweighted mean over entity types)
macro_precision = data['Percision'].mean()
macro_recall = data['Recall'].mean()
macro_F1 = data['F1'].mean()

#calculate the support
# Support is the number of true instances of an entity type, i.e. TP + FN.
# It was previously set to TP alone, which gave entity types with low recall
# too little weight in the weighted averages below.
support = data['TP'] + data['FN']
#add the support to the dataframe
data['support'] = support
support_proportion = support / support.sum()

#calculate the weighted average (mean weighted by support)
weighted_precision = (data['Percision'] * support_proportion).sum()
weighted_recall = (data['Recall'] * support_proportion).sum()
weighted_F1 = (data['F1'] * support_proportion).sum()

# Summary table, built in one step instead of being grown row by row.
Evalution = pd.DataFrame(
    [
        ['micro-average', micro_precision, micro_recall, micro_F1, TP, FP, FN, support.sum()],
        ['macro-average', macro_precision, macro_recall, macro_F1, TP, FP, FN, support.sum()],
        ['weighted-average', weighted_precision, weighted_recall, weighted_F1, TP, FP, FN, support.sum()],
    ],
    columns=['Entity', 'Percision', 'Recall', 'F1', 'TP', 'FP', 'FN', 'support'],
)

print(data)
print('')
print(Evalution)
