# CRF model for splitting abstracts that are not already divided into four sections (introduction, methods, results, conclusion).

## execute this cell first


A class to retrieve the sentences from the dataset.

In [None]:
# A class to retrieve the sentences from the dataset
class getsentence(object):
    """Group the rows of an annotated dataset into one tagged sequence per abstract.

    ``data`` must be a pandas DataFrame with the columns ``pmid``,
    ``sentence`` and ``tag``. Rows sharing the same ``pmid`` are collected
    into a list of ``(sentence, tag)`` tuples, in row order.

    Attributes:
        n_sent: counter used by ``get_next`` to build the next lookup key.
        data: the DataFrame passed to the constructor.
        empty: always ``False``; kept for backward compatibility.
        grouped: result of the group-by, indexed by ``pmid``.
        sentences: the per-abstract lists of ``(sentence, tag)`` tuples.
    """

    def __init__(self, data):
        self.n_sent = 1.0
        self.data = data
        self.empty = False

        def agg_func(group):
            # Pair every sentence of the group with its tag.
            return list(zip(group["sentence"].values.tolist(),
                            group["tag"].values.tolist()))

        self.grouped = self.data.groupby("pmid").apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the next abstract's tagged sentences, or None when the key is absent.

        The lookup key is ``"pmid : <n_sent>"``. Because ``n_sent`` is a
        float, the key reads e.g. ``"pmid : 1.0"``.
        NOTE(review): confirm that this matches the ``pmid`` format of the data.
        """
        key = "pmid : {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing key means "no more abstracts"; any other error
            # is a real problem and must not be silently swallowed.
            return None
        self.n_sent += 1
        return s



## read the data
Note: to train the CRF, the training data must be annotated. The annotated file loaded below is available.

In [None]:
# pandas is used from here on but is not imported by any earlier cell.
import pandas as pd

# Load the annotated sentences and discard every row with a missing value.
annotated = pd.read_csv('all_annotated.csv')
data = annotated.dropna()

# Group the rows into one list of (sentence, tag) tuples per abstract.
getter = getsentence(data)
sentences = getter.sentences

## Utility functions

In [None]:
def sentense_number(abstract, token):
    """Return the 1-based rank of the sentence ``token`` inside ``abstract``.

    The abstract is split into sentences with ``TextBlob`` (which must be in
    scope), and ``token`` is looked up among the string forms of those
    sentences. ``list.index`` raises ``ValueError`` when no sentence is
    exactly equal to ``token``.
    """
    sentence_texts = [str(piece) for piece in TextBlob(abstract).sentences]
    return sentence_texts.index(token) + 1

def sentence_position(text, token):
    """Return the 1-based index of the chunk of ``text`` that contains ``token``.

    ``text`` is split on whitespace and cut into consecutive chunks of
    ``len(words) // 10`` words (at least one word per chunk). When the word
    count is not a multiple of ten there are more than ten chunks.

    A chunk matches when it contains the whole ``token`` or its last three
    words. If the sentence straddles several chunks, the highest matching
    index wins. When nothing matches that way, the highest index of a chunk
    containing the first three words of ``token`` is used as a fallback.

    Args:
        text: the full abstract.
        token: the sentence to locate.

    Returns:
        The 1-based chunk index, or 0 when no chunk matches (including when
        ``text`` is empty).
    """
    words = text.split()
    # Texts shorter than ten words would otherwise give a chunk size of 0,
    # which makes range() raise ValueError.
    chunk_size = max(1, len(words) // 10)
    parts = [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

    token_words = token.split()
    token_head = " ".join(token_words[:3])
    token_tail = " ".join(token_words[-3:])

    indice = 0
    joker = 0
    # enumerate gives the real position of each chunk; list.index would
    # return the first chunk with equal text when chunks repeat.
    for position, part in enumerate(parts, start=1):
        if token in part or token_tail in part:
            indice = position
        elif token_head in part:
            joker = position

    return indice if indice else joker

## Features: 
This section builds the features for the model.

In [None]:
# The 20 most frequent words of each of the four sections
# (introduction, methods, results, conclusion).
Ilist = [
    'intervention', 'exercise', 'increase', 'life', 'quality',
    'physical', 'examine', 'health', 'reduce', 'aim',
    'program', 'care', 'effectiveness', 'interventions', 'activity',
    'fatigue', 'group', 'dietary', 'information', 'distress',
]
Mlist = [
    'exercise', 'health', 'outcome', 'physical', 'pain',
    'fatigue', 'diet', 'anxiety', 'depression', 'experimental',
    'body', 'hospital', 'medical', 'participate', 'blood',
    'model', 'dietary', 'practice', 'vitamin', 'symptoms',
]
Rlist = [
    'group', 'intervention', 'increase', 'compare', 'effect',
    'score', 'participants', 'change', 'baseline', 'difference',
    'show', 'follow', 'time', 'mean', 'decrease',
    'level', 'lower', 'higher', 'find', 'exercise',
]
Clist = [
    'health', 'suggest', 'benefit', 'decrease', 'supplementation',
    'future', 'research', 'nurse', 'enhance', 'feasible',
    'dietary', 'appear', 'distress', 'potential', 'diet',
    'psychological', 'weight', 'beneficial', 'aerobic', 'approach',
]


def get_token_voca(token):
    """Return the section word list that best matches the sentence ``token``.

    For each of the four lists, count how many of its entries occur among the
    whitespace-separated words of ``token``. The list with the strictly
    highest count is returned (the list object itself). When no single list
    is strictly ahead, including when nothing matches, the integer 0 is
    returned.
    """
    words = token.split()
    vocabularies = (Ilist, Mlist, Rlist, Clist)
    hits = [
        sum(entry in words for entry in vocabulary)
        for vocabulary in vocabularies
    ]
    top = max(hits)
    # A tie for first place means there is no clear winner.
    if hits.count(top) != 1:
        return 0
    return vocabularies[hits.index(top)]


# Module-level placeholders for the neighbouring-sentence length features.
# NOTE(review): no visible code reads these variables; the feature builder
# uses dictionary keys with the same names instead. Confirm they are needed.
pLength, pRelLength = 0,0
nLength, nRelLength = 0,0
# the first words of the previous sentence (p0, p1)
# the absolute length of the previous sentence (pLength)
# the relative length of the previous sentence (pRelLength)
# the absolute length of the next sentence (nLength)
# the relative length of the next sentence (nRelLength)

# main Function to build features for the CRF
def sentencefeatures(sentences, i):
    """Build the CRF feature dict for the sentence at position ``i``.

    ``sentences`` is a sequence whose items hold the sentence text at index 0
    (e.g. ``(sentence, tag)`` tuples). The dict describes the sentence
    itself: its text, first three words, word count and matching section
    vocabulary. When they exist, the first two words and the word count of
    the previous and next sentences are added.

    The relative-length and sentence-number features sketched for this
    function need the full abstract text, which is not passed in, so they
    stay disabled.
    """
    def words_at(position):
        # Words of the sentence stored at the given position.
        return sentences[position][0].lstrip().split()

    token = sentences[i][0].lstrip()
    current_words = token.split()

    features = {
        'token': token,
        'first_3_words': current_words[:3],
        'absLength': len(current_words),
        'lexique': get_token_voca(token),
    }

    last_index = len(sentences) - 1

    if i > 0:
        previous_words = words_at(i - 1)
        features['firstP_2_words'] = previous_words[:2]
        features['pLength'] = len(previous_words)

    if i < last_index:
        next_words = words_at(i + 1)
        features['firstN_2_words'] = next_words[:2]
        features['nLength'] = len(next_words)

    return features

def sent2features(sent):
    """Return the list of feature dicts, one per sentence of the abstract ``sent``."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(sentencefeatures(sent, position))
    return feature_rows

def sent2labels(sent):
    """Return the tags of the ``(sentence, tag)`` pairs in ``sent``, in order."""
    labels = []
    for _text, label in sent:
        labels.append(label)
    return labels

## Training CRF

In [None]:
pip install sklearn_crfsuite

In [None]:
pip install eli5

In [None]:
pip install python-crfsuite

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn_crfsuite import CRF, scorers, metrics
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.metrics import classification_report, make_scorer

In [None]:
# One feature sequence and one label sequence per abstract; X[k] and y[k]
# describe the same abstract.
X = [sent2features(phrase) for phrase in sentences]
y = [sent2labels(phrase) for phrase in sentences] # labels aligned index-for-index with X
# Hold out 20% of the abstracts for evaluation.
# NOTE(review): no random_state is passed, so the split presumably differs
# between runs; confirm whether reproducibility is needed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
"""
!!! Note :  if there is an error about fitting the data , please check the output of the function get_token_voca(token),
specially this feature (  lexique' : get_token_voca(token) ) has a bad data input like None or NoType..'
"""
import sklearn_crfsuite

# Configure the CRF and train it on the training split.
# NOTE(review): c1 and c2 are presumably the L1 and L2 regularization
# coefficients; confirm against the sklearn-crfsuite documentation.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',  # training algorithm; 'l2sgd' is the noted alternative
    c1=0.0418,           # previously tried: 0.015; noted as not applicable for 'l2sgd'
    c2=0.00056,          # previously tried: 0.0037
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True
)
crf.fit(X_train, y_train)

In [None]:
# Produce a per-label classification report on the held-out abstracts.
# NOTE(review): cross_val_predict receives only X_test/y_test with cv=5, so
# it presumably refits copies of the estimator on folds of the test split
# rather than scoring the model fitted on X_train above. If the intent is to
# evaluate that fitted model, crf.predict(X_test) would be the direct call.
pred = cross_val_predict(estimator=crf, X=X_test, y=y_test, cv=5)
report = flat_classification_report(y_pred=pred, y_true=y_test)
print(report)

## Visualisation
Display the most important features of the model (here the top 30), as well as the transitions between the sections.

In [None]:
import eli5
# Display the weights of the fitted CRF, limited to the top 30 entries.
eli5.show_weights(crf, top=30)

### - For each feature, we show here its importance with respect to every section.
### - To view all the features, set the "**feature_re**" argument to a pattern matching the start of a feature name (e.g. `^l` for `lexique`) and **re-run** the cell, once for each feature of the model.

In [None]:
# Same display restricted to features whose name matches the regular
# expression '^l' (names starting with "l"), showing only the per-target tables.
eli5.show_weights(crf, top=10, feature_re='^l',
                  horizontal_layout=False, show=['targets'])

# Annotate text

**Save the CRF model**

In [None]:
import joblib

# File the trained model is written to
filename = 'CRFsectionsannotation.sav'

# Save the fitted CRF to disk (this writes the model; it does not load it)
joblib.dump(crf, filename)

**Annotate each abstract with its introduction, methods, results and conclusion sections.**

In [None]:
# Abstracts to annotate; the text is read from the 'text' column further down.
kayla = pd.read_csv("kayla.csv")


def tag(sentence):
    """Predict a section label for every sentence of one abstract.

    Args:
        sentence: the sentences of the abstract, in order. Items may be plain
            strings or tuples whose first element is the sentence text.

    Returns:
        A list of ``(item, predicted_label)`` pairs, one per input item.
    """
    # sentencefeatures reads the text from element [0] of each item, as in
    # the (sentence, tag) tuples used for training. A bare string would yield
    # only its first character, so wrap strings in a 1-tuple first.
    items = [(entry,) if isinstance(entry, str) else entry for entry in sentence]
    sentence_features = [sentencefeatures(items, index) for index in range(len(items))]
    return list(zip(sentence, crf.predict([sentence_features])[0]))

In [None]:
# Annotated version of every abstract, in the order of kayla['text'].
listA = []

# Header written above each section, keyed by the label the CRF predicts.
# The order of this mapping is the order of the sections in the output.
section_headers = {
    'introduction': 'introduction : ',
    'methods': 'methods : ',
    'results': 'results :',
    'conclusion': 'conclusion :',
}

for Abstract in kayla['text']:
    # A missing abstract yields no sentences, so it is not tagged as "nan".
    sentence = [] if pd.isna(Abstract) else str(Abstract).split('.')
    # Splitting on '.' leaves a blank last piece when the text ends with a
    # period. Drop only that blank piece, so an abstract without a final
    # period keeps its last sentence.
    if sentence and not sentence[-1].strip():
        sentence.pop()
    sentenceT = tag(sentence)

    # Collect the sentences of each section in their original order.
    by_section = {label: [] for label in section_headers}
    for text, label in sentenceT:
        if label in by_section:
            by_section[label].append(text)

    # One "header\ntext" block per section, joined by newlines.
    blocks = [
        "\n".join([header, " ".join(by_section[label]).lstrip()])
        for label, header in section_headers.items()
    ]
    listA.append("\n".join(blocks))


**Get the file that contains the divided abstracts**

In [None]:
# Put the annotated abstracts in a one-column frame and append that column
# to the original data, side by side (axis=1).
devided_sections = pd.DataFrame(listA, columns=['text_cutted'])
Final= pd.concat([kayla, devided_sections], axis=1, sort=False)
# Write the combined table to CSV.
Final.to_csv('kayla_final_devided.csv')