# Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

import json
from collections import Counter
import random
import sys
from joblib import dump, load

import scipy.stats

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
#from sklearn.linear_model import perceptron, SGDClassifier, PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, make_scorer, f1_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers

# Load Data

In [2]:
text = pd.read_csv("Data/query1_2-manualCurate.csv")["title_abstract"]
# text2 = pd.read_csv("Data/query2_manualCurate.csv")["title_abstract"]
# text = text[0:300]
#text = pd.concat([text[0:300],text2]).reset_index(drop=True)

# Process Data

In [3]:
# Define a function to process the data
def processText(text):
    # Get abstracts into one continuous string
    text = text.str.cat()
    # Tokenize the string object by word
    text = word_tokenize(text)
    # Remove stop words
    stop = set(stopwords.words('english'))
    text = [w for w in text if not w in stop]
    # Tag each word by the appropriate part of speech (POS) tag
    text = pos_tag(text)
    # Reshape the data into a dataframe
    text = pd.DataFrame(text, columns=['word','POS'])
    return(text)

In [4]:
df = processText(text)

# Tag text

In [5]:
# We want to detect metabolomics software tools, so we will tag some tools
tags = pd.read_csv("Data/CuratedTools.csv")
toolsToTag = tags.CuratedTools.tolist()

In [6]:
# Tag the training data
df["label"] = ["T" if x in toolsToTag else "O" for x in df.word]

# Identify sentences

In [7]:
# Define a function that identifies where sentences begin/end
def identSentence(textDf):
    # start at sentence 1
    n_sent = 1
    sents = [] # init empty array to wholed sentence identifiers
    # Loop through text incrementing n_sent after each period
    for word in textDf.word:
        if word == ".":
            sents.append(n_sent)
            n_sent += 1
        else:
            sents.append(np.nan) # If we are still before the end of the sentence label it as NA
    textDf['Sent_id'] = sents # Generate a column of the sentences 
    textDf['Sent_id'] = textDf['Sent_id'].bfill() # back fill the NAs to get the correct sentence IDs
    return(textDf)

In [8]:
df = identSentence(df)

In [9]:
df.head()

Unnamed: 0,word,POS,label,Sent_id
0,MetExplore,NN,T,1.0
1,:,:,O,1.0
2,collaborative,JJ,O,1.0
3,edition,NN,O,1.0
4,exploration,NN,O,1.0


# Get Sentences

In [10]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w,p,t) for w, p, t in zip(s['word'].values.tolist(),
                                                         s['POS'].values.tolist(),
                                                         s['label'].values.tolist())]
        self.grouped = self.data.groupby('Sent_id').apply(agg_func)
        self.sentences = [s for s in self.grouped]
    def get_next(self):
        try:
            s = self.grouped['Sentence: {}:'.format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

# Build CRF Features

In [11]:
def camel(word):
    return word != word.lower() and word != word.upper() and word.istitle() != True and "_" not in word

In [12]:
def word_features(sent, i):
        word = sent[i][0]
        postag = sent[i][1]
        
        features = {
            'bias': 1.0,
            'word': word,
            #'word[-3:]': word[-3:],
            #'word[-2:]': word[-2:],
            'word.isupper()': word.isupper(),
            'num_upper_chars': sum(map(str.isupper, word)),
            'camelCase': camel(word),
            'word.istitle()': word.istitle(),
            'word.isdigit()': word.isdigit(),
            'postag': postag,
            #'postag[:2]': postag[:2],
        }
        if i > 0:
            word1 = sent[i-1][0]
            postag1 = sent[i-1][1]
            features.update({
                '-1:word.lower()': word1.lower(),
                '-1:word.istitle()': word1.istitle(),
                '-1:word.isupper()': word1.isupper(),
                '-1:postag': postag1,
                #'-1:postag[:2]': postag1[:2],
            })
        else:
            features['BOS'] = True
        if i < len(sent)-1:
            word1 = sent[i+1][0]
            postag1 = sent[i+1][1]
            features.update({
                '+1word.lower()': word1.lower(),
                '+1word.istitle()': word1.istitle(),
                '+1word.isupper()': word1.isupper(),
                '+1postag': postag1,
                #'+1postag[:2]': postag1[:2],
            })
        else:
            features['EOS'] = True
        return features

def sent_features(sent):
    return [word_features(sent, i) for i in range(len(sent))]

def sent_labels(sent):
    return [label for token, postag, label in sent]

def sent_tokens(sent):
    return [token for token, postag, label in sent]

# Implement CRF

In [13]:
# Split into sentences
getter = SentenceGetter(df)
sentences = getter.sentences
X = [sent_features(s) for s in sentences]
Y = [sent_labels(s) for s in sentences] 
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=555, shuffle=False)

In [14]:
#Train a CRF model
crf = sklearn_crfsuite.CRF(
    algorithm = 'lbfgs',
    c1 = 0.1,
    c2 = 0.1,
    max_iterations = 100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_true = y_test, y_pred = y_pred, labels = ['T']))

              precision    recall  f1-score   support

           T       0.82      0.55      0.66       261

   micro avg       0.82      0.55      0.66       261
   macro avg       0.82      0.55      0.66       261
weighted avg       0.82      0.55      0.66       261



