In [1]:
import re
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

pd.options.mode.chained_assignment = None 

In [2]:
# prep the dataset

def clean_txt():
    
    #read the file
    opened  = open('/home/dustin/Python/Animacies/resources/bryophytes.txt', 'r')
    doc = opened.read()
    
    return doc

# split into sentences and prepare column labels for dataframe

def split_txt_into_sentences(text):
    
    split = sent_tokenize(text)
    split = [x.strip() for x in split]
    
    frame_ready = []
    counter = 0
    for x in split:
        frame_ready.append([counter, x])
        counter += 1
    
    return frame_ready

bryophytes = split_txt_into_sentences(clean_txt())

In [3]:
# create a tsv which organizes sentences containing a desired token into a BERT-readable format

def filter_df(df, token):
    
    filtered = df.loc[df['OriginalSentence'].str.contains('[^\w]'+token+'[^\w]', regex=True, case=False)]
    
    return filtered

def masks(string, target):
    
    new_string = string.lower()
    match = re.search('[^\w]'+target+'[^\w]', new_string)
    start_index = match.start() + 1
    end_index = new_string.find(target) + len(target)
    final_word = '{}'
    final_string = string[:start_index] + final_word + string[end_index:]
    
    return final_string

def assemble_bert_tsv(lst, word):
    
    # create dataframe and filter out sentences which do not contain target expression
    full_df = pd.DataFrame(lst, columns = ['SentenceId', 'OriginalSentence'])
    filtered_df = filter_df(full_df, word)
    filtered_df['MaskedSentence'] = filtered_df['OriginalSentence'].apply(masks, target=word)
    
    # add a column for the target expression
    count = len(filtered_df.index)
    targ_ex_col = [word] * count
    filtered_df['TargetExpression'] = targ_ex_col
    
    # add context sentences from original dataframe
    context = []
    for x in range(count):
        sentence = filtered_df.iloc[x, 1]
        sentence_id = filtered_df.iloc[x, 0]
        
        try:
            prev_sentence = full_df.iloc[sentence_id - 1, 1]
        except:
            prev_sentence = ''
            
        try:
            next_sentence = full_df.iloc[sentence_id + 1, 1]
        except IndexError:
            next_sentence = ''
            
        context.append(prev_sentence + " [SEP] " + sentence +  " [SEP] " + next_sentence)

    filtered_df['SentenceCtxt'] = context
    
    final_df = filtered_df[['SentenceId', 'TargetExpression', 'OriginalSentence', 'MaskedSentence','SentenceCtxt']]
    
    return final_df

word_list = ['bryophyte', 'bryophytes', 'species', 'spores', 'polytrichum ohioense', 'p ohioense', 
             'dicranella heteromalla', 'd heteromalla', 'atrichum angustatum', 'a angustatum', 
             'diphyscium foliosum', 'd foliosum', 'pogonatum pensylvanicum', 'p pensylvanicum', 
             'mosses', 'gametophyte', 'colony', 'propagule', 'vegetative fragments', 'vegetation', 
             'colonies']

empty_list = []

result = pd.DataFrame(empty_list, columns=['SentenceId', 'TargetExpression', 'OriginalSentence', 
                                           'MaskedSentence', 'SentenceCtxt'])

for word in word_list:
    new_result = assemble_bert_tsv(bryophytes, word)
    result = pd.concat([result, new_result])

result = result.reset_index(drop=True)
result.to_csv('/home/dustin/Python/Animacies/filtered/bryophytes.tsv', sep='\t')

result.head(5000)

Unnamed: 0,SentenceId,TargetExpression,OriginalSentence,MaskedSentence,SentenceCtxt
0,2,bryophyte,The goals of this paper are to 1) determine th...,The goals of this paper are to 1) determine th...,Bryophyte communities on treefall mounds prese...
1,40,bryophyte,Dispersal and establishment have been suggeste...,Dispersal and establishment have been suggeste...,This study compares the establishment success ...
2,44,bryophyte,"2000, Hedderson 1992, Marino 1991, Ross-Davis ...","2000, Hedderson 1992, Marino 1991, Ross-Davis ...",While dispersal and establishment have been po...
3,50,bryophyte,Treefall mounds have been shown to increase an...,Treefall mounds have been shown to increase an...,Treefall gaps increase microhabitat heterogene...
4,55,bryophyte,The goals of this paper are to 1) describe the...,The goals of this paper are to 1) describe the...,Since dispersal may occur through space by new...
...,...,...,...,...,...
203,58,vegetation,"The stands are similar in soil type, climate a...","The stands are similar in soil type, climate a...","The stands are all second-growth, closed-canop..."
204,99,colonies,Fragment inocula were prepared by gathering ve...,Fragment inocula were prepared by gathering ve...,The five species and two propagule types are r...
205,183,colonies,Mean 1998 cover of colonies which persisted un...,Mean 1998 cover of {} which persisted until 19...,Persistence was largely dependent upon cover a...
206,184,colonies,Those larger colonies which did survive genera...,Those larger {} which did survive generally ex...,Mean 1998 cover of colonies which persisted un...
