In [1]:
import re
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

pd.options.mode.chained_assignment = None 

In [2]:
# prep the dataset

def clean_txt():
    
    #read the file
    opened  = open('/home/dustin/Python/Animacies/resources/sweetgrass.txt', 'r')
    doc = opened.read()
    
    return doc

# split into sentences and prepare column labels for dataframe

def split_txt_into_sentences(text):
    
    split = sent_tokenize(text)
    split = [x.strip() for x in split]
    
    frame_ready = []
    counter = 0
    for x in split:
        frame_ready.append([counter, x])
        counter += 1
    
    return frame_ready

sweetgrass = split_txt_into_sentences(clean_txt())

In [3]:
# create a tsv which organizes sentences containing a desired token into a BERT-readable format

def filter_df(df, token):
    
    filtered = df.loc[df['OriginalSentence'].str.contains('[^\w]'+token+'[^\w]', regex=True, case=False)]
    
    return filtered

def masks(string, target):
    
    new_string = string.lower()
    match = re.search('[^\w]'+target+'[^\w]', new_string)
    start_index = match.start() + 1
    end_index = new_string.find(target) + len(target)
    final_word = '{}'
    final_string = string[:start_index] + final_word + string[end_index:]
    
    return final_string

def assemble_bert_tsv(lst, word):
    
    # create dataframe and filter out sentences which do not contain target expression
    full_df = pd.DataFrame(lst, columns = ['SentenceId', 'OriginalSentence'])
    filtered_df = filter_df(full_df, word)
    filtered_df['MaskedSentence'] = filtered_df['OriginalSentence'].apply(masks, target=word)
    
    # add a column for the target expression
    count = len(filtered_df.index)
    targ_ex_col = [word] * count
    filtered_df['TargetExpression'] = targ_ex_col
    
    # add context sentences from original dataframe
    context = []
    for x in range(count):
        sentence = filtered_df.iloc[x, 1]
        sentence_id = filtered_df.iloc[x, 0]
        
        try:
            prev_sentence = full_df.iloc[sentence_id - 1, 1]
        except:
            prev_sentence = ''
            
        try:
            next_sentence = full_df.iloc[sentence_id + 1, 1]
        except IndexError:
            next_sentence = ''
            
        context.append(prev_sentence + " [SEP] " + sentence +  " [SEP] " + next_sentence)

    filtered_df['SentenceCtxt'] = context
    
    final_df = filtered_df[['SentenceId', 'TargetExpression', 'OriginalSentence', 'MaskedSentence','SentenceCtxt']]
    
    return final_df

word_list = ['beans', 'pole beans', 'squash', 'squash vines,' 'tomato plants', 'sunflowers', 'potatoes', 
             'plants', 'tobacco', 'sweetgrass', 'strawberry', 'corn', 'woodpile', 'land', 'the land',
             'blackberries', 'garden', 'species', 'peach', 'food plants', 'the earth', 'tomatillos',
             'chile', 'vegetables', 'corn leaves', 'pumpkins', 'pumpkin', 'corn seed', 'bean seed',
             'three sisters' ,'sister', 'sisters', 'squash stem', 'flower', 'apple', 'ear of corn',
             'corncob', 'corn kernels', 'the corn mother', 'pod']

empty_list = []

result = pd.DataFrame(empty_list, columns=['SentenceId', 'TargetExpression', 'OriginalSentence', 
                                           'MaskedSentence', 'SentenceCtxt'])

for word in word_list:
    new_result = assemble_bert_tsv(sweetgrass, word)
    result = pd.concat([result, new_result])

result = result.reset_index(drop=True)
result.to_csv('/home/dustin/Python/Animacies/filtered/sweetgrass.tsv', sep='\t')

result.head(5000)

Unnamed: 0,SentenceId,TargetExpression,OriginalSentence,MaskedSentence,SentenceCtxt
0,0,beans,"It came to me while picking beans, the secret ...","It came to me while picking {}, the secret of ...",This is how the world keeps going. [SEP] It ca...
1,1,beans,I was hunting among the spiraling vines that e...,I was hunting among the spiraling vines that e...,"It came to me while picking beans, the secret ..."
2,10,beans,Seeds for this basket of beans were poked into...,Seeds for this basket of {} were poked into th...,"They complain about garden chores, as kids are..."
3,17,beans,"From her breasts grew corn, from her belly the...","From her breasts grew corn, from her belly the...",Her heart gave us the strawberry. [SEP] From h...
4,28,beans,"Maybe it was the smell of ripe tomatoes, or th...","Maybe it was the smell of ripe tomatoes, or th...",Each in our own way by a shower of gifts and a...
...,...,...,...,...,...
297,352,corn kernels,Only when the corn kernels are so fertilized w...,Only when the {} are so fertilized will they g...,The corn sperm swim down the silken cube to th...
298,354,the corn mother,Is it any wonder she is called the Corn Mother?,Is it any wonder she is called {}?,"A corncob is the mother of hundreds, as many c..."
299,357,pod,"I ask them to first open a slender pod, to see...","I ask them to first open a slender {}, to see ...",The students are contentedly munching fresh po...
300,358,pod,Jed slits a pod with his thumbnail and opens it.,Jed slits a {} with his thumbnail and opens it.,"I ask them to first open a slender pod, to see..."
