In [5]:
# Imports from python libraries
import os
import re
import shutil
import string
import sys

import numpy as np
import pandas as pd
from whoosh.index import exists_in, open_dir

# Imports from own script
from baseline_search import create_searchable_data2

# Define paths
# Absolute base directory; every other location is derived from it.
root = os.path.join(os.sep, "media", "sf_MartinedeVos")
# Directory that is handed to the indexing step. `root` is already absolute,
# so no extra leading separator is needed when joining.
search_dir = os.path.join(root, "TargetSize150", "text_preserve_paragraph")
# Sub-directory of `search_dir` from which the index is opened.
indexdir = os.path.join(search_dir, "indexdir")

**1. Select folder with text fragments in the zip folder on surfdrive:**

../Data/NR-teksts/EviDENce_NR_output/TargetSize100/Lemma_preserve_paragraph.zip

*NB: The file names are long, and so is the path. Make sure to extract the zip folder to a high-level location on your computer to avoid a "path-too-long" error.*

**2. Index all documents (i.e., lemma fragments) in the directory**

* Create Schema
* Add documents
* Perform indexing

_NB: this step only has to be run once, or when data is added or changed_

In [6]:
# The creation of an index is only needed once; after that, opening the existing index is sufficient.
# The index is therefore only built when no index is found in `indexdir`, so that a full
# "Restart & Run All" does not re-index the whole corpus and nothing has to be commented out.
# Set `force_reindex` to True after fragments have been added or changed.
# NOTE(review): assumes create_searchable_data2 writes its index to `indexdir`, i.e. the
# location this notebook opens with open_dir — confirm against baseline_search.
force_reindex = False

if force_reindex or not exists_in(indexdir):
    create_searchable_data2(search_dir)

**3. Store required data for manual annotation**

* Store data of all fragments in corpus in a dataframe
* Take a random sample
* Store the sample, i.e., title+text, in a csv file

In [None]:
# Open the existing index and gather the stored title and text of every document
ix = open_dir(indexdir)

index_dic = {}
with ix.searcher() as searcher:
    for stored in searcher.all_stored_fields():
        # Wrap the text in a one-element list so that every title maps to one row
        index_dic[stored['title']] = [stored['textdata']]

# The titles become the row index of the dataframe; the text is the row content
index_df = pd.DataFrame.from_dict(index_dic, orient='index')

In [8]:
# Fixed seed, so that the same 100 rows are drawn from an unchanged dataframe on every run
random_state = 1111
sample = index_df.sample(random_state=random_state, n=100)

**Store file set used for manual annotation**

* Based on the sample created above, i.e., a csv file containing the title and text of every sampled fragment
* For every text fragment _mentioned_ in the csv file, store the corresponding _actual_ lemma-based fragment in a separate folder
* The automatic analyses, i.e., keyword search and machine learning, can be performed on the fragments in that folder

In [9]:
sample_file = "manual_annotation_sample_seed1111.csv"
manual_annotation_sample = os.path.join(root, "surfdrive", "Projects", "EviDENce", "Data", sample_file)

# Collect the titles in the sample from the stored csv file.
# NB: these titles refer to the files with text, not to the files with lemmas.
mannotate_df = pd.read_csv(manual_annotation_sample, sep=',', encoding="ISO-8859-1")

# `root` is absolute, so the paths below need no extra leading separator
source = os.path.join(root, "TargetSize150", "lemma_preserve_paragraph")
destination = os.path.join(root, "TargetSize150", "mannotate_lemma_sample1111")
delimiter = '_clipped'

# Make sure the destination folder exists: without it, shutil.copy treats `destination`
# as a file name and every copied fragment overwrites that single file.
os.makedirs(destination, exist_ok=True)
# The destination is the same for every fragment, so it is determined once
destpath = destination

# NOTE(review): `iloc[:-1]` skips the last row of the csv file — presumably a trailing
# row that is not a fragment; confirm that this is intended.
for title in mannotate_df['Titel'].iloc[:-1].tolist():
    # Recreate the folder name from the file title: the part before the delimiter,
    # followed by the delimiter itself
    parts = title.split(delimiter)
    if len(parts) != 2:
        # Same exception type as the tuple unpacking it replaces, but naming the culprit
        raise ValueError(
            "Expected exactly one '{}' in title, got: {!r}".format(delimiter, title)
        )
    fol = parts[0] + delimiter
    # Adapt the title to point to the lemma file instead of the text file.
    # NOTE(review): this replaces every occurrence of "text" in the title — verify
    # that titles contain it only once.
    file = title.replace("text", "lemma.txt")
    srcpath = os.path.join(source, fol, file)
    # Copy the file to the destination folder
    shutil.copy(srcpath, destpath)