In [1]:
# Imports from python libraries
import numpy as np
import os
import pandas as pd
import string
import sys
from whoosh.index import open_dir

# Imports from own script
from baseline_search import create_searchable_data

# Locations of the shared folder, the text fragments to search, and the index.
# `root` is already absolute, so the sub-paths are joined directly onto it.
# NOTE(review): the markdown instructions further down mention
# TargetSize100/Lemma_preserve_paragraph, whereas this path points at
# TargetSize150/text_preserve_paragraph -- confirm which data set is intended.
root = os.path.join(os.sep, "media", "sf_MartinedeVos")
search_dir = os.path.join(root, "TargetSize150", "text_preserve_paragraph")
indexdir = os.path.join(search_dir, "indexdir")

**1. Select folder with text fragments in the zip folder on surfdrive:**

../Data/NR-teksts/EviDENce_NR_output/TargetSize100/Lemma_preserve_paragraph.zip

*NB: The file names are long, and so is the path. Make sure to extract the zip folder to a high-level location on your computer to avoid a "path-too-long" error.*

**2. Index all documents (i.e., lemma fragments) in the directory**

* Create Schema
* Add documents
* Perform indexing

_NB: this step only has to be run once, or when data is added or changed_

In [2]:
# The creation of an index is only needed once, or again when data is added or
# changed; after that, opening the existing index is sufficient.
# Rather than commenting the call out by hand, it is skipped automatically when
# a non-empty index directory is already present, so the notebook can be re-run
# from top to bottom without re-indexing.
# NOTE(review): assumes create_searchable_data writes its index to `indexdir`
# (the directory opened with open_dir further down) -- confirm.
force_reindex = False  # set to True to rebuild after data was added or changed

index_present = os.path.isdir(indexdir) and len(os.listdir(indexdir)) > 0
if force_reindex or not index_present:
    create_searchable_data(search_dir)

**3. Store required data for manual annotation**

* First store in dataframe
* Create random sample
* Store sample in csv

In [3]:
# Open the existing index and read the stored fields of every document
ix = open_dir(indexdir)

with ix.searcher() as searcher:
    # Map each document title to a one-element list holding its stored text
    index_dic = dict(
        (fields['title'], [fields['textdata']])
        for fields in searcher.all_stored_fields()
    )

# One row per title (orient='index'); the text ends up in a single column
index_df = pd.DataFrame.from_dict(index_dic, orient='index')

In [4]:
# Fixed seed so that the same random sample is drawn on every run
random_state = 1111
sample_size = 100

sample = index_df.sample(n=sample_size, random_state=random_state)

In [5]:
# Store the sample in a csv file; the seed value is part of the file name so
# that the sample can be reproduced
annotation_dir = os.path.join(root, "surfdrive", "Projects", "EviDENce", "Data")
sample_file = "manual_annotation_sample_seed{}.csv".format(random_state)
manual_annotation_sample = os.path.join(annotation_dir, sample_file)

sample.to_csv(manual_annotation_sample)