In [1]:
import numpy as np
import os
import pandas as pd
import string
import sys

from whoosh import scoring
from whoosh import qparser

Index all documents (i.e., text fragments) in the directory
* Create Schema
* Add documents
* Perform indexing

NB: this step only has to be run once, or when data is added or changed

In [2]:
from baseline_search import create_searchable_data

In [3]:
# Provide paths to (NB: double nested) documents folder
home = "media/sf_MartinedeVos/surfdrive/Projects/EviDENce/"
search_dir = os.path.join(os.sep,home,"Data","NR-Teksts","EviDENce_NR_output","Size200","fragmented_lemmas")

In [4]:
create_searchable_data(search_dir)

Collect list of keywords from CEO-ECB mappings:
* remove pos-tag from each mention
* translate remaining lemma to Dutch 

In [5]:
from baseline_search import create_lemma_list
from baseline_search import eng_to_dutch_list

In [6]:
# Provide paths to (NB: double nested) documents folder and mapping file
mention_file = os.path.join(os.sep,home,"Analyses","MdV_selectedCEOECB.csv")

In [7]:
en_mentions = create_lemma_list(mention_file)

In [None]:
# This is a workaround as google translate API is not working 
#(JSONCODECError;?problem with VirtualBox and network conection?)
for i in en_mentions:
    string = ' '.join(i)
    print(string)

prefab_file = os.path.join(os.sep,home,"Analyses","nl_mentions.csv")
prefab_mentions = pd.read_csv(prefab_file,sep=';',encoding = "ISO-8859-1")
nl_mentions = list(prefab_mentions["Mention"])

for en,nl in zip(en_mentions, nl_mentions):
    print(en,'->',nl)

To do: 
* run lemmas on docs
* try out scoring and ranking
* plot violence occurence over fragments (x: # hits, y: # documents)
* plot types of violence pver fragments

Preprocess list of keywords to be used in search engine:
* sort
* concatenate in one query string

In [None]:
nl_mention_set = set(nl_mentions)
my_list = list(nl_mention_set)

my_quoted_list = ['"' + item + '"' for item in my_list]

nl_mention_query = " ".join(my_quoted_list)
nl_mention_query

In [10]:
from whoosh.index import open_dir

indexdir = os.path.join(os.sep,search_dir,"indexdir")
print(indexdir)
ix = open_dir(indexdir)

/media/sf_MartinedeVos/surfdrive/Projects/EviDENce/Data/NR-Teksts/EviDENce_NR_output/Size200/fragmented_lemmas/indexdir


For given query display:
* number of relevant documents overall:
    * hit vs no-hit
    * hits per keyword

In [None]:
import pandas as pd
from whoosh import qparser
from whoosh.index import open_dir

indexdir = os.path.join(os.sep,search_dir,"indexdir")
print(indexdir)
ix = open_dir(indexdir)

parser = qparser.QueryParser("content", schema=ix.schema,group=qparser.OrGroup)
my_query = parser.parse(nl_mention_query)

rows_list = []
titles_list = []

with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
    results = searcher.search(my_query,limit=None, terms = True)
    for res in results:
        titles_list.append(res["title"])
        row_dict = {key:0 for key in my_quoted_list}
        hits = [term.decode('utf8')  for where,term in res.matched_terms()]
        for hit in hits:
            # Could this be prettier? defaultdict does not allow custom-set keys, and dict.get()
            # seems to change all other values into NaN if one of them has a hit
            if hit in row_dict:
                row_dict[hit]+= 1
            else:
                pass
        rows_list.append(row_dict)


df = pd.DataFrame(rows_list)

df['title'] = titles_list
cols = df.columns.tolist()
df = df[[cols[-1]] + cols[:-1]]
df

In [46]:
df.fillna(0)
df['Total hits']=df.iloc[:,1:-1].sum()

Unnamed: 0,title,"""Doodslag""","""Oorlog""","""Plat branden""","""Rust in vrede""","""aanval""","""aanvallend""","""alarm""","""arm""","""arresteren""",...,"""vuur aansteken""","""vuur op""","""wijs een geweer""","""wond""","""worstelen""","""zeeroof""","""zelfmoord""","""zelfmoordpoging""","""zelfverdediging""",Total hits
0,GV_Traktor_Russenoorlog_03_conversation_clippe...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,GV_SiteFilms_Java_03_conversation_clipped_200_85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,GV_Traktor_Russenoorlog_07_conversation_clippe...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,GV_Traktor_Russenoorlog_01_conversation_clippe...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,GV_Zigma_koopvaardij_06_conversation_clipped_2...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,GV_Zigma_koopvaardij_06_conversation_clipped_2...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,GV_Verhalis_kloosterzusters_02b_conversation_c...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,GV_Zigma_koopvaardij_04_conversation_clipped_2...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,GV_Schiedam_gastgezin2_09_conversation_clipped...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,GV_Zigma_koopvaardij_01_conversation_clipped_2...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
df['Total hits']=df.iloc[:,1:-1].sum()