In [None]:
import os
import time

import pandas as pd
import plotly.express as px
from elasticsearch import Elasticsearch

In [None]:
# Elasticsearch connection.
# Every setting can be overridden through an environment variable so that
# credentials do not have to live in the notebook; the literals are only
# fallbacks that keep the existing local setup working unchanged.
# NOTE(review): the fallback password is a plaintext secret stored in the
# notebook - rotate it and drop the literal once ES_PASSWORD is set everywhere.
es = Elasticsearch(
    hosts=[os.environ.get("ES_HOST", "https://localhost:9200")],
    basic_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD", "NES9DZ-QwhanXAQf9caV"),
    ),
    # CA certificate file handed to the client for the HTTPS connection.
    ca_certs=os.environ.get("ES_CA_CERTS", "./ca.crt"),
)

In [None]:
# Word counts per file: a CSV that is read into `df_word_count`; the code below
# relies on its "filename" and "words" columns.
word_count_csv = 'C:/Users/Asher/Documents/School/_Scriptie/Data/xml_word_counts.csv'
df_word_count = pd.read_csv(word_count_csv)

display(df_word_count)

# Sanity check: look up the word count of one known file and print it.
test_count = df_word_count.loc[
    df_word_count["filename"] == 'ParlaMint-BE_2014-06-19-54-plenair-ip001x.xml',
    "words",
].to_numpy()[0]
print(test_count)

In [None]:
# NOTE: input is {search_key: value, search_key2: value2}
def query(search_dict, index="search", size=10000):
    """Run a boolean phrase search through the module-level ``es`` client.

    Every ``field: value`` pair of ``search_dict`` becomes one ``match_phrase``
    clause; the clauses are combined under ``bool.must``.

    Args:
        search_dict: Mapping of field name to the phrase to match on that field.
        index: Name of the index to search. Defaults to "search".
        size: Value passed as the ``size`` argument of ``es.search``.
            Defaults to 10000.

    Returns:
        Whatever ``es.search`` returns for the assembled query.
    """
    # TODO: make it possible to retrieve more than 10000 results.
    must_clauses = [
        {"match_phrase": {field: phrase}}
        for field, phrase in search_dict.items()
    ]

    return es.search(
        index=index,
        size=size,
        query={"bool": {"must": must_clauses}},
    )

In [None]:
# Query scratch space: run one example search, print every hit and time it.
start_time = time.time()

# Alternative example searches (kept for reference):
# test_result = query({"content_simplified":"ete", 
#                      "person_simplified":"FreDeriC BARBIER", 
#                      "country":"FR"})

# test_result = query({"content_simplified":"groen",
#                      "party":"SP",
#                      "year":"2018"})

# The search that is actually executed.
example_search = {
    "content_simplified": "democratie",
    "segment": "ParlaMint-NL_2017-02-08-tweedekamer-10.u200",
}
test_result = query(example_search)

hits_section = test_result['hits']
print("Got %d Hits:" % hits_section['total']['value'])

# One block of text per hit: speaker, party, date, segment id and content.
for hit in hits_section['hits']:
    print("%(person)s (%(party)s) \n(%(year)s-%(month)s-%(day)s) %(segment)s:\n %(content)s \n" % hit["_source"])

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
def get_file_name(file):
    """Derive the ``.xml`` file name that a segment identifier belongs to.

    Everything from the first "." onwards is discarded. When the remaining
    stem contains "FR", everything from the first "_u" onwards is discarded
    as well. ".xml" is then appended exactly once.

    Args:
        file: Segment identifier string.

    Returns:
        The derived file name ending in ".xml".
    """
    stem = file.split('.')[0]

    # NOTE(review): "FR" is matched anywhere in the stem, not only as the
    # country code - confirm no other identifiers contain it.
    if 'FR' in stem:
        # Trim the stem before adding the extension, so that an identifier
        # without "_u" does not end up as "<name>.xml.xml".
        stem = stem.split('_u')[0]

    return stem + '.xml'

In [None]:
def get_word_percentage(row):
    """Return ``row["percentage"]`` as a percentage of the file's word total.

    The word total is the "words" value of the first ``df_word_count`` row
    whose "filename" equals ``row["file"]``.
    """
    matches_file = df_word_count["filename"] == row["file"]
    file_word_total = df_word_count.loc[matches_file, "words"].to_numpy()[0]

    share = row["percentage"] / file_word_total
    return share * 100

In [None]:
# Takes a search term plus its query hits and returns a df for plotly express.
def get_ngram_df(query_input, query_result, timespan="years"):
    """Aggregate search hits into a per-country, per-date percentage table.

    For every hit, the occurrences of ``query_input`` in the hit's
    "content_simplified" text are counted (plain substring count) and
    multiplied by the number of space-separated words in ``query_input``.
    The counts are summed per file, converted to a percentage of the file's
    word total by ``get_word_percentage``, and averaged over all files that
    share the same country and date.

    Args:
        query_input: The search term or phrase.
        query_result: Iterable of hits; each hit needs a "_source" mapping
            with "year", "segment", "country" and "content_simplified", plus
            "month" / "day" when the chosen ``timespan`` uses them.
        timespan: "years" gives dates "<year>-1-1", "months" gives
            "<year>-<month>-1", any other value gives "<year>-<month>-<day>".

    Returns:
        DataFrame with the columns "date", "percentage" and "country", one row
        per (date, country) pair, sorted by "date". An empty frame with those
        columns is returned when there are no hits or no hit contains the term.
    """
    query_input_words = len(query_input.split(" "))

    # Collect plain records first and build the frame once, instead of
    # growing a DataFrame row by row.
    records = []
    for hit in query_result:
        src = hit["_source"]

        # Date granularity depends on the requested timespan.
        if timespan == "years":
            date = "{}-{}-{}".format(src["year"], 1, 1)
        elif timespan == "months":
            date = "{}-{}-{}".format(src["year"], src["month"], 1)
        else:
            date = "{}-{}-{}".format(src["year"], src["month"], src["day"])

        records.append({
            "date": date,
            "percentage": query_input_words * src["content_simplified"].count(query_input),
            "country": src["country"],
            "file": get_file_name(src["segment"]),
        })

    if not records:
        return pd.DataFrame(columns=["date", "percentage", "country"])

    df = pd.DataFrame(records, columns=["date", "percentage", "country", "file"])

    # Total number of matched words per file.
    df["percentage"] = df.groupby("file")["percentage"].transform("sum")

    # Collapse the per-hit rows and drop files without any match.
    df = df.drop_duplicates(["date", "percentage", "country", "file"])
    df = df.loc[df["percentage"] != 0].copy()

    # A row-wise apply on an empty frame does not yield a single column that
    # can be assigned back, so stop here when nothing is left.
    if df.empty:
        return df.drop(columns="file")

    # Per file: matched words as a percentage of the file's word total.
    df["percentage"] = df.apply(get_word_percentage, axis=1)

    # Per country and date: sum of the file percentages divided by the number
    # of files on that date.
    per_country_date = df.groupby(["country", "date"])["percentage"]
    df["percentage"] = per_country_date.transform("sum") / per_country_date.transform("size")

    # Keep a single row per (date, country); the file column is no longer needed.
    df = df.drop_duplicates(["date", "country"]).drop(columns="file")

    return df.sort_values(by=["date"])

In [None]:
# Demo: search one term without extra filters and plot it with the default
# timespan of get_ngram_df ("years").
test_term = "democratie"

test_newgram = query({"content_simplified": test_term})
test_ngram_df = get_ngram_df(test_term, test_newgram['hits']['hits'])

# display(test_ngram_df)

fig = px.line(
    test_ngram_df,
    x="date",
    y="percentage",
    color="country",
    title=f'Ngram: "{test_term}"',
)
fig.show()

In [None]:
# Demo: restrict the search to year 2018 and plot with the "months" timespan.
test_term = "democratie"

test_newgram = query({"content_simplified": test_term, "year": "2018"})
test_ngram_df = get_ngram_df(
    test_term,
    test_newgram['hits']['hits'],
    "months",
)

# display(test_ngram_df)

fig = px.line(
    test_ngram_df,
    x="date",
    y="percentage",
    color="country",
    title=f'Ngram: "{test_term}"',
)
fig.show()

In [None]:
# Demo: restrict the search to year 2018, month 07 and plot with the "days"
# timespan (full year-month-day dates).
test_term = "democratie"

test_newgram = query({
    "content_simplified": test_term,
    "year": "2018",
    "month": "07",
})
test_ngram_df = get_ngram_df(
    test_term,
    test_newgram['hits']['hits'],
    "days",
)

# display(test_ngram_df)

fig = px.line(
    test_ngram_df,
    x="date",
    y="percentage",
    color="country",
    title=f'Ngram: "{test_term}"',
)
fig.show()