In [None]:
import os
import time
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import plotly.express as px

from elasticsearch import Elasticsearch

In [None]:
# Elasticsearch client.
# Connection settings come from the environment so that no credentials are stored in the notebook:
#   ES_HOST     - optional, defaults to "https://localhost:9200"
#   ES_USER     - optional, defaults to "elastic"
#   ES_PASSWORD - required; a KeyError is raised here if it is not set
es = Elasticsearch(
    hosts=[
        os.environ.get("ES_HOST", "https://localhost:9200")
    ],
    http_auth=(os.environ.get("ES_USER", "elastic"), os.environ["ES_PASSWORD"]),
    # NOTE(review): certificate verification is switched off, as in the original cell
    # (which had `use_ssl=True` and `ca_certs="./ca.crt"` commented out).
    # Consider passing a CA bundle and enabling verification outside local development.
    verify_certs=False,
)

In [None]:
# term query: https://www.getargon.io/docs/articles/elasticsearch/exact-text-search.html

# ngram query function
def query_ngram(search_dict):
    """Search the ``ngrams`` index for one ngram, narrowed by additional fields.

    Args:
        search_dict: Mapping of field name to value. The ``"ngram"`` entry is
            required and becomes a ``term`` filter on ``ngram.keyword``; every
            other entry becomes a ``match_phrase`` clause in the ``must`` list.

    Returns:
        Whatever ``es.search`` returns for the assembled bool query.

    Raises:
        ValueError: If ``search_dict`` has no ``"ngram"`` entry.
    """
    if "ngram" not in search_dict:
        raise ValueError('search_dict must contain an "ngram" entry')

    # Compare keys by value (`!=`), not by identity: `is not "ngram"` only
    # works when the key object happens to be the interned literal.
    must_clauses = [
        {"match_phrase": {field: value}}
        for field, value in search_dict.items()
        if field != "ngram"
    ]
    ngram_filter = {"term": {"ngram.keyword": search_dict["ngram"]}}

    # assemble and run the query
    return es.search(
        index="ngrams",
        size=10000,  # TODO: support a limit larger than 10000
        query={
            "bool": {
                "must": must_clauses,
                "filter": ngram_filter,
            }
        },
    )

In [None]:
# Returns an ngram DataFrame plus the total hit count.
# NOTE: here query["country"] is a list of country codes (a single code as a
# string is accepted too); for query_ngram it is normally a single string.
def create_ngram(query, timespan="years"):
    """Build a per-country, per-date table of averaged ngram percentages.

    ``query_ngram`` is called once per country, and the ``percentage`` values of
    all hits that share a country and a formatted date are averaged
    (sum divided by number of hits).

    Args:
        query: Search dict forwarded to ``query_ngram``. An optional
            ``"country"`` entry selects the countries to search; without it
            the built-in list of six countries is used. The dict passed in is
            not modified.
        timespan: ``"years"`` formats dates as ``"{year}-1-1"``, ``"months"``
            as ``"{year}-{month}-1"``; any other value uses
            ``"{year}-{month}-{day}"``.

    Returns:
        A tuple ``(df, total)``. ``df`` has the columns ``country``, ``date``
        and ``percentage`` and is ordered by date, then country. ``total`` is
        the sum of ``hits.total.value`` over all per-country responses.
    """
    default_countries = ["BG", "CZ", "DK", "NL", "SI", "GB"]
    date_formats = {"years": "{}-1-1", "months": "{}-{}-1"}
    columns = ["country", "date", "percentage"]

    # Work on a copy so the caller's dict keeps its original "country" entry.
    base_query = dict(query)
    countries = base_query.pop("country", default_countries)
    if isinstance(countries, str):
        # A bare string would otherwise be iterated character by character.
        countries = [countries]

    # str.format ignores surplus positional arguments, so the same three
    # values can be passed whatever the resolution.
    date_format = date_formats.get(timespan, "{}-{}-{}")

    # Collect plain records and build the frame once, instead of appending
    # row by row to a DataFrame.
    records = []
    results_total = 0
    for country in countries:
        results = query_ngram({**base_query, "country": country})
        results_total += results["hits"]["total"]["value"]

        for hit in results["hits"]["hits"]:
            src = hit["_source"]
            records.append({
                "date": date_format.format(src["year"], src["month"], src["day"]),
                "percentage": src["percentage"],
                "country": src["country"],
            })

    if not records:
        return pd.DataFrame(columns=columns), results_total

    # combine all hits that share a country and date
    df = (
        pd.DataFrame.from_records(records)
        .groupby(["country", "date"])["percentage"]
        .agg(total="sum", count="size")
        .reset_index()
    )
    df["percentage"] = df["total"] / df["count"]
    df = df.drop(columns=["total", "count"])

    def _chronological(column):
        # The date strings are built without zero padding, so a plain string
        # sort would place e.g. "2020-10-1" before "2020-2-1". Sort on the
        # parsed date instead; unparseable values become NaT and sort last.
        if column.name == "date":
            return pd.to_datetime(column, errors="coerce")
        return column

    return df.sort_values(by=["date", "country"], key=_chronological), results_total

In [None]:
# Start the stopwatch; perf_counter is meant for measuring elapsed time,
# unlike the wall clock returned by time.time().
timer = time.perf_counter()

# settings
ngram_term = "russia"
timespan = "years"

ngram_input = {"ngram" : ngram_term}

# build the DataFrame and plot it
ngram_df, ngram_total = create_ngram(ngram_input, timespan=timespan)
fig = px.line(ngram_df, x="date", y="percentage", color="country", title=f'Ngram: "{ngram_term}"')

fig.show()

# report elapsed time (query, aggregation and plotting) and the total hit count
print(f"'{ngram_term}' took {round(time.perf_counter() - timer, 2)}s, for {ngram_total} results")

In [None]:
# Start the stopwatch; perf_counter is meant for measuring elapsed time,
# unlike the wall clock returned by time.time().
timer = time.perf_counter()

# settings
ngram_term = "ukraine"
timespan = "years"

ngram_input = {"ngram" : ngram_term}

# build the DataFrame and plot it
ngram_df, ngram_total = create_ngram(ngram_input, timespan=timespan)
fig = px.line(ngram_df, x="date", y="percentage", color="country", title=f'Ngram: "{ngram_term}"')

fig.show()

# report elapsed time (query, aggregation and plotting) and the total hit count
print(f"'{ngram_term}' took {round(time.perf_counter() - timer, 2)}s, for {ngram_total} results")

In [None]:
# Start the stopwatch; perf_counter is meant for measuring elapsed time,
# unlike the wall clock returned by time.time().
timer = time.perf_counter()

# settings
ngram_term = "democracy"
timespan = "years"

# "country" is given as a list here, which limits create_ngram to these countries
ngram_input = {"ngram" : ngram_term,
              "country": ["NL"]}

# build the DataFrame and plot it
ngram_df, ngram_total = create_ngram(ngram_input, timespan=timespan)
fig = px.line(ngram_df, x="date", y="percentage", color="country", title=f'Ngram: "{ngram_term}"')

fig.show()

# report elapsed time (query, aggregation and plotting) and the total hit count
print(f"'{ngram_term}' took {round(time.perf_counter() - timer, 2)}s, for {ngram_total} results")

In [None]:
# Keyword query check: every ngram returned for a two-word search term
# should itself consist of exactly two space-separated parts.
ngram_input = {
    "ngram": "would have",
    "country": "NL",
}
result = query_ngram(ngram_input)

print("Hits: ", result["hits"]["total"]["value"], "\n")

for res in result["hits"]["hits"]:
    ngram = res["_source"]["ngram"]

    # Exactly one space means splitting on " " yields two parts: nothing to report.
    if ngram.count(" ") == 1:
        continue

    # Printed message (Dutch): "This should not happen"
    print("Dit zou niet horen \n", ngram)

# Printed message (Dutch): "If nothing but the hits is shown, it is fine"
print("Als er niets behalve de hits staat, dan is het goed")