In [None]:
import os
import time
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import plotly.express as px

from elasticsearch import Elasticsearch

In [None]:
# Elasticsearch connection.
# Host, user and password can be overridden with the ELASTIC_HOST, ELASTIC_USER
# and ELASTIC_PASSWORD environment variables; the defaults keep the notebook
# running exactly as before when none of them is set.
es = Elasticsearch(
    hosts=[
            os.environ.get("ELASTIC_HOST", "https://localhost:9200")
    ],
    http_auth=(
        os.environ.get("ELASTIC_USER", "elastic"),
        # NOTE(review): this fallback password is committed in the notebook;
        # rotate it and drop the default once ELASTIC_PASSWORD is set everywhere.
        os.environ.get("ELASTIC_PASSWORD", "NES9DZ-QwhanXAQf9caV"),
    ),
#     use_ssl=True,
    # NOTE(review): certificate verification is disabled; re-enable it together
    # with ca_certs when connecting to anything other than a local test node.
    verify_certs=False,
#     ca_certs="./ca.crt"
)

# Word counts dataframe; the CSV location can be overridden with WORD_COUNT_CSV
# so the notebook is not tied to one machine's absolute path.
word_count_csv = os.environ.get(
    "WORD_COUNT_CSV",
    'C:/Users/Asher/Documents/School/_Scriptie/Data/xml_word_counts.csv',
)

df_word_count = pd.read_csv(word_count_csv)

display(df_word_count)

# total ngram entries: 771,039,961

In [None]:
# Query the "ngrams" index (summarize: None, "year.keyword" or "month.keyword")
def query_ngram(search_dict, summarize=None):
    """Search the ``ngrams`` Elasticsearch index for one n-gram.

    Parameters
    ----------
    search_dict : dict
        Field -> value pairs to search on. The ``"ngram"`` entry becomes an
        exact ``term`` filter on ``ngram.keyword``; every other entry becomes
        a ``match_phrase`` clause in the ``must`` list. When no ``"ngram"``
        entry is present the filter is simply left out.
    summarize : str, optional
        Field to group on (the notebook uses ``"year.keyword"`` or
        ``"month.keyword"``). When given, no hits are requested; instead a
        ``multi_terms`` aggregation named ``categories`` groups on this field
        plus ``country.keyword`` and sums the ``count`` field into
        ``total_count`` for every bucket. When ``None`` the raw hits are
        requested.

    Returns
    -------
    The response of ``es.search`` (``es`` is the module-level client).
    """
    must_clauses = []
    ngram_filter = None

    # Split the search elements: the ngram is matched exactly, the rest by phrase.
    for field, value in search_dict.items():
        # Compare by value; an identity check on a string literal only works
        # when the interpreter happens to intern both strings.
        if field == "ngram":
            ngram_filter = {"term": {"ngram.keyword": value}}
        else:
            must_clauses.append({"match_phrase": {field: value}})

    bool_query = {"must": must_clauses}
    if ngram_filter is not None:
        bool_query["filter"] = ngram_filter
    query = {"bool": bool_query}

    if summarize is None:
        # Plain search returning the individual documents.
        # TODO: make sure more than 10000 hits can be retrieved
        return es.search(
            index="ngrams",
            size=10000,
            query=query,
        )

    # Group-by/sum search: hits are not needed (size 0), only the buckets.
    aggs = {
        "categories": {
            "multi_terms": {
                "terms": [
                    {"field": summarize},
                    {"field": "country.keyword"},
                ],
                # at most 1000 (summarize, country) buckets are returned
                "size": 1000,
            },
            "aggs": {
                "total_count": {
                    "sum": {
                        "field": "count"
                    }
                }
            },
        }
    }

    return es.search(
        index="ngrams",
        size=0,
        query=query,
        aggs=aggs,
    )

In [None]:
# Helper: chronological sort key for the "year-month-day" strings built below
def _date_sort_key(date):
    """Map a ``"year-month-day"`` string to a number that sorts chronologically.

    The parts do not have to be zero-padded (``"2016-2-1"`` sorts before
    ``"2016-10-1"``). Values that cannot be parsed sort last.
    """
    try:
        year, month, day = (int(part) for part in str(date).split("-"))
    except ValueError:
        return float("inf")
    return year * 10000 + month * 100 + day


# Builds an ngram dataframe
# NOTE: query["country"] is a list of country codes (a single string is also accepted)
def create_ngram_2(query, timespan="year"):
    """Build a per-date, per-country usage table for one n-gram.

    Parameters
    ----------
    query : dict
        Search fields passed on to ``query_ngram``; must contain ``"ngram"``.
        An optional ``"country"`` entry (list of codes, or one code as a
        string) selects which countries are kept in the result; it is not
        sent to Elasticsearch. ``timespan="month"`` additionally reads
        ``query["year"]``. The dict itself is not modified.
    timespan : str
        ``"year"`` or ``"month"`` use the summed aggregation of
        ``query_ngram``; any other value uses the individual hits and
        produces one row per hit (full ``year-month-day`` date).

    Returns
    -------
    (pandas.DataFrame, int)
        The dataframe has the columns ``date``, ``percentage`` and
        ``country`` and is sorted chronologically, then by country.
        ``percentage`` is ``count * words_in_ngram / total_words * 100``
        rounded to 4 decimals, where ``total_words`` comes from the
        module-level ``df_word_count`` (columns ``date``, ``country``,
        ``words``). Rows for which no word total is available are skipped.
        The second element is the sum of all counts returned by the search.

    Raises
    ------
    KeyError
        If ``query`` has no ``"ngram"`` entry, or no ``"year"`` entry while
        ``timespan="month"`` and the search returned buckets.
    """
    # Work on a copy so the caller's dict keeps its "country" entry.
    query = dict(query)

    # Countries to keep in the result (all known countries by default).
    countries = query.pop("country", ["BG", "CZ", "DK", "NL", "SI", "GB"])
    if isinstance(countries, str):
        countries = [countries]

    # Number of words in the ngram.
    multiplier = len(query["ngram"].split(" "))

    # Rows are collected here and turned into a dataframe once at the end.
    records = []

    def words_in_period(date_prefix, country):
        """Sum the word counts of `country` over all dates starting with `date_prefix`."""
        in_period = df_word_count["date"].str.startswith(date_prefix, na=False)
        in_country = df_word_count["country"] == country
        return df_word_count.loc[in_period & in_country, "words"].sum()

    def add_record(date, count, total_words, country):
        """Append one result row; skipped when there is no word total to divide by."""
        if total_words == 0:
            return
        percentage = round(((count * multiplier) / total_words) * 100, 4)
        records.append({"date": date, "percentage": percentage, "country": country})

    results_total = 0

    if timespan in ["year", "month"]:
        # Summed values: one bucket per (year or month, country).
        results = query_ngram(query, summarize=timespan + ".keyword")

        for bucket in results["aggregations"]["categories"]["buckets"]:
            period = bucket["key"][0]
            country = bucket["key"][1]
            count = bucket["total_count"]["value"]

            results_total += count

            if timespan == "month":
                year = query["year"]
                date = "{}-{}-1".format(year, period)
                # The trailing "-" keeps month "1" from also matching "10".."12".
                date_prefix = f"{year}-{period}-"
            else:
                date = "{}-1-1".format(period)
                date_prefix = f"{period}-"

            add_record(date, count, words_in_period(date_prefix, country), country)

        res = int(results_total)

    else:
        # Individual hits: one row per returned document.
        results = query_ngram(query)

        for hit in results["hits"]["hits"]:
            src = hit["_source"]

            count = src["count"]
            results_total += count

            country = src["country"]
            date = "{}-{}-{}".format(src["year"], src["month"], src["day"])

            on_date = (df_word_count["date"] == date) & (df_word_count["country"] == country)
            words = df_word_count.loc[on_date, "words"]
            if words.empty:
                # no word total known for this date and country
                continue

            add_record(date, count, words.iloc[0], country)

        res = results_total

    df = pd.DataFrame(records, columns=["date", "percentage", "country"])

    selected = df[df["country"].isin(countries)]

    # Sort on the parsed date: plain string order would put "2016-10-1"
    # before "2016-2-1" and make the plotted lines zigzag.
    selected = (
        selected
        .assign(_sort_key=selected["date"].map(_date_sort_key))
        .sort_values(by=["_sort_key", "country"])
        .drop(columns="_sort_key")
    )

    return selected, res

### Tests
- Eén result betekent dat het trefwoord één keer in de gehele dataset voorkomt

In [None]:
# Start timing this cell.
timer = time.time()

# Settings: the phrase to look up and the aggregation level.
ngram_term = "if there"
timespan = "year"

ngram_input = dict(ngram=ngram_term)

# Build the dataframe and plot one line per country.
ngram_df, ngram_total = create_ngram_2(ngram_input, timespan=timespan)

fig = px.line(
    ngram_df,
    x="date",
    y="percentage",
    color="country",
    title=f'Ngram: "{ngram_term}"',
)

fig.show()

# Report the elapsed time and the total returned by create_ngram_2.
print(f"'{ngram_term}' took {round(time.time() - timer, 2)}s, for {ngram_total} results")

In [None]:
# Start timing this cell.
timer = time.time()

# Settings: the phrase to look up, per month within a single year.
ngram_term = "if there"
timespan = "month"

ngram_input = {"ngram": ngram_term, "year": "2016"}

# Build the dataframe and plot one line per country.
ngram_df, ngram_total = create_ngram_2(ngram_input, timespan=timespan)

fig = px.line(
    ngram_df,
    x="date",
    y="percentage",
    color="country",
    title=f'Ngram: "{ngram_term}"',
)

fig.show()

# Report the elapsed time and the total returned by create_ngram_2.
print(f"'{ngram_term}' took {round(time.time() - timer, 2)}s, for {ngram_total} results")

In [None]:
# Start timing this cell.
timer = time.time()

# Settings: the phrase to look up, per day within a single month.
ngram_term = "if there"
timespan = "day"

ngram_input = {"ngram": ngram_term, "year": "2016", "month": "12"}

# Build the dataframe and plot one line per country.
ngram_df, ngram_total = create_ngram_2(ngram_input, timespan=timespan)

fig = px.line(
    ngram_df,
    x="date",
    y="percentage",
    color="country",
    title=f'Ngram: "{ngram_term}"',
)

fig.show()

# Report the elapsed time and the total returned by create_ngram_2.
print(f"'{ngram_term}' took {round(time.time() - timer, 2)}s, for {ngram_total} results")

### Oude functie

In [None]:
# import numpy as np

# # geeft een ngram df
# # NOTE: query["country"] = list(), normaal is dat een string
# def create_ngram(query, timespan="years"):
#     countries = ["BG", "CZ", "DK", "NL", "SI", "GB"]
    
#     # return dataframe
#     df = pd.DataFrame({'date': [],
#                        'percentage': [],
#                        'country': []
#                       })
    
#     # kijk door welke landen er moet worden gezocht
#     if "country" in query.keys():
#         countries = query["country"]
#         query.pop("country")
    
#     # voeg result toe aan df en format timespan
#     def append_to_df(date_format):
#         results_total = 0
    
#         # loop door de antwoorden per land
#         for country in countries:
#             query_country = query["country"] = country
            
#             results = query_ngram(query)
            
#             results_total = results_total + results["hits"]["total"]["value"]
            
#             for result in results["hits"]["hits"]:
#                 src = result["_source"]

#                 date = date_format.format(src["year"], src["month"], src["day"])

#                 df.loc[len(df.index)] = [date, src["count"], src["country"]]

#         return results_total
    
#     # timespan formatting
#     if timespan == "years":
#         res = append_to_df("{}-1-1")       
        
#     elif timespan == "months":
#         res = append_to_df("{}-{}-1")
        
#     else:
#         res = append_to_df("{}-{}-{}")
    
# #     display(df)
    
#     # combineer alle dates
#     df["count"] = df["country"]

#     df = df.groupby(["country", "date"]).agg({
#         'percentage' : np.sum,
#         "count": np.size
#     }).reset_index()

# #     df['percentage'] /= df['count']
#     df = df.drop(['count'], axis=1)
        
#     return df.sort_values(by=["date", "country"]), res

In [None]:
# timer = time.time()

# # instellingen
# ngram_term = "if there"
# timespan = "years"

# ngram_input = {"ngram": ngram_term}

# # maak df en plot
# ngram_df, ngram_total = create_ngram(ngram_input, timespan=timespan)

# fig = px.line(ngram_df, x="date", y="percentage", color="country", title=f'Ngram: "{ngram_term}"')

# fig.show()

# # timer
# print(f"'{ngram_term}' took {round(time.time() - timer, 2)}s, for {ngram_total} results")

In [None]:
# timer = time.time()

# # instellingen
# ngram_term = "if there"
# timespan = "months"

# ngram_input = {"ngram": ngram_term,
#               "year": "2014"}

# # maak df en plot
# ngram_df, ngram_total = create_ngram(ngram_input, timespan=timespan)

# fig = px.line(ngram_df, x="date", y="percentage", color="country", title=f'Ngram: "{ngram_term}"')

# fig.show()

# # timer
# print(f"'{ngram_term}' took {round(time.time() - timer, 2)}s, for {ngram_total} results")

In [None]:
# timer = time.time()

# # instellingen
# ngram_term = "if there"
# timespan = "days"

# ngram_input = {"ngram": ngram_term,
#               "year": "2014",
#               "month": "12"}

# # maak df en plot
# ngram_df, ngram_total = create_ngram(ngram_input, timespan=timespan)

# fig = px.line(ngram_df, x="date", y="percentage", color="country", title=f'Ngram: "{ngram_term}"')

# fig.show()

# # timer
# print(f"'{ngram_term}' took {round(time.time() - timer, 2)}s, for {ngram_total} results")

### Losse tests

In [None]:
# # keyword query test
# ngram_input = {"ngram" : "would have",
#                "country": "NL"}
# result = query_ngram(ngram_input)

# print("Hits: ", result["hits"]["total"]["value"], "\n")

# for res in result["hits"]["hits"]:
#     ngram = res["_source"]["ngram"]
    
#     if not len(ngram.split(" ")) == 2:
        
#         print("Dit zou niet horen \n", ngram)
        
# print("Als er niets behalve de hits staat, dan is het goed")