In [None]:
# imports
import os
import time
import datetime
import csv
import re

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from elasticsearch import Elasticsearch, helpers
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Elasticsearch connection.
# The host, user and password can be overridden with the ES_HOST, ES_USER and
# ES_PASSWORD environment variables; when a variable is unset the previous
# hard-coded value is used, so existing runs behave exactly as before.
# NOTE(review): the fallback password is still stored in this notebook --
# rotate it and remove the fallback once ES_PASSWORD is set everywhere.
es = Elasticsearch(
    hosts=[
            os.environ.get("ES_HOST", "https://localhost:9200")
    ],
    http_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD", "NES9DZ-QwhanXAQf9caV"),
    ),
#     use_ssl=True,
    # NOTE(review): certificate verification is switched off here; the
    # commented-out ca_certs option below is the alternative.
    verify_certs=False,
#     ca_certs="./ca.crt"
)

# Data directories (overridable with TRANSLATED_CSV_DIR / NGRAM_CSV_DIR so the
# notebook is not tied to a single machine's folder layout)
translated_csv_dir = os.environ.get(
    "TRANSLATED_CSV_DIR",
    r"C:\Users\Asher\Documents\School\_Scriptie\Data\Data_Userlines_CSV",
)
ngram_csv_dir = os.environ.get(
    "NGRAM_CSV_DIR",
    r"C:\Users\Asher\Documents\School\_Scriptie\Data\Data_Ngram",
)

# CSV that is loaded into df_word_count (overridable with WORD_COUNT_CSV);
# get_es_date_dict reads its "date", "country" and "words" columns
word_count_csv = os.environ.get(
    "WORD_COUNT_CSV",
    'C:/Users/Asher/Documents/School/_Scriptie/Data/xml_word_counts.csv',
)

df_word_count = pd.read_csv(word_count_csv)

display(df_word_count)

In [None]:
# Returns a list with one dict per selected country: [{country_1}, {country_2}]
def get_csv_files(country_selection):
    """Collect the file names found under each selected country folder.

    Changes the working directory to ``translated_csv_dir`` and inspects
    every entry in it.

    Args:
        country_selection: A collection of folder names to include, or a
            single folder name given as a string.

    Returns:
        A list with one dict per matching folder. Each dict maps every
        directory path yielded by ``os.walk`` (relative to
        ``translated_csv_dir``) to the list of file names in that directory.
    """
    os.chdir(translated_csv_dir)

    # With a bare string the membership test below would be a substring
    # check ("N" in "NL" is True), so wrap it into a one-element selection.
    if isinstance(country_selection, str):
        country_selection = [country_selection]

    country_return_list = []

    # go through every entry of the translated CSV directory
    for country in os.listdir():

        # keep only the requested countries
        if country not in country_selection:
            continue

        # map each directory below the country folder to its file names
        paths_dict = {root: list(files) for root, dirs, files in os.walk(country)}

        country_return_list.append(paths_dict)

    return country_return_list

In [None]:
# Pull the year-month-day stamp out of a file name
def extract_file_date(file_name):
    """Return the first ``YYYY-MM-DD`` substring found in ``file_name``.

    The search result is subscripted directly, so a name without such a
    substring fails with a ``TypeError``.
    """
    date_pattern = re.compile(r"\d{4}-\d{2}-\d{2}")

    return date_pattern.search(file_name)[0]

### Shingles uit CSV

In [None]:
# Returns a dict with dates as keys and lists of file names as values
def get_files_per_date(country, after=None):
    """Group the files in a country's folder by the date in their names.

    Args:
        country: Folder name of the country; its top-level files are read
            through ``get_csv_files``.
        after: Optional ``"YYYY-MM-DD"`` string. Files dated on or before
            this day are left out (used to skip dates that were already
            uploaded). ``None`` keeps every file.

    Returns:
        A dict mapping each ``"YYYY-MM-DD"`` date string to the list of file
        names carrying that date, in the order the files were listed.
    """
    # a one-element list makes get_csv_files match the folder name exactly
    files = get_csv_files([country])[0][country]

    date_format = "%Y-%m-%d"

    # Compare datetime objects directly instead of going through
    # time.mktime, which depends on the local timezone.
    cutoff = None

    if after is not None:

        cutoff = datetime.datetime.strptime(after, date_format)

    dates_dict = {}

    # go through all files
    for file in files:

        date = extract_file_date(file)

        # leave out dates that were already uploaded
        if cutoff is not None and datetime.datetime.strptime(date, date_format) <= cutoff:

            continue

        dates_dict.setdefault(date, []).append(file)

    return dates_dict

In [None]:
# Builds the paths for all files that belong to one date
def get_date_paths(root, files):
    """Join ``root`` with every name in ``files``.

    Also changes the working directory to ``translated_csv_dir``.

    Args:
        root: Directory part placed in front of every file name.
        files: Iterable of file names.

    Returns:
        A list with ``os.path.join(root, name)`` for each name, in order.
    """
    os.chdir(translated_csv_dir)

    return [os.path.join(root, file_name) for file_name in files]

In [None]:
# Builds the n-gram vocabulary of all text in the CSV files of one date
def get_date_vocabulary(paths):
    """Collect the unique 1- to 5-grams found in the given CSV files.

    Changes the working directory to ``translated_csv_dir`` and reads the
    ``"value"`` column of every file.

    Args:
        paths: Iterable of CSV paths to read.

    Returns:
        The array produced by ``np.unique`` over the feature names of all
        files; an empty array when no file contributed any rows.
    """
    os.chdir(translated_csv_dir)

    vocabulary = np.array(())

    for csv_path in paths:
        frame = pd.read_csv(csv_path)

        # badly translated files that hold only an index (no rows) are ignored
        if len(frame.index) != 0:
            texts = frame["value"].to_list()

            vectorizer = CountVectorizer(ngram_range=(1, 5), token_pattern=r"(?u)\b\w+\b")
            vectorizer.fit_transform(texts)

            # merge this file's n-grams into the running vocabulary
            merged = np.hstack((vocabulary, vectorizer.get_feature_names_out()))
            vocabulary = np.unique(merged)

    return vocabulary

In [None]:
def get_date_counts(paths, vocabulary):
    """Count how often each n-gram occurs in the given CSV files.

    Changes the working directory to ``translated_csv_dir`` and reads the
    ``"value"`` column of every file.

    Args:
        paths: Iterable of CSV paths to read.
        vocabulary: Array of n-grams the vectorizer is fitted on.
            NOTE(review): the result lines up with ``vocabulary`` only when
            the fitted feature order equals the order of ``vocabulary`` --
            presumably true for the sorted, unique output of
            ``get_date_vocabulary``; confirm before passing anything else.

    Returns:
        A 1-D integer array with the summed counts over all files, one entry
        per fitted feature.
    """
    os.chdir(translated_csv_dir)

    counts = np.zeros(shape=(len(vocabulary)), dtype=int)

    # Fitting on the vocabulary does not depend on the file being read, so it
    # is done once instead of once per file. It stays lazy so that nothing is
    # fitted when every file is skipped.
    victor = None

    for path in paths:
        df = pd.read_csv(path)

        # badly translated files that hold only an index (no rows) are ignored
        if len(df.index) == 0:
            continue

        lines = df["value"].to_list()

        if victor is None:
            victor = CountVectorizer(ngram_range=(1, 5), token_pattern=r"(?u)\b\w+\b")
            victor.fit(vocabulary)

        # Sum the sparse matrix directly; .toarray() would first build a
        # dense (lines x n-grams) matrix just to add its rows together.
        counts = counts + np.asarray(victor.transform(lines).sum(axis=0)).ravel()

    return counts

In [None]:
def get_es_date_dict(vocabulary, country, date, counts, elastic_id, index="ngrams"):
    """Build the bulk-upload entries for all n-grams of one country and date.

    The word total is taken from the first ``df_word_count`` row whose
    ``"date"`` and ``"country"`` match; the lookup fails when there is none.

    Args:
        vocabulary: N-grams, paired position by position with ``counts``.
        country: Country code stored in every entry.
        date: String containing a ``YYYY-MM-DD`` date.
        counts: Occurrence count per n-gram.
        elastic_id: Last id that was handed out; numbering continues at
            ``elastic_id + 1``.
        index: Name of the target index.

    Returns:
        A tuple ``(bulk, elastic_id)`` with the list of entries and the last
        id that was used.
    """
    matching_rows = df_word_count[(df_word_count["date"] == date) & (df_word_count["country"] == country)]
    word_total = matching_rows["words"].values[0]
    year, month, day = re.search(r"\d{4}-\d{2}-\d{2}", date)[0].split("-")

    bulk = []

    for ngram, count in zip(vocabulary, counts):
        # ids always continue from the last id that was passed in
        elastic_id += 1

        # share of the day's words covered by this n-gram, in percent
        words_in_ngram = len(ngram.split(" "))
        percentage = round(((count * words_in_ngram) / word_total) * 100, 4)

        # document fields plus the bulk metadata around them
        bulk.append({
            "_index": index,
            "_id": elastic_id,
            "_source": {
                "ngram": ngram,
                "country": country,
                "year": year,
                "month": month,
                "day": day,
                "count": count,
                "percentage": percentage,
            },
        })

    return bulk, elastic_id

In [None]:
# # functie testruimte
# test_dates_dict = get_files_per_date("test")
# print(test_dates_dict)
# test_paths = get_date_paths("test", test_dates_dict['2014-04-16'])
# print(test_paths)
# test_vocabulary = get_date_vocabulary(test_paths)
# print(test_vocabulary)
# test_counts = get_date_counts(test_paths, test_vocabulary)
# print(test_counts)
# test_date_es_dict = get_es_date_dict(test_vocabulary, "test", '2014-04-16', test_counts, 0)
# print(test_date_es_dict)

### Countvectorizer 1.3
Elastic (index = ngrams):
- | Shingle | Land | Jaar | Maand | Dag | Aantal | Percentage |
 - Shingles doen voor een datum, voor een land
 - Percentage berekenen
 - Uploaden in ES

In [None]:
def upload_ngrams(countries, elastic_id=0, after=None):
    """Compute the n-gram counts per country and date and upload them.

    For every date of every country the vocabulary and counts are built from
    the CSV files, turned into bulk entries and sent with ``helpers.bulk``.
    A progress line is printed after each date and after each country.

    Args:
        countries: Country folder names to process, in order.
        elastic_id: Last id already in use; new entries continue after it.
        after: Optional ``"YYYY-MM-DD"`` string. Applied to the first
            country only, to continue a run from a specific date.

    Returns:
        ``True`` once every country has been processed.
    """
    for position, country in enumerate(countries):

        # Continue from a specific date only for the first country. The
        # position is used rather than comparing names, so a country that is
        # listed twice does not get the cut-off applied again.
        dates = get_files_per_date(country, after=after if position == 0 else None)

        # bookkeeping for the progress prints
        timer = time.time()
        total_dates = len(dates)

        for processed_dates, (date, files) in enumerate(dates.items(), start=1):
            dates_percentage = round((processed_dates / total_dates) * 100, 2)

            paths = get_date_paths(country, files)

            vocabulary = get_date_vocabulary(paths)
            counts = get_date_counts(paths, vocabulary)

            elastic_dict, elastic_id = get_es_date_dict(vocabulary, country, date, counts, elastic_id)

            # Retry with back-off when the cluster answers 429 instead of
            # aborting the whole run. Every entry carries an explicit _id,
            # so re-sending a chunk does not create duplicates.
            helpers.bulk(es, elastic_dict, max_retries=5)
            print(f"Progress: {round((time.time() - timer) / 60, 2)}m, Uploaded: {date} {country} ({dates_percentage}%), Last Id: {elastic_id}")

        print(f"Progress: {country} finished")

    return True

In [None]:
# Countries that are processed by the upload below
# uploaded = []

translated_countries = ["BG", "CZ", "DK", "NL", "SI", "GB"]

# upload_ngrams also accepts a last used id and an "after" date (see the
# commented arguments at the end of the call). The date is applied to the
# first country in the list only.
upload = upload_ngrams(translated_countries) #, 662485289, "2019-01-15"

print(upload)

In [None]:
# Progress: 93.87m, Uploaded: 2020-07-31 BG (100.0%), Last Id: 64824684

# Progress: 234.16m, Uploaded: 2021-04-01 CZ (100.0%), Last Id: 144041510

# Progress: 54.36m, Uploaded: 2017-01-11 DK (37.93%), Last Id: 179242678
# Progress: 54.46m, Uploaded: 2017-01-12 DK (38.08%), Last Id: 179311571
# Progress: 81.62m, Uploaded: 2020-09-29 DK (100.0%), Last Id: 232731837

# Progress: 303.28m, Uploaded: 2019-03-13 NL (73.06%), Last Id: 344101599
# Progress: 109.18m, Uploaded: 2020-11-03 NL (100.0%), Last Id: 385908154

# Progress: 32.42m, Uploaded: 2016-03-29 SI (28.25%), Last Id: 406585230
# Progress: 53.01m, Uploaded: 2018-12-17 SI (66.55%), Last Id: 439749779
# Progress: 27.21m, Uploaded: 2020-07-16 SI (100.0%), Last Id: 457496522

# Progress: 180.32m, Uploaded: 2017-01-24 GB (33.61%), Last Id: 563223404
# Progress: 172.62m, Uploaded: 2019-01-15 GB (46.53%), Last Id: 662485289

# TransportError: TransportError(429, 'circuit_breaking_exception', '[parent] Data too large, data for [<http_request>] would be [512818540/489mb], which is larger than the limit of [510027366/486.3mb], real usage: [512657568/488.9mb], new bytes reserved: [160972/157.1kb], usages [model_inference=0/0b, inflight_requests=160972/157.1kb, request=0/0b, fielddata=0/0b, eql_sequence=0/0b]')


### Countvectorizer 1.2

In [None]:
# # geef true als een bestand bestaat
# def get_ngram_exists(country, date):
#     os.chdir(ngram_csv_dir)
    
#     path = os.path.join(country, (date + ".csv"))
    
#     return os.path.isfile(path)

In [None]:
# # geeft woorden en hoevaak deze voorkomen
# def get_victor_from_csv(path, vocab=None, ngrams=5):
#     os.chdir(translated_csv_dir)
    
#     df = pd.read_csv(path)
    
#     # slecht vertaalde notebooks met alleen een index negeren
#     if len(df.index) == 0:
#         return np.array(()), np.array(())
    
#     lines = df["value"].to_list()
    
#     victor = CountVectorizer(ngram_range=(1, ngrams))
    
#     if vocab is not None:
        
#         victor.fit_transform(vocab)
#         counts = victor.transform(lines).toarray()
#         print("inserted vocac")
        
#     else:
    
#         counts = victor.fit_transform(lines).toarray()
    
#     # vocab is een andere mogelijkheid?
# #     headings = victor.vocabulary_
#     headings = victor.get_feature_names_out()
    
#     return headings, counts

In [None]:
# functie testruimte
# df_for_the_test = os.path.join("NL", "ParlaMint-NL_2014-04-16-tweedekamer-2.csv")
# df_for_the_test = os.path.join("NL", "ParlaMint-NL_2014-12-18-tweedekamer-25.csv")

# victor_output_headings, victor_output_counts = get_victor_from_csv(df_for_the_test)
# print(len(victor_output_headings))
# print(type(victor_output_counts))
# count_vect_df = pd.DataFrame(victor_output_counts, columns=victor_output_headings)

# display(count_vect_df)

In [None]:
# def create_vocabulary(country):
#     os.chdir(translated_csv_dir)
    
#     dates_dict = get_files_per_date(country)
    
#     # dingen voor overzicht in prints
#     timer = time.time()
#     total_dates = len(dates_dict.keys())
#     processed_dates = 0
    
#     vocabulary = np.array(())
    
#     # ga elke date langs
#     for date, files in dates_dict.items():
#         processed_dates += 1
#         dates_percentage = round((processed_dates / total_dates) * 100, 2)
        
#         # ga elke file langs per date
#         for file in files:
#             path = os.path.join(country, file)

#             headings, count = get_victor_from_csv(path, None, 5)
            
#             vocabulary = np.unique(np.hstack((vocabulary, headings)))
            
#         print(f"Progress: {round((time.time() - timer) / 60, 2)}m, Uploaded: {date} ({dates_percentage}%), Size: {len(vocabulary)}")
#     return vocabulary

In [None]:
# # functie testruimte
# real_input = "NL"

# test_vocabulary = create_vocabulary(real_input)

# print(len(test_vocabulary))

In [None]:
# def create_shingle_testing(country):
#     os.chdir(translated_csv_dir)
    
#     dates_dict = get_files_per_date(country)
    
#     # dingen voor overzicht in prints
#     timer = time.time()
    
#     longheadings = np.array(['aa', 'bb', 'cc', 'dd', 'ee', 'ff', 'gg', 'hh', 'ii', 'jj'])
    
#     for file in dates_dict["2014-04-16"]:
#         path = os.path.join(country, file)
        
#         headings, count = get_victor_from_csv(path, None, 1)

#         print(headings, np.sum(count, axis=0))

#     print(round((time.time() - timer) / 60, 2))

In [None]:
# # functie testruimte
# real_input = "test"

# create_shingle_testing(real_input)

### Countvectorizer 1.1

In [None]:
# # maak een csv waar per date alle shingles in staan
# def create_shingle_csv(file, headings, counts):
#     os.chdir(ngram_csv_dir)
    
#     with open(file, 'w', newline='', encoding="utf-8") as f:
#         w = csv.writer(f)
#         w.writerow(headings)
        
#         if len(counts) == 1:
        
#             w.writerow(counts[0])
            
#         else:
            
#             w.writerow(counts)

In [None]:
# # oude functie (np.sum(x, axis=0) doet hetzelfde)
# def get_rows_merged(rows):
#     return_rows = np.zeros(shape=(1,len(rows[0])), dtype=int)
        
#     # merge de rijen tot een enkele rij
#     if len(rows) != 1:

#         for row in rows:
#             return_rows = return_rows + row

#     else:

#         return_rows = rows
        
#     return return_rows

In [None]:
# # maakt een csv file aan per datum met shingles
# def create_shingle_per_day(country):
#     os.chdir(translated_csv_dir)
    
#     dates_dict = get_files_per_date(country)
    
#     # dingen voor overzicht in prints
#     timer = time.time()
#     total_dates = len(dates_dict.keys())
#     processed_dates = 0
    
#     # ga elke date langs
#     for date, files in dates_dict.items():
#         processed_dates += 1
#         dates_percentage = round((processed_dates / total_dates) * 100, 2)
        
#         if not get_ngram_exists(country, date):
        
#             # ga elke file langs per date
#             for iteration, file in enumerate(files):

#                 path = os.path.join(country, file)

#                 # basis lijst neerzetten bij eerste iteratie
#                 if iteration == 0:

#                     ngrams, counts = get_victor_from_csv(path)
#                     counts = get_rows_merged(counts)[0]
                
#                 else:

#                     new_ngrams, new_counts = get_victor_from_csv(path)

#                     # loop de ngram en counts
#                     for ngram, count in zip(new_ngrams, get_rows_merged(new_counts)[0]):

#                         # verander het ngram nummer als die al bestaat
#                         if ngram in ngrams:

#                             position = counts[np.where(ngrams == ngram)[0][0]]
#                             position = position + count

#                         # voeg de ngram toe aan ngrams en counts als die er nog niet is
#                         else:

#                             ngrams = np.append(ngrams, ngram)
#                             counts = np.append(counts, count)
            
#             create_shingle_csv(os.path.join(country, (date + ".csv")), ngrams, counts)
#             print(f"Progress: {round((time.time() - timer) / 60, 2)}m, Uploaded: {date} ({dates_percentage}%)")
            
#         else:

#             print(f"Skipping: {date} ({dates_percentage}%)")

In [None]:
# # functie testruimte
# real_input = "NL"

# tngrams, tcounts = create_shingle_per_day(real_input)

### Countvectorizer 1.0

In [None]:
# NOTE: Te langzame functie
# geeft een merged vectorizer terug
# def get_merged_victor(names1, vals1, names2, vals2):
#     names = np.unique(np.append(names1, names2))
#     vals = np.zeros(shape=(1,len(names)), dtype=int)
    
#     # voegt waarden toe aan vals
#     def merge_vals(names_to_merge, vals_to_merge):
#         m_vals_to_merge = np.zeros(shape=(1,len(vals_to_merge[0])), dtype=int)
        
#         # merge de rijen tot een enkele rij
#         if len(vals_to_merge) != 1:
            
#             for row in vals_to_merge:
#                 m_vals_to_merge = m_vals_to_merge + row
                
#         else:
            
#             m_vals_to_merge = vals_to_merge
        
#         # voeg de waarden toe aan de return set
#         for iteration, name in enumerate(names_to_merge):
#             pos = np.where(names == name)[0][0]
#             count = m_vals_to_merge[0][iteration]
            
#             vals[0][pos] = vals[0][pos] + count
    
#     merge_vals(names1, vals1)
#     merge_vals(names2, vals2)
    
#     return names, vals

### Oude code met Elastic

In [None]:
# # Elastic host
# es = Elasticsearch(
#     hosts=[
#             "https://localhost:9200"
#     ],
#     http_auth=("elastic", "NES9DZ-QwhanXAQf9caV"), #basic_auth werkt niet met dsl queries
# #     use_ssl=True,
# #     verify_certs=False,
#     ca_certs="./ca.crt"
# )

In [None]:
# # translated countries
# translated_countries = ["BG", "CZ", "DK", "NL", "SI"]

# # word counts df
# word_count_csv = 'C:/Users/Asher/Documents/School/_Scriptie/Data/xml_word_counts.csv'

# df_word_count = pd.read_csv(word_count_csv)

# # display(df_word_count)

In [None]:
# # NOTE: input is {search_key: value, search_key2: value2}
# # query function
# def query(search_dict):
#     processed_search_list = []
    
#     # loop door alle search elements heen
#     for k, v in search_dict.items():
#         processed_search_list.append({"match_phrase" : {k : v}})
        
#     # stel de uitkomst samen
#     result = es.search(
#     index = "search",
#     size = 10000, # TODO: Zorg dat er een groter limit is dan 10000
#     query = {
#         "bool" : {
#             "must": processed_search_list,
#         },
#     })
    
#     return result

In [None]:
# # https://copyprogramming.com/howto/using-shingles-and-fuzziness-in-elasticsearch-python-dsl
# def make_dsl_query(fields):
#     """
#     Construct a query
#     """
#     my_query = Search(using=es, index="search")
#     if fields['country'] and fields['content_translated']:
#         my_query = my_query.query(Q('bool', should=
#                    [Q("match", name__shingles=fields['country']),
#                     Q("match", surname__shingles=fields['content_translated'])]))
#     return my_query

# # if __name__ == '__main__':
# my_query = make_dsl_query(fields={"country": "NL", "content_translated": "bill"})
# response = my_query.execute()
# print(response)
# for hit in response:
#     print(hit.meta.score, hit.name, hit.surname)
#     break

In [None]:
# # NOTE: input is {search_key: value, search_key2: value2}
# # query function
# def query_new(search_dict):
#     processed_search_list = []
    
# #     search_dict["analyser"] = "standard_shingle"
    
#     # loop door alle search elements heen
#     for k, v in search_dict.items():
#         processed_search_list.append({"match_phrase" : {k : v}})
        
#     # stel de uitkomst samen
#     result = es.search(
#     index = "search",
#     size = 10000, # TODO: Zorg dat er een groter limit is dan 10000
#     query = {
#         "bool" : {
#             "must": processed_search_list,
#         },
#     })
    
#     return result

# result = query_new({"country": "NL", "content_translated": "bill"})

# print(result["hits"]["hits"])

### Query testing

In [None]:
# # TODO: debate uploads opnieuw runnen om andere vertaalde landen ook toe te voegen

# countries_returned = {}

# for ctr in translated_countries:
#     query_dit = query({"content_translated":"bill", "country":ctr})
    
#     for line in query_dit["hits"]["hits"]:

#         ct = line["_source"]["country"]

#         if ct in countries_returned.keys():

#             countries_returned[ct] += 1

#         else:

#             countries_returned[ct] = 1
    
# print(countries_returned)