# Installing the requirements

In [48]:
!pip install -r requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Imports

In [49]:
import random
from keybert import KeyBERT
from multi_rake import Rake
import yake
import os
import collections
import json
# import num2words
import re
import datetime


# Setups

## Keyword Extractors

In [50]:
dt = datetime.datetime.now()
timestamp = dt.strftime('%Y_%m_%d_%H:%M:%S')[:-3]

kw_model = KeyBERT(model='all-mpnet-base-v2')


def kw_yake(text):
    # kw_extractor = yake.KeywordExtractor()
    language = "en"
    max_ngram_size = 1
    deduplication_threshold = 0.9
    numOfKeywords = 10
    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold,
                                                top=numOfKeywords, features=None)
    keywords = custom_kw_extractor.extract_keywords(text)
    keywords_str = ""
    for index, keyword in enumerate(keywords):
        if index == len(keywords) - 1:
            keywords_str += f"{keyword[0]}"
        else:
            keywords_str += f"{keyword[0]} "
    return keywords_str


def kw_yake_detailed(text):
    language = "en"
    max_ngram_size = 5
    deduplication_threshold = 0.0
    numOfKeywords = 10
    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold,
                                                top=numOfKeywords, features=None)
    keywords = custom_kw_extractor.extract_keywords(text)
    keywords_str = ""
    for index, keyword in enumerate(keywords):
        if index == len(keywords) - 1:
            keywords_str += f"{keyword[0]}"
        else:
            keywords_str += f"{keyword[0]} "
    return keywords_str


def kw_rake(text):
    rake = Rake()
    keywords = rake.apply(text)
    keywords_str = ""
    for index, keyword in enumerate(keywords):
        if index == len(keywords) - 1:
            keywords_str += f"{keyword[0]}"
        else:
            keywords_str += f"{keyword[0]} "
    return (keywords_str)


def kw_keybert(text):
    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 7),
        stop_words='english',
        highlight=False,
        top_n=1)
    keywords_list = list(dict(keywords).keys())
    if not keywords_list:
        return ""
    return (keywords_list[0])


def keyword_extractors(text):
    keywords = ({
        "KeyBERT": kw_keybert(text),
        "RAKE": kw_rake(text),
        "YAKE": kw_yake(text),
        "YAKE (Detailed)": kw_yake_detailed(text),
    })
    return keywords

## Decontraction Function and slicer

In [51]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    phrase = re.sub(r"Won\'t", "Will not", phrase)
    phrase = re.sub(r"Can\'t", "Can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\’ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

list_negations = [
    " not ",
    " not.",
    " no ",
    " no."
]

def slicer(my_str, sub):
    index = my_str.find(sub)
    if index != -1:
        return my_str[index + len(sub):]
    else:
        raise Exception('Sub string not found!')

## Folder Paths and reading the raw files

In [52]:
folderpath = r"./txt_conversations"  # make sure to put the 'r' in front
folderpath_stripped = folderpath.replace("./","")

filepaths = [os.path.join(folderpath, name) for name in os.listdir(folderpath)]
all_files = []

for path in filepaths:
    print(path)
    try:
        with open(path, 'r') as f:
            file = f.readlines()
            all_files.append(file)
    except:
        print("EXCEPTION" + path)
        continue

files_as_text = []
for elements in all_files:
    text = ""
    for lines in elements:
        try:
            text += lines
        except:
            print("Exception in all files")
            continue
    files_as_text.append(text)

with open('./output_files/all_text.json', mode='w', encoding='utf-8') as feedsjson2:
    json.dump(all_files, feedsjson2)

with open('./output_files/all_text_array.json', mode='w', encoding='utf-8') as feedsjson3:
    json.dump(files_as_text, feedsjson3)


./txt_conversations/1. Visiting a Travel Agent.txt
./txt_conversations/1. I Feel Like Chinese.txt
./txt_conversations/1. Making Plans for the Weekend.txt
./txt_conversations/1. I Go to College.txt
./txt_conversations/1. Looking for a Job.txt
./txt_conversations/1. Too Much Crime.txt
./txt_conversations/1. Which Bus to Take (1).txt
./txt_conversations/1. I Live in Pasadena.txt
./txt_conversations/1. What Will People Think.txt
./txt_conversations/1. Unemployment Insurance.txt


## Declaring variables

In [53]:
feeds = collections.defaultdict(list)
feeds2 = []
current = ""
files_total = 0
lines_total = 0
skipped = 0
not_skipped_answers = 0
skipped_too_long = 0
skipped_too_long_cache = 0
not_skipped_questions = 0
empty_line = 0
keywords_counter = ({
        "KeyBERT": 0,
        "RAKE": 0,
        "YAKE": 0,
        'YAKE_DETAILED': 0
    })

history_counter = ({
        "0": 0,
        "1": 0,
        "2": 0,
        '3': 0
    })

length = len(all_files)
counter_1 = 0

# Iterating through all files and create the dataset structure

In [54]:
for fp in all_files:
    counter_1 += 1
    print(f"______________## {(counter_1/length)*100} % ##______________")
    http_in_file = False
    for lines in fp:
        if "http" in lines:
            print("+++ HTTP IN LINE +++")
            http_in_file = True
    files_total += 1
    current = ""
    lines = []
    counter = 0
    cache = ""
    cache_happy_transformers = ""
    cache_array = []
    http_here = False

    for i, line in enumerate(fp):
        ## ALTERNATIVE 1 ##
        # For all conversations where the history is longer than 900 chars, the conversation is skipped
        # if len(cache) > 900:
        #     skipped_too_long_cache +=1
        #     continue
        ## ALTERNATIVE 1 ##


        ## ALTERNATIVE 2 ##
        # It is randomly chosen, how long the appended history in a conversation is
        # It is never longer than 3 QA-Pairs
        if len(cache_array) == 1:
            pop_random = random.randint(1, 5)
            if pop_random == 1:
                cache_array.clear()

        if len(cache_array) == 2:
            pop_random = random.randint(0, 5)
            if pop_random == 1:
                cache_array.pop(0)
            elif pop_random == 2:
                cache_array.pop(0)
                cache_array.pop(0)
            elif pop_random == 3:
                cache_array.clear()

        if len(cache_array) > 3:
            pop_random = random.randint(1, 3)
            if pop_random == 1:
                cache_array.pop(0)
            elif pop_random == 2:
                cache_array.pop(0)
                cache_array.pop(0)
            elif pop_random == 3:
                cache_array.clear()
        ## ALTERNATIVE 2 ##

        lines_total += 1

        # Sometimes there are HTTP links or empty lines in the conversations -> Those are skipped
        if http_in_file:
            if line == "" or line == "\n" or line == "\r" or http_here is False:
                if "http" in line:
                    http_here = True
                print(f"+++ Empty Line found in HTTP  {fp} +++")
                empty_line += 1
                continue
        elif line == "" or line == "\n" or line == "\r" or ": " not in line:
            print(f"+++ Empty Line found in {fp} +++")
            empty_line += 1
            continue
        else:
            line = slicer(line.strip(), ": ")

        # If the random number == 3 -> The conversation is skipped
        # Setting the interval between 0 and 2 leads to three different trainingfiles equal in size
        random_number = random.randint(0, 2)
        if random_number != 3:
            skip = random_number
        else:
            skipped += 1

        line = line.replace("\"", "").replace("’", "'").replace("–","-").replace("“", "\"").replace(" ", " ").replace("“", "\n").replace("é","é").replace("  ", " ").replace("…", "...").replace("‘", "'").replace("é","é")

        # Handling Questions and Answers in the Conversation file
        if (counter % 2) == 0 and not current == "question":
            # Saving the question to append it later on
            not_skipped_questions += 1
            question = line
            current = "question"
        elif current == "question":
            ## If a "?" is at the first answer, every second conversation will be skipped ##
            if counter == 1 and "?" in line:
                print("??? Question in the first line ???")
                if random.randint(0,1):
                    print("??? SKIPPED ???")
                    break
            ## If a "?" is at the first answer, every second conversation will be skipped ##

            current = "answer"
            answer = line

            # If the answer is too long, it will be skipped
            if len(answer) > 110:
                skipped_too_long += 1
                skipped += 1
                skip = 3

            # GPT3 Format ##
            if not skip == 3:
                not_skipped_answers += 1
                answer_prepared = decontracted(answer)
                random_keyword_generator = random.randint(1, 4)
                keywords = ""

                # Comment out one of the following alternatives. Only one can be active at a time.
                ## Alternative RND: Random Keyword Model Chosen ##
                if random_keyword_generator == 1:
                    keywords = kw_keybert(answer_prepared)
                    keywords_counter['KeyBERT'] += 1
                elif random_keyword_generator == 2:
                    keywords = kw_rake(answer_prepared)
                    keywords_counter['RAKE'] += 1
                elif random_keyword_generator == 3:
                    keywords = kw_yake(answer_prepared)
                    keywords_counter['YAKE'] += 1
                else:
                    keywords = kw_yake_detailed(answer_prepared)
                    keywords_counter['YAKE_DETAILED'] += 1
                ## Alternative RND: Random Keyword Model Chosen ##

                ## Alternative CR: Choose the shortest keywords ##
                # for x in range(4):
                #     if x == 0:
                #         keywords_attempt = kw_keybert(answer_prepared)
                #         if len(keywords_attempt) < len(keywords) or len(keywords) == 0:
                #             print(f"+++ KW SHORTENED +++ (WAS: {keywords} IS NOW: {keywords_attempt}")
                #             keywords = keywords_attempt
                #     elif x == 1:
                #         keywords_attempt = kw_rake(answer_prepared)
                #         if len(keywords_attempt) < len(keywords) or len(keywords) == 0:
                #             print(f"+++ KW SHORTENED +++ (WAS: {keywords} IS NOW: {keywords_attempt}")
                #             keywords = keywords_attempt
                #     elif x == 2:
                #         keywords_attempt = kw_yake(answer_prepared)
                #         if len(keywords_attempt) < len(keywords) or len(keywords) == 0:
                #             print(f"+++ KW SHORTENED +++ (WAS: {keywords} IS NOW: {keywords_attempt}")
                #             keywords = keywords_attempt
                #     else:
                #         keywords_attempt = kw_yake_detailed(answer_prepared)
                #         if len(keywords_attempt) < len(keywords) or len(keywords) == 0:
                #             print(f"+++ KW SHORTENED +++ (WAS: {keywords} IS NOW: {keywords_attempt}")
                #             keywords = keywords_attempt
                ## Alternative CR: Choose the shortest keywords ##

                # If the random chosen keyword extractor of alternative 1 did not find a keyword, the other extractors try to find one
                if keywords == "":
                    print("+++ Keyword Extraction -> Empty Result +++")
                    for x in range(4):
                        if keywords:
                            print(f"+++ KW FOUND +++ (By another extraxtor)")
                            break
                        if x == 0:
                            keywords = kw_keybert(answer_prepared)
                        elif x == 1:
                            keywords = kw_rake(answer_prepared)
                        elif x == 2:
                            keywords = kw_yake(answer_prepared)
                        else:
                            keywords = kw_yake_detailed(answer_prepared)
                if keywords == "":
                    print(f"+++ KW still not found +++ (No KWs found at all)")
                    skip = 3


                if not skip == 3:
                ## Check if the answer contains a negation
                    for negated_word in list_negations:
                        if negated_word in answer_prepared.lower():
                            # Check if the negation is present in the keywords
                            if negated_word not in keywords.lower():
                                ## Check if the negation is rather in the beginning or the end and insert it in the keywords
                                if answer_prepared.find(negated_word) < (len(answer_prepared) - len(negated_word))/2:
                                    keywords = negated_word.replace(" ", "").replace(".", "") + " " + keywords
                                else:
                                    keywords = keywords + " " +  negated_word.replace(" ", "").replace(".", "")

                    keywords_all = keyword_extractors(answer_prepared)

                    filename = f"./output_files/{timestamp}_part_{skip}_{folderpath_stripped}_training.json"
                    with open(filename, mode='w', encoding='utf-8') as feedsjson:
                        ## EVERY SNIPPET AS QA PAIR ONLY WITHOUT HISTORY ##
                        # entry = {'prompt': f"Question: {question}\nKeywords: {keywords}\nAnswer:\n\n###\n\n", 'completion': f"{answer} END" }
                        # feeds.append(entry)
                        # json.dump(feeds, feedsjson)
                        ## EVERY SNIPPET AS QA PAIR ONLY WITHOUT HISTORY ##

                        ## ALTERNATIVE WITH CACHE ARRAY ##
                        history = ""
                        for entries in cache_array:
                            history += entries
                        history_counter[str(len(cache_array))] += 1
                        entry2 = {'prompt': f"{history}Question: {question}\nKeywords: {keywords}\nAnswer:\n\n###\n\n",
                                  'completion': f"{answer} END"}
                        ## ALTERNATIVE WITH CACHE ARRAY ##

                        ## ALTERNATIVE W/O CACHE ARRAY ##
                        # entry2 = {'prompt': f"{cache}Question: {question}\nKeywords: {keywords}\nAnswer:\n\n###\n\n",
                        #           'completion': f"{answer} END"}
                        ## ALTERNATIVE W/O CACHE ARRAY ##

                        # Appending different arrays to have separate training files
                        feeds['all'].append(entry2)
                        feeds[skip].append(entry2)
                        json.dump(feeds[skip], feedsjson)

                    filename = f"./output_files/{timestamp}_all_{folderpath_stripped}_training.json"
                    with open(filename, mode='w', encoding='utf-8') as feedsjson3:
                        json.dump(feeds['all'], feedsjson3)

                    ## Save keywords to compare the methods later on
                    with open('./output_files/keyword_examples.json', mode='w', encoding='utf-8') as feedsjson2:
                        entry2 = {'sentence': f"{answer}", 'keywords': keywords_all}
                        feeds2.append(entry2)

                        # For the whole conversation ##
                        json.dump(feeds2, feedsjson2)
            cache += f"Question: {question}\nAnswer: {answer}\n"
            cache_array.append(f"Question: {question}\nAnswer: {answer}\n")
        else:
            print("+++ Skipped the whole file because order was mixed +++")
            print("Current: "+ current +" line that made it skip: Number: " + str(counter) + line)
            break
        counter += 1



______________## 10.0 % ##______________
+++ Empty Line found in ['\n', 'A: Thank you for coming to our travel agency. How can I help you?\n', 'B: I would like to book a trip to Disney World for my family.\n', 'A: Your family will have much fun. When do you want to go?\n', 'B: We want to go during the summer when the kids have off from school.\n', "A: I suggest early June because it won't rain too much and the park isn't as crowded as later in the season.\n", "B: Great! let's do the second week in June.\n", 'A: How many adults and how many children will be travelling?\n', 'B: There are two adults and two children.\n', 'A: Do you have an airline preference?\n', 'B: No. I have no preference as long as the flight is non-stop.\n', 'A: Do you want a budget hotel or a luxury hotel?\n', 'B: Can you book something in between?\n', '\n', '\n'] +++
+++ Empty Line found in ['\n', 'A: Thank you for coming to our travel agency. How can I help you?\n', 'B: I would like to book a trip to Disney World 

# Printing some stats

In [55]:
print("Files in total: " + str(files_total))
print("Lines in total : " + str(lines_total))
print("Empty Lines in total : " + str(empty_line))
print("Skipped too long cache : " + str(skipped_too_long_cache))
print("Skipped: " + str(skipped))
print("Skipped Too long: " + str(skipped_too_long))
print("Not Skipped Answers : " + str(not_skipped_answers))
print("Not Skipped Questions: " + str(not_skipped_questions))
print("Keywords counter: " + str(keywords_counter))
print("History counter: " + str(history_counter))
for all_elements in (feeds):
    print(f"Length of training split {all_elements}: {len(feeds[all_elements])}")

Files in total: 10
Lines in total : 142
Empty Lines in total : 28
Skipped too long cache : 0
Skipped: 0
Skipped Too long: 0
Not Skipped Answers : 56
Not Skipped Questions: 57
Keywords counter: {'KeyBERT': 8, 'RAKE': 12, 'YAKE': 17, 'YAKE_DETAILED': 19}
History counter: {'0': 19, '1': 18, '2': 8, '3': 11}
Length of training split all: 56
Length of training split 0: 17
Length of training split 2: 23
Length of training split 1: 16
