In [1]:
import pandas as pd
from keybert import KeyBERT
import os
from proj import dataloader

def csv_format_script(kw_lists):
    """Flatten each list of (keyword, probability) pairs into a single row.

    Within a row all keywords come first, in their original order, followed
    by the matching probabilities in the same order. For example
    ``[("a", 0.5), ("b", 0.3)]`` becomes ``["a", "b", 0.5, 0.3]``.

    Args:
        kw_lists: sequence of sequences of pairs; index 0 of each pair is
            the keyword and index 1 is its probability value.

    Returns:
        A new list holding one flattened row per inner sequence.
    """
    return [
        [pair[0] for pair in pairs] + [pair[1] for pair in pairs]
        for pairs in kw_lists
    ]
    

def add_titles(kw_lists, titles_list):
    """Return copies of the rows in ``kw_lists`` with their title prepended.

    Rows and titles are paired positionally with ``zip``, so if the two
    inputs differ in length the surplus entries of the longer one are
    ignored.

    The input rows are left untouched: a new list is built for every row
    instead of inserting into the caller's list, so calling this twice on
    the same data (e.g. when re-running a cell) does not prepend the title
    a second time.

    Args:
        kw_lists: iterable of row sequences.
        titles_list: iterable of titles, one per row.

    Returns:
        A new list of new rows, each of the form ``[title, *row]``.
    """
    return [[title, *kw_list] for kw_list, title in zip(kw_lists, titles_list)]


def converter_to_ps_script(kw_lists, titles_list):
    """Build a DataFrame with one row per document: title, keywords, probabilities.

    The columns are ``title``, then ``keyword0..keyword{n-1}``, then
    ``kw_prob0..kw_prob{n-1}``, where ``n`` is the length of the longest
    keyword list in ``kw_lists``.

    Shorter keyword lists are padded with ``(None, None)`` pairs before
    flattening. Without the padding a short row would be
    ``[title, words..., probs...]`` with fewer entries than the header, and
    its probabilities would no longer sit under the ``kw_prob*`` columns.

    Args:
        kw_lists: sequence of lists of (keyword, probability) pairs, one
            list per document. May be empty.
        titles_list: iterable of titles, one per document.

    Returns:
        pandas.DataFrame with the columns described above; for empty input
        it has only the ``title`` column and no rows.
    """
    # Size the header from the longest list, not from the first one only.
    max_length = max((len(kw_list) for kw_list in kw_lists), default=0)
    columns = ["title"]
    columns.extend(f"keyword{i}" for i in range(max_length))
    columns.extend(f"kw_prob{i}" for i in range(max_length))

    # Pad on copies so the caller's keyword lists are not modified.
    padded = [
        list(kw_list) + [(None, None)] * (max_length - len(kw_list))
        for kw_list in kw_lists
    ]
    return pd.DataFrame(add_titles(csv_format_script(padded), titles_list), columns=columns)

def merge_columns(df1, df2, on, common_columns):
    """Outer-merge two frames and collapse their shared columns into a union.

    After the merge every name in ``common_columns`` exists twice, as
    ``<name>_x`` (from ``df1``) and ``<name>_y`` (from ``df2``). For each
    row the values of all those columns are de-duplicated and written back
    into the same columns, left to right, with the unused trailing slots
    set to ``None``.

    The de-duplication keeps first-seen order (``_x`` columns first, then
    ``_y``), so the result is the same on every run; a plain ``set`` would
    order strings according to their per-process hash. Missing values
    (rows that exist in only one of the frames) are dropped before
    de-duplicating, so they never occupy a keyword slot.

    Args:
        df1: left DataFrame.
        df2: right DataFrame.
        on: column name(s) to join on.
        common_columns: names of the columns present in both frames whose
            values should be unioned.

    Returns:
        The merged DataFrame with the suffixed columns overwritten by the
        per-row union.
    """
    def combine(x):
        # dict.fromkeys removes duplicates while preserving order.
        unique = list(dict.fromkeys(value for value in x.tolist() if pd.notna(value)))
        # Pad back to the original width so "expand" yields a fixed number of columns.
        unique.extend([None] * (len(x) - len(unique)))
        return unique

    merge_ex = pd.merge(df1, df2, how="outer", on=on)
    suffixed = [f"{column}_x" for column in common_columns] + [f"{column}_y" for column in common_columns]
    merge_ex[suffixed] = merge_ex[suffixed].apply(combine, axis=1, result_type="expand")
    return merge_ex

# Load BillSum through the project's dataloader.
# NOTE(review): "local_datasets" is presumably the local dataset directory — confirm against proj.dataloader.
dataset = dataloader.load_billsum("local_datasets")

  from tqdm.autonotebook import tqdm, trange


# The cell below generates keywords from the dataset

In [17]:
keybert_ext = KeyBERT()

# For every split ("train", "test") and every field ("text", "summary"):
# lower-case the documents, extract their keywords with KeyBERT, and turn
# the result into a DataFrame keyed by the split's titles.
kw = {
    split: {
        field: converter_to_ps_script(
            keybert_ext.extract_keywords([doc.lower() for doc in dataset[split][field]]),
            dataset[split]["title"],
        )
        for field in ("text", "summary")
    }
    for split in ("train", "test")
}

# The cell below loads previously saved keywords

In [3]:
# Read the keyword tables back from CSV, one table per split and field.
kw = {
    "train": {
        "text": pd.read_csv("./.temp/keywords/billsum/train_text_kw.csv"),
        "summary": pd.read_csv("./.temp/keywords/billsum/train_sum_kw.csv"),
    },
    "test": {
        "text": pd.read_csv("./.temp/keywords/billsum/test_text_kw.csv"),
        "summary": pd.read_csv("./.temp/keywords/billsum/test_sum_kw.csv"),
    },
}

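# Drop the probability columns and merge the keyword columns to generate a union of keywords
The number of distinct keywords per title is variable, so unused keyword slots are left empty.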

In [4]:
def name_generator(count=5, prefix="kw_prob"):
    """Return the column names ``<prefix>0`` .. ``<prefix>{count-1}``.

    With the defaults this yields ``kw_prob0`` .. ``kw_prob4``, exactly as
    before; the parameters allow the same helper to produce other numbered
    column families (for instance ``prefix="keyword"``) or a different
    number of columns.

    Args:
        count: how many names to generate.
        prefix: text placed before the running index.

    Returns:
        List of ``count`` column-name strings.
    """
    return [f"{prefix}{i}" for i in range(count)]

prob_columns = name_generator()
keyword_columns = ["keyword{}".format(i) for i in range(0, 5)]

# Strip the probability columns from every keyword table.
# errors="ignore" keeps this cell re-runnable: the columns are already gone
# after a first run, and also when the tables were loaded from CSV files that
# the save cell wrote after this step.
for split in ("train", "test"):
    for field in ("text", "summary"):
        kw[split][field] = kw[split][field].drop(columns=prob_columns, errors="ignore")

# Per split, join the summary and text keyword tables on the title and collapse
# their keyword columns into a per-title union.
merged_kw = {
    split: merge_columns(kw[split]["summary"], kw[split]["text"], "title", keyword_columns)
    for split in ("test", "train")
}

# Save the merged keywords alongside the original keywords

In [6]:
# Save the per-field keyword tables and the merged keyword tables as CSV files.

# makedirs creates missing parent directories too, so one call covers the whole path.
os.makedirs(".temp/keywords/billsum/", exist_ok=True)

# File names match what the loading cell reads back: *_text_kw.csv holds the
# keywords of the text field and *_sum_kw.csv those of the summary field.
# (The train pair used to be written to each other's file.)
kw["train"]["text"].to_csv(".temp/keywords/billsum/train_text_kw.csv", index=False)
kw["train"]["summary"].to_csv(".temp/keywords/billsum/train_sum_kw.csv", index=False)
kw["test"]["text"].to_csv(".temp/keywords/billsum/test_text_kw.csv", index=False)
kw["test"]["summary"].to_csv(".temp/keywords/billsum/test_sum_kw.csv", index=False)

merged_kw["test"].to_csv(".temp/keywords/billsum/test_merged_kw.csv", index=False)
merged_kw["train"].to_csv(".temp/keywords/billsum/train_merged_kw.csv", index=False)