# Text Summarization

In [1]:
import os
import pandas as pd
import numpy as np
import warnings

from tqdm import tqdm
from nltk import sent_tokenize
from itertools import groupby
from operator import itemgetter
warnings.filterwarnings("ignore")

from src.summarizers.textrank import KeySentenceExtractor
from src.summarizers.bartplm import BartSummarizer

In [5]:
def combine_summaries(ext_itos, abt_itos):
    """
    Merge extractive and abstractive summaries into one ordered summary.

    Args:
        ext_itos (dict): extracted summaries with format "index_to_sentence"
        abt_itos (dict): abstracted summaries with format "index_to_sentence"
    Returns:
        tuple: (summary_itos, summary)
            summary_itos (dict): merged "index_to_sentence", sorted by index
            summary (str): merged sentences joined into a single string
    Func:
        Before combining two summaries, exclude the sentences in extracted sentences which are:
        1) made of 5 or fewer space-separated words
        2) incomplete quoted sentences (an odd number of double quotes)
        3) duplicated sentences compared with abstracted summaries
           (an index already present in abt_itos keeps the abstracted sentence)

        When combining, runs of consecutive indices are joined with a single space,
        and " ... " is inserted between runs that are not consecutive in order.

        Neither input dict is modified.
    """

    # Work on a copy so the caller's abstractive summary is left untouched.
    merged = dict(abt_itos)

    for idx, sent in ext_itos.items():
        # 1) too short to be informative
        if len(sent.split(" ")) <= 5:
            continue
        # 2) an odd number of double quotes means a quotation was cut in half;
        #    sentences with balanced quotes are complete and are kept
        if sent.count('"') % 2 != 0:
            continue
        # 3) the abstracted sentence wins when both summaries cover the same index
        if merged.get(idx) is None:
            merged[idx] = sent

    summary_itos = dict(sorted(merged.items(), key=itemgetter(0)))

    # Consecutive indices share the same (position - index) difference, so
    # grouping on that difference yields the runs of adjacent sentences.
    chunks = []
    for _, run in groupby(enumerate(summary_itos), lambda pair: pair[0] - pair[1]):
        chunks.append(" ".join(summary_itos[idx] for _, idx in run))

    summary = " ... ".join(chunks)

    return summary_itos, summary

In [6]:
# Sample English news article (single paragraph) used as the input document
# for the summarization demo; it mixes "..." and ``...`` quotation styles.
doc = '''São Paulo – "The United Arab Emirates took in February for the first time the lead in poultry imports from Brazil, industry association ABPA reported." The Gulf country surpassed China, which had been the leading poultry buyer until then. The UAE acquired 42,800 tonnes in poultry, up 89.9% year on year. ``The UAE gained prominence in exports from Brazil in the last couple of months, which was crucial, along with the increase in sales to Mexico and the European Union,`` ABPA Markets director Luís Rua was quoted as saying in a statement regarding February exports. According to Rua, the level of purchases from these regions is expected to be maintained over the following months, particularly because Ukraine, which is a strong competitor for Brazil in poultry supply to destinations like the EU, Saudi Arabia and the Gulf states, is likely to export lesser volumes due to the war with Russia. Overall poultry exports from Brazil stepped up 7.4% in February to 348,800 tonnes, including raw and processed products. In revenue, exports grossed USD 663 million last month, up 27.1%. China, now the second largest importer of poultry from Brazil, brought in 42,300 tonnes last month, down 8.4% year on year. Third came South Africa at 30,700 tonnes. ABPA also highlighted sales to the Mexico, up 358% to 19,600 tonnes, and the EU at 16,500 tonnes, up 35.1%. In the first two months of the year, poultry exports reached 723,700 tonnes, up 13% year on year to 640,400 tonnes. Revenue was up 33.9% to USD 1.280 billion from USD 956.1 million one year ago. Translated by Guilherme Miranda'''

In [7]:
# Instantiate the extractive and the abstractive summarizer.
textrank_summarizer = KeySentenceExtractor()
bart_summarizer = BartSummarizer()

# Alternative input: sample one English document from the dataset instead, e.g.
#   doc = data[data.language_id == "en"].sample(n=1).raw_body.iat[0]
tok_doc = sent_tokenize(doc)

# Extractive pass; the result has the shape {idx: sent, ...}.
ext_itos = textrank_summarizer.summarize(
    doc=doc,
    embedding=True,
    directed=True,
    min_sim=0.2,
    pos_emph=None,
    emph_rate=None,
    max_epoch=100,
    damping_factor=0.85,
    early_stop=True,
    topk=2,
)

# Abstractive pass; the result has the shape {idx: sent, ...}.
abt_itos = bart_summarizer.summarize(
    doc,
    num_beams=2,
    length_penalty=2.0,
    no_repeat_ngram_size=4,
)

# Show both intermediate results, each preceded by a blank line and a label.
print(f"\n|extractive_body|\n{ext_itos}")

print(f"\n|abstractive_body|\n{abt_itos}")

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]


|extractive_body|
{0: 'São Paulo – "The United Arab Emirates took in February for the first time the lead in poultry imports from Brazil, industry association ABPA reported."', 1: 'The Gulf country surpassed China, which had been the leading poultry buyer until then.'}

|abstractive_body|
{2: 'The UAE acquired 42,800 tonnes in poultry, up 89.9% year on year.', 7: 'China, now the second largest importer of poultry from Brazil, brought in 42,300 tonnes last month.', 5: 'Overall poultry exports from Brazil stepped up 7.4% in February to 348,800 tonnes.'}


In [8]:
# Merge the extractive and abstractive results, then print the source text,
# the merged index-to-sentence mapping and the final summary string.
summary_itos, summary = combine_summaries(ext_itos, abt_itos)
print(
    "",
    "|body|",
    doc,
    "",
    "|summary itos|",
    summary_itos,
    "",
    "|summary|",
    summary,
    sep="\n",
)


|body|
São Paulo – "The United Arab Emirates took in February for the first time the lead in poultry imports from Brazil, industry association ABPA reported." The Gulf country surpassed China, which had been the leading poultry buyer until then. The UAE acquired 42,800 tonnes in poultry, up 89.9% year on year. ``The UAE gained prominence in exports from Brazil in the last couple of months, which was crucial, along with the increase in sales to Mexico and the European Union,`` ABPA Markets director Luís Rua was quoted as saying in a statement regarding February exports. According to Rua, the level of purchases from these regions is expected to be maintained over the following months, particularly because Ukraine, which is a strong competitor for Brazil in poultry supply to destinations like the EU, Saudi Arabia and the Gulf states, is likely to export lesser volumes due to the war with Russia. Overall poultry exports from Brazil stepped up 7.4% in February to 348,800 tonnes, including 

# Language Identification

In [1]:
from src.preprocessors.lang_identifier import FasttextIdentifier

fasttext = FasttextIdentifier()
MAX_LENGTH = 128
text = "안녕하세요."

# Take the first entry of the first element of the prediction and keep what
# follows the "__label__" marker (presumably labels look like "__label__ko").
top_label = fasttext.predict_lang(text)[0][0]
lang = top_label.split("__label__")[1]
print(lang)

ko




# Query Data

In [None]:
from src.loaders.loader import QueryLoader
from src.loaders.engine import Engine, SnowflakeEngine

# Item table: reuse the local TSV cache when it exists; otherwise query the
# database and write the result to the cache file for the next run.
fn = "./data/item.tsv"
if not os.path.exists(fn):
    # get data from DB
    SQL = QueryLoader().fetch["item"]
    engine = Engine(SQL)
    engine.connect_database()
    item_data = engine.fetch_data()

    item_data.to_csv(fn, sep="\t", index=False)
else:
    item_data = pd.read_csv(fn, sep="\t")
print("Number of items before processing: ", len(item_data.item_id.unique()))


# Price table ("price_3y"): same cache-or-query procedure as the item table.
fn = "./data/price_3y.tsv"
if not os.path.exists(fn):
    # get data from DB
    SQL = QueryLoader().fetch["price_3y"]
    engine = Engine(SQL)
    engine.connect_database()
    price_data = engine.fetch_data()

    price_data.to_csv(fn, sep="\t", index=False)
else:
    price_data = pd.read_csv(fn, sep="\t")
print("Number of entries before processing: ", len(price_data.entry_id.unique()))