In [10]:
from IPython.core.debugger import set_trace

import numpy as np 
from math import ceil, floor
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

import nltk
nltk.download("punkt")
nltk.download('averaged_perceptron_tagger')
import re
from collections import Counter
import spacy, pytextrank

from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from multiprocessing import cpu_count

import torch
from torch.nn import BCELoss, Module, Linear, AvgPool1d, MaxPool1d
from torch.nn.utils import clip_grad_norm_

import hdbscan
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import seaborn as sns

from tqdm.notebook import tqdm
import time, logging, sys, json

%run -i "../news-topic-cls/core/models/base.py"
%run -i "../news-topic-cls/core/models/extension.py"
%run -i "../news-topic-cls/core/data/data.py"
%run -i "../news-topic-cls/core/utils/optim.py"
%run -i "../news-topic-cls/core/utils/utils.py"

[nltk_data] Downloading package punkt to /Users/bradloff/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bradloff/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


No Apex Import


In [11]:
data_path = "../data/article_confirmed.csv"
if "test" in data_path:
    df = pd.read_csv(data_path).drop(labels="Unnamed: 0", axis=1)
else:
    df = pd.read_csv(data_path)

#df = clean_data(df)
df, label_encoder = encode_labels(df)

abbrev_mapping = get_abbreveation_mapping(label_encoder)
df = binarize_labels(df, abbrev_mapping)
df.head()

Unnamed: 0,id,text,AG,BF,CE,ED,EN,FU,GH,HA,...,IN,MC,NR,PM,PS,RE,SD,TP,UD,WS
0,29186,Development jobs in Peru: What you need to kno...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,29211,A turn to the slums; a call for alms “The weak...,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
2,29233,Claire Kupper: An aid veteran serving conflict...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,29238,From Turkey to Peru Via UNDP: Tracking the Car...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,29241,"Tracy Morrison: Village Queen in Cameroon, Dev...",0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
def clean_data(dataframe):
    id_list = [62094]
    cleaned_df = dataframe[~dataframe.id.isin(id_list)]
    """
    event_preview_ids = []
    for index, row in tqdm(dataframe.iterrows()):
        if row.text[:15] == "Events preview:":
            event_preview_ids.append(row.id)
    long_enumeration_ids = [94003]
    cleaned_df = cleaned_df[~cleaned_df.id.isin(long_enumeration_ids + event_preview_ids)]
    """
    cleaned_df.replace("<.*>", "", regex=True, inplace=True)
    return cleaned_df

In [13]:
df = clean_data(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [6]:
lm_path = "../data/models/language_models/roberta-lm"

In [7]:
config_name = "roberta-base"
transformer = TransformerOptions(config_name, lm_path=lm_path)

In [8]:
for param in transformer.model.parameters():
    param.requires_grad = False

In [9]:
nlp = spacy.load("en_core_web_sm")
tr = pytextrank.TextRank(logger=None)
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

In [10]:
def compute_embeddings(tokenized_sentences: list, language_model):
    language_model.eval()
    for step, tokens in enumerate(tokenized_sentences):
        outputs = language_model(tokens)
        # set_trace()
        embedding = outputs[0][:, 0, :]
        if step == 0:
            embeddings = embedding.numpy()
        else:
            embeddings = np.concatenate((embeddings, embedding), axis=0)
    return embeddings


def get_sentences_for_summary(summary_df, cluster_center):
    summary_sentences = [0]*len(cluster_center)

    for index, center in enumerate(cluster_center):
        is_cluster = summary_df.cluster == index
        cluster_df = summary_df[is_cluster]
        min_distance = np.Inf
        for _, row in cluster_df.iterrows():
            distance = np.linalg.norm(center-row.embedding)

            if distance < min_distance:
                min_distance = distance
                summary_sentences[index] = (row.position, row.sentence)
    return sorted(summary_sentences, key=lambda x: x[0])


def get_row_of_point(summary_df, point):
    for _, row in summary_df.iterrows():
        if np.all(row.embedding == point):
            return row


def get_sentences_for_summary_by_exemplar(summary_df, exemplars_):
    exemplars = exemplars_.copy()
    number_clusters = len(exemplars)
    summary_sentences = []

    while len(summary_sentences) < 10:
        summary_sentences_before_loop = summary_sentences.copy()
        for index, exemplar in enumerate(exemplars):
            if len(exemplar) == 0:
                continue
            point = exemplar[0]

            row = get_row_of_point(summary_df, point)
            # set_trace()
            summary_sentences.append((row.position, row.sentence))

            exemplar = exemplar[1:]
            exemplars[index] = exemplar

            if len(summary_sentences) == 10:
                break
        if summary_sentences == summary_sentences_before_loop:
            break

    index = 0
    while len(summary_sentences) < 10:
        summary_sentences_before_loop = summary_sentences.copy()
        for cluster in range(number_clusters):
            cluster_points = summary_df[summary_df.cluster == cluster]
            try:
                row = cluster_points.iloc[index, :]
                summary_sentences.append((row.position, row.sentence))
            except IndexError:
                continue
            if len(summary_sentences) == 10:
                break
        if summary_sentences == summary_sentences_before_loop:
            break
        else:
            index += 1

    # set_trace()
    if len(summary_sentences) < 10:
        outlier_df = summary_df[summary_df.cluster == -1]
        for _, row in outlier_df.iterrows():
            summary_sentences.append((row.position, row.sentence))
            if len(summary_sentences) == 10:
                return sorted(summary_sentences, key=lambda x: x[0])
    else:
        return sorted(summary_sentences, key=lambda x: x[0])


def find_quotes(sentence):
    condition = re.compile(".*?”")
    matches = condition.finditer(sentence)
    quotes = [match.group().strip() for match in matches]
    if quotes == []:
        return [sentence]
    last_match = quotes[-1]
    lower_bound = sum([len(quote) for quote in quotes]) - len(last_match)
    last_match_end = sentence.find(last_match, lower_bound) + len(last_match)
    after_quotes = sentence[last_match_end:].strip()
    if after_quotes != "":
        quotes.append(after_quotes)
    return quotes


def sentenize(article):
    sentences = nltk.tokenize.sent_tokenize(article)
    for index, sentence in enumerate(sentences):
        quotes = find_quotes(sentence)
        sentences[index] = quotes

    def flatten(list_): return [item for sublist in list_ for item in sublist]
    return flatten(sentences)


def sentenize_without_quotes(article):
    article_cleaned = article.replace("“", "")
    article_cleaned = article_cleaned.replace("”", "")

    return nltk.tokenize.sent_tokenize(article_cleaned)


def postprocess_sentences(sentences):
    for index, sentence in enumerate(sentences):
        if sentence.count(",") > 15:
            tokens = nltk.word_tokenize(sentence)
            word_counter = len(tokens)
            if word_counter >= 200:
                sentences[index] = "List of upcoming events."

    return sentences


def cluster_embeddings_kmeans(sentences, sentence_embeddings, n_cluster):
    clusterer = KMeans(n_clusters=n_cluster)
    clusterer.fit(sentence_embeddings)

    summary_df = pd.DataFrame(data={
        "position": range(len(sentences)),
        "sentence": sentences,
        "embedding": sentence_embeddings.tolist(),
        "cluster": clusterer.labels_
    })
    return summary_df, clusterer.cluster_centers_


def cluster_embeddings_hdbscan(sentences, sentence_embeddings):
    clusterer = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=None)
    clusterer.fit(sentence_embeddings)

    summary_df = pd.DataFrame(data={
        "position": range(len(sentences)),
        "sentence": sentences,
        "embedding": sentence_embeddings.tolist(),
        "cluster": clusterer.labels_
    })

    return summary_df, clusterer.exemplars_


def summarize(article: str, transformer, cluster_alg: str = "hdbscan", n_cluster=None):
    sentences = sentenize_without_quotes(article)
    if len(sentences) <= 10:
        return article

    tokenized_sentences = [transformer.tokenizer(text=sentence,
                                                 truncation=True,
                                                 return_tensors="pt")["input_ids"]
                           for sentence in sentences]

    with torch.no_grad():
        sentence_embeddings = compute_embeddings(
            tokenized_sentences, transformer.model)

    if cluster_alg == "kmeans":
        assert n_cluster is not None
        summary_df, cluster_centers = cluster_embeddings_kmeans(sentences,
                                                                sentence_embeddings,
                                                                n_cluster=10)
        summary_sentences = get_sentences_for_summary(
            summary_df, cluster_centers)

    elif cluster_alg == "hdbscan":
        summary_df, cluster_exemplars = cluster_embeddings_hdbscan(sentences,
                                                                   sentence_embeddings)
        summary_sentences = get_sentences_for_summary_by_exemplar(
            summary_df, cluster_exemplars)

    return " ".join([sentence for _, sentence in summary_sentences])


def textrank_summary(article, processor):
    summ = processor(article)
    spacy_spans = summ._.textrank.summary(
        limit_phrases=None, limit_sentences=10)
    summary_sentences = [spacy_span.text for spacy_span in spacy_spans]
    return " ".join(summary_sentences)

### Single Article

In [15]:
is_id = df.id == 96632
article = df[is_id].text.values[0]
sentences = sentenize_without_quotes(article)

In [10]:
print("Number of sentences: {}".format(len(sentences)))
print("___________________________")
print("Sentences:")
for sentence in sentences:
    print(sentence)
    if sentence=="":
        print("Empty Sentence")
    print("\n")
print("___________________________")
print("Labels:")
for key, value in df[is_id].iloc[0].iteritems():
    if value==1:
        print(abbrev_mapping[key])

Number of sentences: 24
___________________________
Sentences:
Facebook fights fake coronavirus news, wants to build trust with public sector SAN FRANCISCO — With its power to influence global health information, Facebook is now helping fight fake coronavirus news.


Following the emergence of the new coronavirus, COVID-19, and the proliferation of bad information about it, the social media giant has already begun removing bogus information about cures and prevention methods.


Facebook is also looking to partnerships with the public sector to replace this misinformation with helpful, credible alternatives, said Kang-Xing Jin, Facebook's head of health, who spoke at Devex’s Prescription for Progress event on technology for health in San Francisco, California, earlier this month.


We need to help get people access to credible information from trusted sources in the moments that they’re seeking it out.


--— Kang-Xing Jin, head of health, Facebook We’re working really closely with organ

### Computing single Summary

In [13]:
summary_df = pd.read_csv("../data/article_confirmed_summary.csv")

In [17]:
start = time.time()
summary_cluster = summarize(article=article, cluster_alg="hdbscan", transformer=transformer)
stop = time.time()
print("Time {}".format(stop-start))
print("Summary: \n{}\n".format(summary_cluster))

Token indices sequence length is longer than the specified maximum sequence length for this model (945 > 512). Running this sequence through the model will result in indexing errors


Time 1.3889808654785156
Summary: 
Researchers were clear on one point though: There has not been enough good research into potential links between malaria prophylactics and a variety of health outcomes experienced by people who have taken them. The evidence base to make a determination is really quite limited, but as far as the question being a reasonable and appropriate one to ask, we think that is the case, Savitz told Devex. In addition to military personnel and others deployed overseas, the researchers looked at available research on malaria drug use and effects among U.S. Peace Corps volunteers, many of whom serve in malaria-endemic countries, and who are required to take malaria prevention medication. Approximately 10% of malaria cases reported annually in U.S. citizens by the Centers for Disease Control and Prevention occur in Peace Corps volunteers, according to the report. Each of these comes with recommendations for who should take them and the concurrent and short-term adver

In [18]:
start = time.time()
summary_textrank = textrank_summary(article, nlp)
stop = time.time()
print("Time {}".format(stop-start))
print("Summary: \n{}\n".format(summary_textrank))

Time 0.19610881805419922
Summary: 
In addition to military personnel and others deployed overseas, the researchers looked at available research on malaria drug use and effects among U.S. Peace Corps volunteers, many of whom serve in malaria-endemic countries, and who are required to take malaria prevention medication. The researchers looked at the available evidence for associations between specific malaria drugs and persistent health outcomes — those that appear during the period of the medication and persist after the course has ended — as well as latent health outcomes — those that appear after use is completed. Approximately 10% of malaria cases reported annually in U.S. citizens by the Centers for Disease Control and Prevention occur in Peace Corps volunteers, according to the report. The review was only able to find sufficient evidence between one malaria medication and one persistent health outcome, and it should probably not be cause for great concern, Savitz said. The study, p

In [17]:
tokenized_summary = transformer.tokenizer.encode(summary_textrank)
print("Length tokenized Summary: {}".format(len(tokenized_summary)))

Length tokenized Summary: 462


In [30]:
summary_row.id = 96627
summary_row.text = article
summary_row.summary_cluster = summary_cluster
summary_row.summary_textrank = summary_textrank
summary_row.loc[list(abbrev_mapping.keys())] = row.loc[list(abbrev_mapping.keys())]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [33]:
summary_df.iloc[15,:] = summary_row

In [35]:
summary_df.to_csv("../data/article_confirmed_roberta-lm_summary.csv", index=False)

### Summaries

In [None]:
summaries = pd.read_csv("../data/test_data_summary.csv")

In [None]:
is_id = summaries.id == 29211
print(summaries[is_id].summary.values[0])

In [None]:
tokenized_length = []
invalid_ids = []
long_summary_ids = []
for index, row in summaries.iterrows():
    """
    print(row.text)
    print("\n")
    print(row.summary)
    print("\nLabels:")
    for key, value in row.iteritems():
        if value==1:
            print(abbrev_mapping[key])
    break
    """
    try:
        tokenized_summary = transformer.tokenizer.encode(row.summary)
        tokenized_length.append(len(tokenized_summary))
        if len(tokenized_summary)>511:
            long_summary_ids.append(row.id)
    except:
        invalid_ids.append(row.id)        

In [None]:
len(long_summary_ids)

In [None]:
invalid_ids

## Data Quality

In [None]:
articles = df.text.values
for index, row in df.iterrows():
    article = row.text
    sentences = sentenize(article)
    for sentence in sentences:
        if sentence.count(",") > 15:
            tokens = nltk.word_tokenize(sentence)
            word_counter = len(tokens)
            if word_counter >= 200:
                print(row.id)
                print(sentence)
                print("\nCommas: {}".format(sentence.count(",")))
                print("\nLabels:")
                for key, value in row.iteritems():
                    if value==1:
                        print(abbrev_mapping[key])
                print("\n")
                tokens = nltk.word_tokenize(sentence)
                text = nltk.Text(tokens)
                tagged = nltk.pos_tag(text)
                counts = Counter(tag for word,tag in tagged)
                verb_counter = counts["VB"] + counts["VBD"] + counts["VBG"] + counts["VBN"] + counts["VBP"] + counts["VBZ"] 
                print("Verb Counter: {}".format(verb_counter))
                print("Word Counter: {}".format(word_counter))
                print("_________________________\n")

In [None]:
len("Events preview:")

In [None]:
counter=0
for index, row in df.iterrows():
    article = row.text
    if "Events preview:" in article:
        counter +=1
        """
        sentences = sentenize(article)
        print(row.id)
        print("Number of Sentences: {}".format(len(sentences)))
        for sentence in sentences:
            print("\n")
            print(sentence)
        print("\nLabels:")
        for key, value in row.iteritems():
            if value==1:
                print(abbrev_mapping[key])
                if abbrev_mapping[key]=="Media And Communications":
        print("_______________________________\n")
        """
print(counter)