In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
# create a pipeline from the etl_pipeline package
from pipeline_executor import PipelineExecutor
from nlp_analysis.NER import extract_entities
topic = 'Quantum Computing' # presumably an alternative query string: "quantumcomputing"AND"research"

# create a pipeline executor
pipeline_executor = PipelineExecutor()
# run the pipeline for the topic with max_articles=999; the returned DataFrame is
# previewed at the end of this cell
# NOTE(review): overwrite=True presumably replaces previously saved results - confirm in PipelineExecutor
quantum_df = pipeline_executor.execute(query=topic, max_articles=999, overwrite=True)

# show the first 5 rows of the dataframe
quantum_df.head()

Getting news article info:   0%|          | 0/484 [00:00<?, ?it/s]encoding error : input conversion failed due to input error, bytes 0x21 0x00 0x00 0x00
encoding error : input conversion failed due to input error, bytes 0x44 0x00 0x00 0x00
I/O error : encoder error
Getting news article info:   3%|▎         | 15/484 [00:02<01:04,  7.26it/s]encoding error : input conversion failed due to input error, bytes 0x21 0x00 0x00 0x00
encoding error : input conversion failed due to input error, bytes 0x44 0x00 0x00 0x00
I/O error : encoder error
Getting news article info:  11%|█▏        | 55/484 [00:07<00:38, 11.27it/s]encoding error : input conversion failed due to input error, bytes 0x21 0x00 0x00 0x00
encoding error : input conversion failed due to input error, bytes 0x44 0x00 0x00 0x00
I/O error : encoder error
Getting news article info:  14%|█▍        | 67/484 [00:10<01:16,  5.48it/s]encoding error : input conversion failed due to input error, bytes 0x21 0x00 0x00 0x00
encoding error : input

Unnamed: 0,article_index,engine,link,source,title,description,body,paragraph
25,1,Google,https://www.nytimes.com/2023/06/14/science/ibm...,The New York Times,"Quantum Computing Advance Begins New Era, IBM ...",Quantum computers today are small in computati...,Quantum computers today are small in computati...,Quantum computers today are small in computati...
26,1,Google,https://www.nytimes.com/2023/06/14/science/ibm...,The New York Times,"Quantum Computing Advance Begins New Era, IBM ...",Quantum computers today are small in computati...,Quantum computers today are small in computati...,But with their intrinsic ability to consider m...
27,1,Google,https://www.nytimes.com/2023/06/14/science/ibm...,The New York Times,"Quantum Computing Advance Begins New Era, IBM ...",Quantum computers today are small in computati...,Quantum computers today are small in computati...,“What IBM showed here is really an amazingly i...
28,1,Google,https://www.nytimes.com/2023/06/14/science/ibm...,The New York Times,"Quantum Computing Advance Begins New Era, IBM ...",Quantum computers today are small in computati...,Quantum computers today are small in computati...,While researchers at Google in 2019 claimed th...
31,1,Google,https://www.nytimes.com/2023/06/14/science/ibm...,The New York Times,"Quantum Computing Advance Begins New Era, IBM ...",Quantum computers today are small in computati...,Quantum computers today are small in computati...,"Present-day computers are called digital, or c..."


In [None]:
import pandas as pd

# Load an article set from a CSV on disk. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv('data/clean/QuantumComputing_20.csv')

In [None]:
# preview the first rows of the loaded CSV
df.head()

In [None]:
from nlp_analysis.word_wizard import WordWizard

# create a word wizard around the loaded DataFrame
word_wizard = WordWizard(df)
# request word embeddings for the 'body' column
# NOTE(review): results are presumably stored on word_wizard.df - confirm in WordWizard
word_wizard.create_word_embeddings(columns=['body'])

In [None]:
# cluster the 'body' embeddings
# NOTE(review): method='silhouette' presumably selects the number of clusters by silhouette score - confirm in WordWizard
word_wizard.cluster_embeddings(column='body', method='silhouette')
# preview the frame held by the wizard after clustering
word_wizard.df.head()

In [None]:
# count how many rows are flagged as medoids vs. not
# NOTE(review): the column name hard-codes "13"; presumably that is the cluster count chosen
# in the previous cell, so a different clustering result would make this lookup fail - confirm
word_wizard.df['body_word_embeddings_cluster_13_is_medoid'].value_counts()

# The Rest

In [None]:
# For each entity, run the pipeline with a query that combines the entity with the
# topic, and store the resulting DataFrame in a dictionary keyed by entity name.
# The entity list is spelled out here so the cell runs on a fresh kernel; these are
# the companies whose results are looked up further down in the notebook.
entities = ['Microsoft', 'Apple', 'IBM']
entity_results = {}
for entity in entities:
    entity_results[entity] = pipeline_executor.execute(query=f"'{entity}' AND '{topic}'", max_articles=100)


In [None]:
# preview the results for one entity (Microsoft)
entity_results['Microsoft'].head()

In [None]:
# TODO summarize the results
import numpy as np
import matplotlib.pyplot as plt
import torch
import pandas as pd

In [None]:
# show the installed PyTorch version
torch.__version__

In [None]:
# Device-agnostic setup: prefer the NVIDIA (CUDA) backend, then Apple's Metal (MPS)
# backend, and fall back to the CPU when neither is available.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

device

In [None]:
# Pull the three per-company result frames out of the results dictionary in one step.
microsoft_df, apple_df, ibm_df = (
    entity_results[company] for company in ('Microsoft', 'Apple', 'IBM')
)

In [None]:
# (rows, columns) of each company's result frame
microsoft_df.shape, apple_df.shape, ibm_df.shape

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm

import nltk

# nltk.tokenize.sent_tokenize needs the Punkt sentence-tokenizer data. NLTK 3.9 and
# later load it from the 'punkt_tab' package, older releases from 'punkt', so fetch
# both to stay compatible with either.
nltk.download('punkt_tab')
nltk.download('punkt')

In [None]:
# Load the pretrained "facebook/bart-large-cnn" model and its matching tokenizer;
# both are used by the summarization helpers defined below.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

# Move the model to the selected device. As the last expression of the cell this
# also displays the model's repr.
model.to(device)

In [None]:
def summarize_bart(
    column: pd.Series, tokenizer: BartTokenizer, model: BartForConditionalGeneration, min_len=50, max_len=100,
    max_chunk_tokens: int = 1022) -> str:
    """Summarize all the text in a column by chunking it and summarizing each chunk.

    The rows of ``column`` are joined into one text, split into sentences, and the
    sentences are greedily packed into chunks of at most ``max_chunk_tokens``
    tokens. Each chunk is summarized independently with ``model.generate`` and the
    per-chunk summaries are joined with single spaces.

    Args:
        column: Text values to summarize; missing values are skipped.
        tokenizer: Tokenizer matching ``model``.
        model: Seq2seq model used to generate the summaries. Inputs are moved to
            the device the model lives on.
        min_len: ``min_length`` passed to ``model.generate`` for each chunk.
        max_len: ``max_length`` passed to ``model.generate`` for each chunk.
        max_chunk_tokens: Token budget per chunk, not counting the special tokens
            the tokenizer adds when encoding. The default of 1022 assumes a
            1024-token input limit with two special tokens (TODO confirm against
            the checkpoint in use if it is not BART).

    Returns:
        The concatenated chunk summaries, or an empty string when the column
        contains no text.
    """
    # Join rows with a space so the end of one row and the start of the next are
    # not fused into a single word/sentence before sentence splitting.
    long_text = column.dropna().astype(str).str.cat(sep=" ")

    sentences = nltk.tokenize.sent_tokenize(long_text)

    # Greedily pack sentences into chunks that stay within the token budget.
    chunks = []
    current_sentences = []
    current_length = 0
    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))

        # Close the running chunk when this sentence would overflow it. The
        # non-empty check avoids emitting an empty chunk when the very first
        # sentence is itself longer than the budget.
        if current_sentences and current_length + sentence_length > max_chunk_tokens:
            chunks.append(" ".join(current_sentences))
            current_sentences = []
            current_length = 0

        current_sentences.append(sentence)
        current_length += sentence_length

    # Flush whatever is left, including a chunk started by an overflowing final
    # sentence.
    if current_sentences:
        chunks.append(" ".join(current_sentences))

    # Hard cap on the encoded length: the budget plus the special tokens added at
    # encoding time. Truncation guards against a single over-long sentence.
    max_input_length = max_chunk_tokens + tokenizer.num_special_tokens_to_add()
    target_device = model.device

    summaries = []
    for chunk in tqdm(chunks):
        encoded = tokenizer(
            chunk, return_tensors="pt", truncation=True, max_length=max_input_length
        ).to(target_device)
        generated = model.generate(**encoded, num_beams=2, min_length=min_len, max_length=max_len)
        # generate returns one sequence per input row; there is exactly one row here
        summaries.append(tokenizer.decode(generated[0], skip_special_tokens=True))

    return " ".join(summaries)


In [None]:
# Generate summaries for Microsoft, Apple and IBM based on the news article titles
microsoft_titles_summary = summarize_bart(microsoft_df["title"], tokenizer, model)
apple_titles_summary = summarize_bart(apple_df["title"], tokenizer, model)
ibm_titles_summary = summarize_bart(ibm_df["title"], tokenizer, model)

In [None]:
# display the title-based summary for Microsoft
microsoft_titles_summary

In [None]:
# display the title-based summary for Apple
apple_titles_summary

In [None]:
# display the title-based summary for IBM
ibm_titles_summary

In [None]:
# Generate summaries for Microsoft, Apple and IBM based on the news article bodies
microsoft_bodies_summary = summarize_bart(microsoft_df["body"], tokenizer, model)
apple_bodies_summary = summarize_bart(apple_df["body"], tokenizer, model)
ibm_bodies_summary = summarize_bart(ibm_df["body"], tokenizer, model)

In [None]:
# Summarize the long summary
def sub1024_summ(text: str, tokenizer: BartTokenizer, model: BartForConditionalGeneration, min_len=50, max_len=100) -> str:
    """Summarize a single text in one pass, truncating it to 1024 tokens.

    Args:
        text: Text to summarize; anything beyond the first 1024 tokens is dropped.
        tokenizer: Tokenizer matching ``model``.
        model: Seq2seq model used to generate the summary. Inputs are moved to the
            device the model lives on.
        min_len: ``min_length`` passed to ``model.generate``.
        max_len: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded summary string.
    """
    # Request truncation explicitly instead of relying on the tokenizer's fallback
    # behaviour when only max_length is given.
    inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors="pt").to(model.device)
    # Pass the full encoding (input ids and attention mask) to generate.
    summary_ids = model.generate(**inputs, num_beams=2, min_length=min_len, max_length=max_len)
    return tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]


In [None]:
# second pass: condense the Microsoft body summary, with the generated length bounded by min_len=100 / max_len=300
sub1024_summ(microsoft_bodies_summary, tokenizer, model, min_len=100, max_len=300)

In [None]:
# second pass: condense the Apple body summary, with the generated length bounded by min_len=100 / max_len=300
sub1024_summ(apple_bodies_summary, tokenizer, model, min_len=100, max_len=300)

In [None]:
# second pass: condense the IBM body summary, with the generated length bounded by min_len=100 / max_len=300
sub1024_summ(ibm_bodies_summary, tokenizer, model, min_len=100, max_len=300)

In [None]:
# Honestly, these are not the best summaries, I think.