In [1]:
# create a pipeline from the etl_pipeline package
from pipeline_executor import PipelineExecutor

topic = 'quantum computing'

# create a pipeline executor
pipeline_executor = PipelineExecutor()
quantum_df = pipeline_executor.execute(query=topic, max_articles=2)

# print the first 5 rows of the dataframe
quantum_df.head()

Unnamed: 0,engine,link,se_description,se_source,n3k_author,n3k_published,title,body
0,Bing,https://www.popularmechanics.com/science/a4387...,"In fact, this is part of a whole movement call...",Popular Mechanics,[],,Quantum Computers Could Be a ‘Superhighway’ to...,"Comedian John Mulaney once said, “I don’t know..."
1,Bing,https://thenextweb.com/news/quantum-computing-...,A promising quantum computing ecosystem is eme...,The Next Web,[],,Europe’s precarious path to quantum computing ...,This article features an interview with Joe Fi...
2,Yahoo,https://www.benzinga.com/pressreleases/23/05/n...,Quantum Computing Inc. initiates commercializa...,Benzinga,[],,Quantum Computing Inc Announces First Quarter ...,Quantum Computing Inc. initiates commercializa...
3,Yahoo,https://seekingalpha.com/article/4603046-micro...,Quantum computing is a disruptive technology t...,Seeking Alpha,['Aseity Research'],2023-05-11 11:03:43-04:00,Microsoft Stock: Leading The Quantum Computing...,Bartlomiej Wroblewski\n\nQuantum computing is ...
4,Google,https://www.nature.com/articles/d41586-023-015...,,Nature,"['Castelvecchi', 'Davide Castelvecchi', 'You C...",,Physicists create long-sought topological quan...,Borromean rings depicted in a church in Floren...


In [2]:
# TODO extract entities from the dataframe descritpion, title, and body columns
# entities = quantum_df['description'].apply(lambda x: extract_entities(x))

entities = ['Microsoft', 'Apple', 'IBM']

In [3]:
# for each entity run the pipeline (adding "quantum" to the query) and store the results in a dictionary
entity_results = {}
for entity in entities:
    entity_results[entity] = pipeline_executor.execute(query=f"‘{entity}‘ AND ‘{topic}'", max_articles=100)


In [4]:
# check the results for the first entity
entity_results['Microsoft'].head()

Unnamed: 0,engine,link,se_description,se_source,n3k_author,n3k_published,title,body
0,Yahoo,https://seekingalpha.com/article/4603046-micro...,By leveraging the properties of quantum mechan...,Seeking Alpha,['Aseity Research'],2023-05-11 11:03:43-04:00,Microsoft Stock: Leading The Quantum Computing...,Bartlomiej Wroblewski\n\nQuantum computing is ...
1,Yahoo,https://www.benzinga.com/pressreleases/23/05/n...,Quantum Computing Inc. initiates commercializa...,Benzinga,[],,Quantum Computing Inc Announces First Quarter ...,Quantum Computing Inc. initiates commercializa...
2,Bing,https://phys.org/news/2023-05-google-quantum-a...,Our intuition tells us that it should be impos...,Phys.org,['Google Quantum Ai'],,Google Quantum AI braids non-Abelian anyons fo...,This article has been reviewed according to Sc...
3,Yahoo,https://www.techtarget.com/searchstorage/tip/A...,"In 2017, Microsoft surprised attendees of its ...",SearchSecurity.com,['Published'],,A primer on quantum computing storage and memo...,"In 2017, Microsoft surprised attendees of its ..."
4,Yahoo,https://www.tomshardware.com/news/quantinuum-i...,"Quantinuum, the trapped-ion specialist quantum...",Tom s Hardware,"['Francisco Pires', 'Freelance News Writer', '...",2023-05-10 19:52:20+00:00,Quantinuum Injects Topology Into Ion-Chain Qua...,"Quantinuum, the trapped-ion specialist quantum..."


In [5]:
# TODO summarize the results
import numpy as np
import matplotlib.pyplot as plt
import torch
import pandas as pd

In [6]:
torch.__version__

'2.0.0'

In [7]:
# Setup device agnostic code (Chooses NVIDIA or Metal backend if available, otherwise defaults to CPU)
if torch.cuda.is_available():
    device = torch.device("cuda")
    
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    
else:
    device = torch.device("cpu")

device

device(type='mps')

In [8]:
microsoft_df = entity_results['Microsoft']
apple_df = entity_results['Apple']
ibm_df = entity_results['IBM']

In [9]:
microsoft_df.shape, apple_df.shape, ibm_df.shape

((182, 8), (149, 8), (166, 8))

In [34]:
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
nltk.download('punkt')
from tqdm import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/johnbergmann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
model.to(device)
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

In [35]:
def summarize_bart(
    column: pd.Series, tokenizer: BartTokenizer, model: BartForConditionalGeneration, min_len=50, max_len=100) -> str:
    
    long_text = column.str.cat()

    sentences = nltk.tokenize.sent_tokenize(long_text)

    # initialize
    length = 0
    chunk = ""
    chunks = []
    count = -1
    for sentence in sentences:
        count += 1
        combined_length = len(tokenizer.tokenize(sentence)) + length # add the no. of sentence tokens to the length counter

        if combined_length  <= 1024: # if it doesn't exceed  -tokenizer.max_len_single_sentence-
            chunk += sentence + " " # add the sentence to the chunk
            length = combined_length # update the length counter

            # if it is the last sentence
            if count == len(sentences) - 1:
                chunks.append(chunk) # save the chunk
            
        else: 
            chunks.append(chunk) # save the chunk
            # reset 
            length = 0 
            chunk = ""

            # take care of the overflow sentence
            chunk += sentence + " "
            length = len(tokenizer.tokenize(sentence))

    # inputs
    inputs = [tokenizer(chunk, return_tensors="pt").to(device) for chunk in chunks]

    # print summary
    outputs = []
    for input in tqdm(inputs):
        output = model.generate(**input, num_beams=2, min_length=min_len, max_length=max_len)
        outputs.append(tokenizer.decode(*output, skip_special_tokens=True))

    return " ".join(outputs)


In [27]:
# Generate summaries for Microsoft and Apple based on News Titles
microsoft_titles_summary = summarize_bart(microsoft_df["title"], tokenizer, model)
apple_titles_summary = summarize_bart(apple_df["title"], tokenizer, model)
ibm_titles_summary = summarize_bart(ibm_df["title"], tokenizer, model)

In [28]:
microsoft_titles_summary

"Microsoft signs power purchase deal with nuclear fusion company Helion. Microsoft just made a huge, far-from-certain bet on nuclear fusion. OpenAI is the No. 1 company on the 2023 CNBC Disruptor 50 list. Only Humility Can Save Us From AI. 7 Legacy Tech Stocks Primed for a Comeback. Google Chrome Brings Better AI Brains to the Web. Google's Bard Chatbot Opens to the Public. IBM, D-Wave Systems, Microsoft, Trinity, IBM and more team up to boost quantum tech community in Ireland. Microsoft is harnessing the power of the cloud to make the promise of quantum at scale a reality. Microsoft and Israeli startup Classiq collaborate to bring advanced quantum software to researchers and academic institutions. IBM, Microsoft, and Google Race to Close Quantum Skills Gap - US innovation agency Darpa turns to Microsoft for quantum computing project."

In [29]:
apple_titles_summary

"Apple to develop future Mac chips in Israel. Apple's M1 Ultra shows the Future of Computer Chips. Apple shows interest in RISC-V chips, a competitor to iPhones' Arm tech. Apple joins Amazon, Google, and Microsoft in tech industry layoffs. Apple's A15 Bionic chip powers iPhone 13 with 15 billion transistors. Microsoft researchers solve two 20-year-old problems in quantum computing. IBM Q announces a host of new tools geared towards making quantum computing more accessible. Sony launches 43-inch and 50-inch Bravia X70L TVs in India, price starts at Rs 59,900."

In [30]:
ibm_titles_summary

'IBM advances its quantum roadmap as competition heats up. IBM unveils end-to-end, quantum-safe tools to secure business, government  data. 10 companies building quantum computers. White House: President Biden to visit Westchester to talk debt limit and impacts.'

In [36]:
# Generate summaries for Microsoft and Apple based on News article bodies
microsoft_bodies_summary = summarize_bart(microsoft_df["body"], tokenizer, model)
apple_bodies_summary = summarize_bart(apple_df["body"], tokenizer, model)
ibm_bodies_summary = summarize_bart(ibm_df["body"], tokenizer, model)

100%|██████████| 275/275 [13:00<00:00,  2.84s/it]
100%|██████████| 169/169 [07:32<00:00,  2.68s/it]
100%|██████████| 207/207 [09:43<00:00,  2.82s/it]


In [58]:
# Summarize the long summary
def sub1024_summ(text: str, tokenizer: BartTokenizer, model: BartForConditionalGeneration, min_len=50, max_len=100) -> str:
    inputs = tokenizer([text], max_length=1024, return_tensors="pt").to(device)
    summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_len, max_length=max_len)
    return tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]


In [65]:
sub1024_summ(microsoft_bodies_summary, tokenizer, model, min_len=100, max_len=300)

'Quantum computing is a disruptive technology that has the potential to revolutionize industries ranging from finance to healthcare. Microsoft stands out as a strong contender due to their innovative approach to quantum computing. IBM and Google are exploring superconducting qubits, while other companies like IonQ and Honeywell are experimenting with trapped ions. Microsoft is also pursuing topological qubits in its quantum computing work, a different way of going about quantum systems. QCI is engaged in a number of beta-tests to demonstrate the effectiveness of its computing capabilities.'

In [66]:
sub1024_summ(apple_bodies_summary, tokenizer, model, min_len=100, max_len=300)

"IonQ Announces Pat Tang as New VP of Research & Development (VP R&D) Tang joins IonQ with over 23 years of technology experience, most recently as an Engineering Vice President at Amazon Lab126. Tang has a diverse background that includes multi-touch sensor architecture, RF design and reliability engineering. Apple is opening a new development base in Israel to develop new Apple silicon chips for future Macs. Apple's new M2 Pro and M2 Max processors are 20% faster than their predecessors."

In [67]:
sub1024_summ(ibm_bodies_summary, tokenizer, model, min_len=100, max_len=300)

'Quantum computing is a disruptive technology that has the potential to revolutionize industries ranging from finance to healthcare. Google, IBM and other tech giants are developing quantum computers using superconductor qubits. Microsoft is also pursuing topological qubits in its quantum computing work. IBM has established an ecosystem of over 450,000 users with access to more than 20 quantum computers via the cloud. The more qubits that are quantum-mechanically linked together via entanglement, the more computations it can perform.'

In [None]:
# ngl... not the best summaries I think