In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['TRANSFORMERS_CACHE'] = "c:\\huggingface\\.cache\\"

import requests
import spacy
from googlesearch import search
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
from transformers import pipeline

In [2]:
### Helper Functions

# Open Url and parse with bs4
def parse_url(url):
    html = requests.get(url)
    soup = BeautifulSoup(html.content, 'html.parser')
    return soup

# Extract all paragraph elements from bs4 parsed soup
def extract_paragraphs(parsed_soup):
    text = ""
    for para in parsed_soup:
        text += para.get_text()
    return text

# Extracts alls paragraph data from a url
def extract_data_from_link(url):
    soup = parse_url(url)
    text = extract_paragraphs(soup('p'))
    return text

In [3]:
# This is model 1

def read_data(data):
    article = data.split(". ")
    sentences = []
    
    for sentence in article:
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop() 
    
    return sentences

def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)
 
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix


def generate_summary(word_data, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text anc split it
    sentences =  read_data(word_data)
    #print(sentences)

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
    #print("Indexes of top ranked_sentence order are ", ranked_sentence)    

    for i in range(top_n):
      summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Offcourse, output the summarize texr
    #print("Summarize Text:\n", ". ".join(summarize_text))
    return summarize_text

# let's begin
#generate_summary(text_repo[1], 2)

In [4]:
## Setting to use the 0th GPU
os.environ["CUDA_VISIBLE_DEVICES"] = ""

## Setting to use the bart-large-cnn model for summarization
summarizer = pipeline("summarization")
#summarizer = pipeline("summarization", model="t5-3b", tokenizer="t5-3b")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)
[WinError 3] The system cannot find the path specified: 'e:\\'


OSError: Can't load config for 'sshleifer/distilbart-cnn-12-6'. Make sure that:

- 'sshleifer/distilbart-cnn-12-6' is a correct model identifier listed on 'https://huggingface.co/models'
  (make sure 'sshleifer/distilbart-cnn-12-6' is not a path to a local directory with something else, in that case)

- or 'sshleifer/distilbart-cnn-12-6' is the correct path to a directory containing a config.json file



In [None]:
# Search Query
query = "$RIG stock news"

# Save query results
results = []
for j in search(query, tld="com", num=10, stop=10, pause=1, tbs="qdr:w"):
    results.append(j)
print(results)

['https://finance.yahoo.com/quote/RIG/news/', 'https://finance.yahoo.com/quote/RIG/community/', 'https://www.marketbeat.com/stocks/NYSE/RIG/news/', 'https://www.marketbeat.com/stocks/NYSE/RIG/', 'https://stockinvest.us/stock/RIG', 'https://www.marketwatch.com/investing/stock/rig/analystestimates', 'https://www.marketwatch.com/investing/stock/rig/charts', 'https://www.nasdaq.com/market-activity/stocks/rig/insider-activity', 'https://investorshub.advfn.com/Transocean-Ltd-RIG-6633/', 'https://www.barchart.com/stocks/quotes/RIG/technical-analysis']


In [None]:
text_repo = []
one_text = """"""
for url in results:
    paragraph_data = extract_data_from_link(url)
    text_repo.append(paragraph_data)
    one_text += paragraph_data
#print(one_text)

In [None]:
extractive_summary = generate_summary(one_text, 5)
print(extractive_summary)

In [None]:
summary_text = summarizer(extractive_summary, max_length=1000, min_length=50)[0]['summary_text']
#summary_text = summarizer(extractive_summary, max_length=1000, min_length=5, truncation=True)[0]['summary_text']
print(summary_text)

query:  "Ukraine Russia United States"

Result : "President Putin says Nato is making "dangerous attempts to take over Ukrainian territory" President Biden did not make any guarantees to limit Nato expansion, the Kremlin says . Ukraine's foreign minister says talks enabled "deterrence and de-escalation"

Query: covid omicron

Result
It is not yet clear whether infection with Omicron causes more severe disease compared to infections with other variants, including Delta . The number of people testing positive has risen in areas of South Africa affected by this variant .

Query: "Latest cyber crime trends"

Result:
The accessibility of malware for malicious parties is a growing concern . We may see more disease or health-related phishing or malware attacks in future using scaremongering tactics .

Query: "APT 28 Russia"

Result:
Fancy Bear is thought to be responsible for cyber attacks on the German parliament, the Norwegian parliament, French television station TV5Monde and the White House . The group promotes the political interests of the Russian government, and is known for hacking Democratic National Committee emails .

Query: "APT 28 TTPs"

Result
Fancy Bear is thought to be responsible for cyber attacks on the German parliament, the Norwegian parliament, French television station TV5Monde and the White House . The group promotes the political interests of the Russian government, and is known for hacking Democratic National Committee emails .

Query: "amazon warehouse"

Result: First responders were called to the warehouse after reports of people trapped inside . Up to 100 people were believed to be working the night shift at the time of the collapse . Amazon is assessing the damage at its facility and said the safety and well-being of its employees and partners is its top priority .

query = "news in san antonio"

Result: Nearly 100 Texas A&M San Antonio students have been living in hotel rooms this semester because of a shortage of dorms on campus . A psychiatrist offers tips to maintain the mental health of parents who have missing children . City officials say they can collaborate with CPS Energy to help each other better protect against cyberattacks .

query: "log4j2 vulnerability"

Result: CERT New Zealand warns that it's already being exploited in the wild . Java logging library, Apache Log4j between versions 2.0 and 2.14.1 are all affected .

query: "log4j2 exploit location"

Result: Log4j2 is an open-source, Java-based logging framework commonly incorporated into Apache web servers . According to public sources, Chen Zhaojun of Alibaba officially reported a Log4J2 remote code execution (RCE) vulnerability to Apache on Nov. 1 .
 
query: "how to patch log4j2"

Result: The Apache Software Foundation has released an emergency security update to fix a zero-day vulnerability in Log4j . The vulnerability, also nicknamed Log4Shell, can be exploited by forcing Java-based apps and servers to log a specific string into their internal systems . This string can force the vulnerable system to download and run a malicious script from an attacker-controlled domain .

query: "tornados US"

Result:  Five states, Arkansas, Illinois, Kentucky, Missouri and Tennessee were hit by tornadoes on Friday night . The Edwardsville Police Department in Illinois said the storms had resulted in “catastrophic damage to a significant portion” of an Amazon warehouse .

In [None]:
question_answerer = pipeline('question-answering')
question = "What countries are sending troops to Ukraine?"

answer = question_answerer({
    'question':question,
    'context': one_text
})

No model was supplied, defaulted to distilbert-base-cased-distilled-squad (https://huggingface.co/distilbert-base-cased-distilled-squad)


In [None]:
print(answer['answer'])

Ukraine, Poland, and Lithuania


In [None]:
answer

{'score': 0.9599509239196777,
 'start': 55443,
 'end': 55473,
 'answer': 'Ukraine, Poland, and Lithuania'}