In [101]:
from bs4 import BeautifulSoup
import requests

# Reuters technology-news archive: one URL per results page (pages 1-5).
# Only the `page` query parameter differs between them, so they are generated
# from a single template instead of five hand-copied literals.
ARCHIVE_URL_TEMPLATE = (
    'https://www.reuters.com/news/archive/technologynews'
    '?view=page&page={page}&pageSize=10'
)
# The individual names are kept because the scraping cell refers to url1..url5.
url1, url2, url3, url4, url5 = (
    ARCHIVE_URL_TEMPLATE.format(page=page) for page in range(1, 6)
)

def web_scraping_news(url, timeout=10):
    """Download one page and return its headline ``<h3>`` tags.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout ``requests.get`` may block indefinitely.

    Returns
    -------
    list
        The ``h3`` elements whose class matches ``story-title``, in the
        order ``find_all`` yields them; empty when the page has none.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status. Previously such a
        response was parsed like a normal page and silently produced an
        empty list.
    requests.RequestException
        For connection problems and timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    headline_tags = soup.find_all('h3', attrs={'class': "story-title"})

    # Return a plain list so callers can concatenate pages with `+`.
    return list(headline_tags)

In [102]:
# Scrape every archive page. The per-page names are kept in case other cells
# use them; `news` is the concatenation of all pages in page order.
news1, news2, news3, news4, news5 = (
    web_scraping_news(page_url) for page_url in (url1, url2, url3, url4, url5)
)
news = news1 + news2 + news3 + news4 + news5
# Raw HTML of each headline tag, kept for inspection.
titles = [str(i) for i in news]

#Extract News Headline
# Read the text straight from each tag. The previous approach called
# str.strip() with the opening-tag markup, but strip() treats its argument as
# a *set of characters*, so any headline beginning with one of those
# characters (e.g. "eBay ...", "iPhone ...", "3M ...", or a leading quote)
# lost its first letters. get_text() also returns decoded text rather than
# HTML-escaped entities.
headlines = [tag.get_text(strip=True) for tag in news]

for headline in headlines:
    print(headline)


Elon Musk's banter with Robinhood CEO triggers stampede for Clubhouse app
Sony raises outlook amid home entertainment boom, but struggles to build more PS5s
Ant Group reaches deal with China regulators on restructuring, Bloomberg News reports
Regulators to meet as brokers call time on Reddit-trader rollercoaster
GameStop, AMC tumble as retail trading mania cools
SpaceX Starship prototype rocket explodes on landing after test launch
Sweden's Embracer expands reach with $2.5 billion game buying spree
Exclusive: Suspected Chinese hackers used SolarWinds bug to spy on U.S. payroll agency – sources
Indian trade secretary stands by digital tax opposed by U.S.
Bezos to give Amazon reins to cloud boss Jassy as sales rocket past $100 billion
Biden moves to reverse Trump immigration policies, too slowly for some
House Republican lawmakers seek to remove Omar from committee assignments - Fox News
Trump lawyers challenge legitimacy of post-presidency impeachment trial
China's Lenovo posts record p

In [103]:
import numpy as np
import pandas as pd
import re
import time
from datasketch import MinHash, MinHashLSHForest

In [113]:
def preprocess(text):
    """Turn a piece of text into a list of lowercase tokens.

    Every character that is neither a word character nor whitespace is
    removed, the remainder is lower-cased, and the result is split on runs
    of whitespace. An empty or punctuation-only string yields ``[]``.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower().split()

def get_forest(data, perms):
    """Build an indexed MinHash LSH Forest from the ``title`` column of ``data``.

    Each title is tokenised with ``preprocess``, hashed into a ``MinHash``
    with ``perms`` permutations, and stored in the forest under its
    zero-based position in the column. The elapsed build time is printed.

    Parameters
    ----------
    data : mapping/frame with a ``'title'`` entry that iterates over strings
    perms : int
        Number of permutations for every MinHash and for the forest.

    Returns
    -------
    MinHashLSHForest
        The forest after ``index()`` has been called on it.
    """
    started = time.time()

    forest = MinHashLSHForest(num_perm=perms)

    for position, headline in enumerate(data['title']):
        signature = MinHash(num_perm=perms)
        for token in preprocess(headline):
            signature.update(token.encode('utf8'))
        # The key is the row position, not the frame's index label.
        forest.add(position, signature)

    # Finalise the index once, after every key has been added.
    forest.index()

    print('It took %s seconds to build forest.' %(time.time()-started))

    return forest

def predict(text, data, perms, num_results, forest):
    """Look up titles in ``data`` that the forest reports for ``text``.

    ``text`` is tokenised with ``preprocess`` and hashed into a ``MinHash``
    with ``perms`` permutations, which is passed to ``forest.query`` together
    with ``num_results``. The returned keys are used as positional row
    indices into ``data``.

    Parameters
    ----------
    text : str
        Query string.
    data : frame supporting ``.iloc[...]['title']``
    perms : int
        Number of permutations for the query MinHash.
    num_results : int
        Forwarded to ``forest.query`` as the result count.
    forest : object exposing ``query(minhash, k)``

    Returns
    -------
    The ``'title'`` values of the matched rows, or ``None`` when the forest
    returns no keys (in which case no timing line is printed).
    """
    started = time.time()

    query_signature = MinHash(num_perm=perms)
    for token in preprocess(text):
        query_signature.update(token.encode('utf8'))

    matched_keys = forest.query(query_signature, num_results)
    if len(matched_keys) == 0:
        # Nothing came back from the forest: signal that with None.
        return None

    row_positions = np.array(matched_keys)
    result = data.iloc[row_positions]['title']

    print('It took %s seconds to query forest.' %(time.time()-started))

    return result

In [114]:
# Wrap the scraped headlines in a one-column DataFrame; get_forest and
# predict both read the 'title' column.
data = pd.DataFrame(headlines,columns= ['title'])
# Preview the first rows to check the headlines were loaded.
data.head()

Unnamed: 0,title
0,Elon Musk's banter with Robinhood CEO triggers...
1,Sony raises outlook amid home entertainment bo...
2,Ant Group reaches deal with China regulators o...
3,Regulators to meet as brokers call time on Red...
4,"GameStop, AMC tumble as retail trading mania c..."


In [115]:
# Build the LSH Forest over every headline, using 100 MinHash permutations.
forest = get_forest(data, 100)

It took 0.06586408615112305 seconds to build forest.


In [117]:
# Query the forest with a sample title, asking for 10 results. The
# permutation count (100) is the same value passed to get_forest.
# predict returns None when the forest yields no keys, in which case the
# print below shows "None".
# NOTE(review): the recorded matches share almost no vocabulary with the
# query; presumably forest.query returns approximate nearest keys with no
# similarity threshold. Confirm against the datasketch documentation.
title = 'Using a neural net to instantiate a deformable model'
result = predict(title, data, 100, 10, forest)
print('\n Top Recommendation(s) is(are) \n', result)

It took 0.0029921531677246094 seconds to query forest.

 Top Recommendation(s) is(are) 
 37    Despite 'productive' Republican meeting, Biden...
7     Exclusive: Suspected Chinese hackers used Sola...
10    Biden moves to reverse Trump immigration polic...
15    Explainer: How a GameStop share pullback could...
49    House Democrat urges Biden to declare white su...
61    U.S. Congress to hold hearings on GameStop tra...
Name: title, dtype: object
