In [3]:
import numpy as np
import pandas as pd
import re
#import time
from datasketch import MinHash, MinHashLSHForest

In [4]:
def preprocess(text):
    text = re.sub(r'[^\w\s]','',text)
    tokens = text.lower()
    tokens = tokens.split()
    return tokens

In [5]:
def get_forest(data, perms):
    #start_time = time.time()
    
    minhash = []
    
    for text in data['text']:
        tokens = preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
        
    forest.index()
    
    #print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest

In [6]:
def predict(text, database, perms, num_results, forest):
    #start_time = time.time()
    
    tokens = preprocess(text)
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None # if your query is empty, return none
    
    result = database.iloc[idx_array]
    
    #print('It took %s seconds to query forest.' %(time.time()-start_time))
    
    return result

In [7]:
db = pd.read_csv('Final_Reformatted_Clean_Corpus.csv')
permutations = 128
# db=corpus
db['text'] = db['Headline'] + ' ' + db['Entire_News']
forest = get_forest(db, permutations)

In [8]:
pd.set_option('display.max_colwidth', -1) 

  pd.set_option('display.max_colwidth', -1)


In [9]:
num_recommendations = 10
title = db['Headline'][20225]
result = predict(title, db, permutations, num_recommendations, forest)
display('Querry Headline', db['Headline'][20225])
display('Top Recommendation(s) is(are)', result)

'Querry Headline'

"Jio To Launch A Low-Cost Laptop Called The 'JioBook' With 4G LTE Connectivity"

'Top Recommendation(s) is(are)'

Unnamed: 0,Datetime,Category,Subcategory,Headline,Summary,Entire_News,Author,News_Link,text
7201,2017-06-12 21:26:00,Technology,games,The Evil Within 2 announcement trailer,"Resident Evil creator Shinji Mikami returns with the follow-up to his latest horror IP The Evil Within, which was revealed by publisher Bethesda during its E3 2017 press conference.","Resident Evil creator Shinji Mikami returns with the follow-up to his latest horror IP The Evil Within, which was revealed by publisher Bethesda during its E3 2017 press conference.",admin,https://www.ibtimes.co.in/the-evil-within-2-announcement-trailer-730516,"The Evil Within 2 announcement trailer Resident Evil creator Shinji Mikami returns with the follow-up to his latest horror IP The Evil Within, which was revealed by publisher Bethesda during its E3 2017 press conference."
7169,2017-07-22 14:30:00,Technology,games,"Video game news round-up: Doomfist release, Telltale Games and a legendary Pokemon Go announcement","Your pick of the biggest gaming stories from the past seven days, this week including Overwatch, Telltale Games and Pokemon Go.","Your pick of the biggest gaming stories from the past seven days, this week including Overwatch, Telltale Games and Pokemon Go.",admin,https://www.ibtimes.co.in/video-game-news-round-up-doomfist-release-telltale-games-and-a-legendary-pokemon-go-announcement-735643,"Video game news round-up: Doomfist release, Telltale Games and a legendary Pokemon Go announcement Your pick of the biggest gaming stories from the past seven days, this week including Overwatch, Telltale Games and Pokemon Go."
15916,2021-03-09 18:49:00,World,Uk-news,Karachi Activists Have Mixed Feelings On Meghan Intv,"There were mixed reactions in Pakistan following a television interview with the UK's Prince Harry and Meghan Markle, in which the couple said they encountered racist attitudes and a lack of support that drove the Duchess to thoughts of suicide.","There were mixed reactions in Pakistan following a television interview with the UK's Prince Harry and Meghan Markle, in which the couple said they encountered racist attitudes and a lack of support that drove the Duchess to thoughts of suicide.Some women interviewed in Karachi who had gathered to march on International Women's day said not much would change following the explosive interview, while others said the reputation of Britain's Royal Family could be affected.In the two hour interview aired in the US on Sunday night, the couple painted a deeply unflattering picture of the life inside the royal household. (Disclaimer: This story has not been edited by www.republicworld.com and is auto-generated from a syndicated feed.)",Associated press television news,https://www.republicworld.com/world-news/uk-news/karachi-activists-have-mixed-feelings-on-meghan-intv.html,"Karachi Activists Have Mixed Feelings On Meghan Intv There were mixed reactions in Pakistan following a television interview with the UK's Prince Harry and Meghan Markle, in which the couple said they encountered racist attitudes and a lack of support that drove the Duchess to thoughts of suicide.Some women interviewed in Karachi who had gathered to march on International Women's day said not much would change following the explosive interview, while others said the reputation of Britain's Royal Family could be affected.In the two hour interview aired in the US on Sunday night, the couple painted a deeply unflattering picture of the life inside the royal household. (Disclaimer: This story has not been edited by www.republicworld.com and is auto-generated from a syndicated feed.)"
6771,2017-06-09 15:51:00,Technology,internet,Who are the DUP?,"The U.K. election has resulted in a hung parliament, and Conservatives are understood to be in talks with the Democratic Unionist Party. The DUP is the largest unionist political party in Northern Ireland and was founded in 1971, with the main political aim of keeping Northern Ireland part of the U.K.","The U.K. election has resulted in a hung parliament, and Conservatives are understood to be in talks with the Democratic Unionist Party. The DUP is the largest unionist political party in Northern Ireland and was founded in 1971, with the main political aim of keeping Northern Ireland part of the U.K.",admin,https://www.ibtimes.co.in/who-are-the-dup-730164,"Who are the DUP? The U.K. election has resulted in a hung parliament, and Conservatives are understood to be in talks with the Democratic Unionist Party. The DUP is the largest unionist political party in Northern Ireland and was founded in 1971, with the main political aim of keeping Northern Ireland part of the U.K."
13140,2016-08-17 16:30:00,Education,Grammar-and-vocabulary,Prefixing with 'very': 20 words to replace 'very',To add very or not to: 20 words that can do without the prefix 'very',"We all love to prefix 'very' to emphasise something; for instance, we say 'I am very happy' instead of 'I am jubilant'.Here' s a list of 20 such words where you can use a specific word instead of adding 'very'.1.2.3.4.5.6.7.8.9.10.11.12.13.14.15.16.17.18.19.20.Now, this was a very exhaustive list!Read:10 Basic email etiquettes you should know",India today web desk,https://www.indiatoday.in/education-today/grammar-vocabulary/story/prefixing-very-322498-2016-05-09,"Prefixing with 'very': 20 words to replace 'very' We all love to prefix 'very' to emphasise something; for instance, we say 'I am very happy' instead of 'I am jubilant'.Here' s a list of 20 such words where you can use a specific word instead of adding 'very'.1.2.3.4.5.6.7.8.9.10.11.12.13.14.15.16.17.18.19.20.Now, this was a very exhaustive list!Read:10 Basic email etiquettes you should know"
19061,2021-02-28 16:04:00,Sports,Tennis-news,Popyrin Of Australia Wins First ATP Title At Singapore Open,Alexei Popyrin claimed his first-ever ATP Tour title after come-from-behind win over Alexander Bublik at Singapore Tennis Open.,Alexei Popyrin of Australia claimed his first-ever ATP Tour title after come-from-behind win over Alexander Bublik at Singapore Tennis Open.(Disclaimer: This story has not been edited by www.republicworld.com and is auto-generated from a syndicated feed.),Associated press television news,https://www.republicworld.com/sports-news/tennis-news/popyrin-of-australia-wins-first-atp-title-at-singapore-open.html,Popyrin Of Australia Wins First ATP Title At Singapore Open Alexei Popyrin of Australia claimed his first-ever ATP Tour title after come-from-behind win over Alexander Bublik at Singapore Tennis Open.(Disclaimer: This story has not been edited by www.republicworld.com and is auto-generated from a syndicated feed.)
14873,2021-03-12 17:59:00,India,Economy,Industrial Production Contracts 1.6 Pc In January,"India's industrial production contracted by 1.6 per cent in January, official data showed on Friday.","India's industrial production contracted by 1.6 per cent in January, official data showed on Friday.\n\nAccording to the Index of Industrial Production (IIP) data, the manufacturing sector output contracted by 2 per cent in January 2021.\n\nMining output declined 3.7 per cent, while power generation grew 5.5 per cent in January.\n\nThe IIP had grown by 2.2 per cent in January 2020.(Disclaimer: This story has not been edited by www.republicworld.com and is auto-generated from a syndicated feed.)",Press trust of india,https://www.republicworld.com/india-news/economy/industrial-production-contracts-1-dot-6-pc-in-january.html,"Industrial Production Contracts 1.6 Pc In January India's industrial production contracted by 1.6 per cent in January, official data showed on Friday.\n\nAccording to the Index of Industrial Production (IIP) data, the manufacturing sector output contracted by 2 per cent in January 2021.\n\nMining output declined 3.7 per cent, while power generation grew 5.5 per cent in January.\n\nThe IIP had grown by 2.2 per cent in January 2020.(Disclaimer: This story has not been edited by www.republicworld.com and is auto-generated from a syndicated feed.)"
6843,2017-02-22 18:04:00,Technology,internet,Amazon Go: Tech giant unveils checkout-free stores,"How would you like to just walk into a shop, grab what you wanted and go? Well thats just what Amazons new technology lets you do. Amazon Go is a new kind of store featuring the world’s most advanced shopping technology. No lines, no checkout.","How would you like to just walk into a shop, grab what you wanted and go? Well thats just what Amazons new technology lets you do. Amazon Go is a new kind of store featuring the world’s most advanced shopping technology. No lines, no checkout.",admin,https://www.ibtimes.co.in/amazon-go-tech-giant-unveils-checkout-free-stores-717035,"Amazon Go: Tech giant unveils checkout-free stores How would you like to just walk into a shop, grab what you wanted and go? Well thats just what Amazons new technology lets you do. Amazon Go is a new kind of store featuring the world’s most advanced shopping technology. No lines, no checkout."
6812,2017-04-13 13:12:00,Technology,internet,What is a VPN and how do they work?,"Tech acronyms abound these days, and they don and #39;t always have a clear definition. Hopefully this will help sort out what a VPN (virtual private network) is and why theyre useful.","Tech acronyms abound these days, and they don and #39;t always have a clear definition. Hopefully this will help sort out what a VPN (virtual private network) is and why theyre useful.",admin,https://www.ibtimes.co.in/what-is-a-vpn-and-how-do-they-work-723028,"What is a VPN and how do they work? Tech acronyms abound these days, and they don and #39;t always have a clear definition. Hopefully this will help sort out what a VPN (virtual private network) is and why theyre useful."
13150,2016-08-17 15:41:00,Education,Grammar-and-vocabulary,9 brand names you are mispronouncing,Here are some brand names you may have been pronouncing wrong until now. Check them out!,"Are you brand conscious? Have you ever faced trouble trying to correctly pronounce a brand name? Have you been sniggered at when you failed to do so?Here are some brand names you may have been pronouncing wrong until now. Check them out!So, how many were you pronouncing wrong?Keep checking this space for more such tips.Read: 30 common English mistakes Indians make",India today web desk,https://www.indiatoday.in/education-today/grammar-vocabulary/story/mispronounced-brand-names-318773-2016-04-19,"9 brand names you are mispronouncing Are you brand conscious? Have you ever faced trouble trying to correctly pronounce a brand name? Have you been sniggered at when you failed to do so?Here are some brand names you may have been pronouncing wrong until now. Check them out!So, how many were you pronouncing wrong?Keep checking this space for more such tips.Read: 30 common English mistakes Indians make"


In [8]:
num_recommendations = 10
title = db['Entire_News'][20225]
result = predict(title, db, permutations, num_recommendations, forest)
display('Querry Headline', db['Headline'][20225])
display('Top Recommendation(s) is(are)', result)

'Querry Headline'

"Jio To Launch A Low-Cost Laptop Called The 'JioBook' With 4G LTE Connectivity"

'Top Recommendation(s) is(are)'

20225    Jio To Launch A Low-Cost Laptop Called The 'JioBook' With 4G LTE Connectivity                            
11334    25 years after Amul topical on Urmila Matondkar in Rangeela, divided India is outraged                   
10798    Haters, stop all your hate, the prices of Redmi K20 and Redmi K20 Pro are fine                           
527      A loss that keeps feet on ground                                                                         
10865    Best smartphones with 6000mAh battery to buy this February: Poco M3 joins Asus ROG Phone 3, Redmi 9 Power
2067     Apple iPhone 12 Pro Max review: The real Pro phone                                                       
10517    Poco M3 review: The smartphone to beat this year under Rs 12,000                                         
2134     MacBook Air (2020) review: Nobody does it better                                                         
2427     Redmi Note 9 Pro 5G vs Redmi Note 9 Pro: What has changed with the 5G v