In [None]:
!pip install requests_html

### Import Libraries

In [19]:
import requests
import urllib
import pandas as pd
import numpy as np
from requests_html import HTMLSession
import re
import nltk

In [3]:
# Download the NLTK data used later in the notebook: the stopword corpus
# (read through nltk.corpus.stopwords) and the "punkt" tokenizer data
# (word_tokenize is imported from nltk.tokenize in a later cell).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Scrape Google search results for the URL

In [5]:
def get_source(url):
    """Fetch ``url`` through a fresh ``HTMLSession`` and return the response.

    Args:
        url: Address to request.

    Returns:
        The response object produced by ``HTMLSession().get(url)``, or
        ``None`` when the request raises
        ``requests.exceptions.RequestException`` (the error is printed).
    """
    try:
        response = HTMLSession().get(url)
    except requests.exceptions.RequestException as err:
        # Best effort: report the failure and hand back None to the caller.
        print(err)
        return None
    return response


In [6]:
def google_search(query):
    """Run a Google search for ``query`` and return the parsed result entries.

    Args:
        query: Free-text search string; it is URL-encoded before being sent.

    Returns:
        A list of dicts with the keys ``'title'``, ``'link'`` and ``'text'``,
        one per ``.tF2Cxc`` element found in the response. A field whose
        element is missing from a result is returned as an empty string.
        An empty list is returned when the page could not be fetched
        (``get_source`` returned ``None``).
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import quote_plus

    response = get_source("https://www.google.co.in/search?q=" + quote_plus(query))
    if response is None:
        # get_source already reported the request error.
        return []

    output = []
    for result in response.html.find('.tF2Cxc'):
        # find(..., first=True) yields None when nothing matches, so every
        # piece is checked before it is dereferenced.
        title = result.find('h3', first=True)
        link = result.find('.yuRUbf a', first=True)
        snippet = result.find('.IsZvec', first=True)

        output.append({
            'title': title.text if title is not None else '',
            'link': link.attrs.get('href', '') if link is not None else '',
            'text': snippet.text if snippet is not None else '',
        })
    return output


In [7]:
def get_google_result(query):
    """Return the snippet text of all Google results for ``query`` as one string.

    Args:
        query: Search string forwarded to ``google_search``.

    Returns:
        Every result's ``'text'`` value, each preceded by a single space and
        concatenated in result order; an empty string when there are no
        results.
    """
    results = google_search(query)
    # Each snippet keeps its leading space, so the output format matches the
    # original accumulate-in-a-loop implementation.
    return ''.join(' ' + res['text'] for res in results)


In [10]:
# Smoke test: pass the site URL itself as the Google query and display the
# concatenated result snippets that come back.
text = get_google_result('https://www.ebay.com/')
text

" Buy & sell electronics, cars, clothes, collectibles & more on eBay, the world's online marketplace. Top brands, low prices & free shipping on many items. Product description. Buy and sell on the go with eBay. Explore discount offers on best-selling ... Developer info. mobilehelp@ebay.com · http://mobile.ebay.com · More apps by this developer\xa0...\nRating: 4.3 · \u200e26,510 reviews Information and news about eBay Inc. (Nasdaq: EBAY), a global commerce leader that connects millions of buyers and sellers in 190 markets around the world. Interested in joining us? Get in touch with our Careers Team. Contact Careers. Investor Relations Contacts. For financial inquiries, please contact our Investor\xa0... eBay, San Jose, CA. 11000564 likes · 23971 talking about this. The latest things, the everyday things, and the I-can't-believe-this-deal things. Find..."

### Cleaning and Filtering the text

In [11]:
def strip_url(string):
    """Remove every run that starts with ``http`` and extends to the next whitespace."""
    url_pattern = re.compile(r'http\S+', flags=re.MULTILINE)
    return url_pattern.sub('', string)

def strip_email(string):
    """Remove every token containing ``@``, plus one trailing whitespace character.

    The pattern is written as a raw string so that ``\\S`` and ``\\s`` reach
    the regex engine as-is instead of relying on invalid string escapes.
    """
    return re.sub(r"\S*@\S*\s?", "", string)

def strip_punctuation(string):
    """Replace each character that is neither a word character nor whitespace with a space."""
    return re.sub(r'[^\w\s]', ' ', string)

def strip_numbers(string):
    """Remove every run of digits from ``string``.

    The pattern is a raw string so ``\\d`` is passed to the regex engine
    rather than being treated as an (invalid) string escape.
    """
    return re.sub(r"\d+", "", string)

def strip_whitespaces(string):
    """Collapse each run of space characters into a single space.

    Only the space character is matched; tabs and newlines are left as-is.
    """
    return re.sub(' +', ' ', string)

def remove_multiline(string):
    """Flatten ``string`` onto a single line.

    Lines are joined with a single space rather than an empty string, so the
    last word of one line is not fused with the first word of the next
    (``"foo\\nbar"`` becomes ``"foo bar"``, not ``"foobar"``).
    """
    return " ".join(string.splitlines())

def make_lowercase(string):
    """Return ``string`` converted to lower case."""
    return string.lower()

In [12]:
def clean_text(text):
    """Normalise scraped text and drop stopwords.

    The text is passed, in order, through ``strip_numbers``, ``strip_url``,
    ``strip_email``, ``strip_punctuation``, ``strip_whitespaces``,
    ``remove_multiline`` and ``make_lowercase``; it is then tokenised with
    ``word_tokenize`` and every token present in ``stopwords.words()`` is
    discarded.

    Args:
        text: Raw text to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = strip_numbers(text)
    text = strip_url(text)
    text = strip_email(text)
    text = strip_punctuation(text)
    text = strip_whitespaces(text)
    text = remove_multiline(text)
    text = make_lowercase(text)

    # Build the stopword collection once, as a set: calling stopwords.words()
    # for every token and scanning the resulting list is needlessly slow.
    stop_words = set(stopwords.words())

    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

### Import the keyword dataset

In [13]:
# Load the keyword table and preview its first rows.
# NOTE(review): hard-coded absolute path (looks like a Colab Google Drive
# mount) — the file must be reachable at this location before the cell runs.
df = pd.read_excel('/content/drive/MyDrive/Rapidken/keywords_data.xlsx')
df.head()

Unnamed: 0,Keyword_No,E-Commerce,Video/Streaming/Images,Listing,Article/Blog/News
0,1,shipping,videos,listing,headlines
1,2,item,video,discover,news
2,3,items,music,connects,stories
3,4,shopping,movies,connect,politics
4,5,price,movie,listings,post


## Convert the keyword columns into arrays


In [49]:
# One array of keywords per category column of df. predict() reads these
# module-level names and tests each scraped word for membership in them.
e_commerce_sites = df['E-Commerce'].values
streaming_sites = df['Video/Streaming/Images'].values
listing_sites = df['Listing'].values
news_blogging_sites = df['Article/Blog/News'].values

### Compare and predict the category

In [50]:
def predict(url):
    """Print the estimated category of the website behind ``url``.

    The text returned by ``get_google_result(url)`` is cleaned with
    ``clean_text`` and tokenised; each token is then checked against the
    module-level keyword arrays ``e_commerce_sites``, ``streaming_sites``,
    ``listing_sites`` and ``news_blogging_sites``. The share of matches per
    category is printed as a percentage, followed by the category with the
    most matches (the first one wins a tie).

    If no token matches any keyword, a notice is printed instead of the
    report, because the percentages would require dividing by zero.

    Args:
        url: Website address, used as the search query.

    Returns:
        None. The result is only printed.
    """
    words = word_tokenize(clean_text(get_google_result(url)))

    # (keyword array, percentage line template, verdict line) per category,
    # listed in the fixed order used by the report.
    categories = (
        (e_commerce_sites,
         'E-Commerce Website           {:.2f}%',
         "It is an E-Commerce Website"),
        (streaming_sites,
         'Streaming/Images Website     {:.2f}%',
         "It is a Streaming/Image Website"),
        (listing_sites,
         'Listing Website              {:.2f}%',
         "It is a Listing Website"),
        (news_blogging_sites,
         'News/Blogging Website        {:.2f}%\n',
         "It is an News/Blogging Website"),
    )

    # A word that appears in several keyword arrays counts for each of them.
    category_counts = np.array(
        [sum(1 for word in words if word in keywords) for keywords, _, _ in categories]
    )
    total = category_counts.sum()

    if total == 0:
        # Without any match the percentages are undefined and argmax would
        # arbitrarily select the first category.
        print('No category keywords were found; the category cannot be determined.')
        return

    print('Website Category Probability:\n')
    for (_, line_template, _), count in zip(categories, category_counts):
        print(line_template.format((count / total) * 100))

    print(categories[category_counts.argmax()][2])

In [51]:
predict('https://www.ndtv.com/')


Website Category Probability:

E-Commerce Website           0.00%
Streaming/Images Website     20.00%
Listing Website              8.00%
News/Blogging Website        72.00%

It is an News/Blogging Website


In [44]:
# NOTE(review): the output recorded below names "Listing" although
# E-Commerce has the largest share (75.00%). This cell's execution count (44)
# is lower than that of the cell defining predict (50), so the output looks
# stale — re-run after Restart & Run All to refresh it.
predict('https://www.shopclues.com/')

Website Category Probability:

E-Commerce Website           75.00%
Streaming/Images Website     0.00%
Listing Website              7.14%
News/Blogging Website        17.86%

It is a Listing Website


In [40]:
predict('https://medium.com/')

Website Category Probability:

E-Commerce Website           21.74%
Streaming/Images Website     8.70%
Listing Website              26.09%
News/Blogging Website        43.48%

It is an News/Blogging Website


In [34]:

predict('https://www.netflix.com/')

Website Category Probability:

E-Commerce Website           10.71%
Streaming/Images Website     75.00%
Listing Website              7.14%
News/Blogging Website        7.14%

It is a Streaming/Image Website
1


In [48]:
predict('https://www.youtube.com/')

Website Category Probability:

E-Commerce Website           11.11%
Streaming/Images Website     44.44%
Listing Website              16.67%
News/Blogging Website        27.78%

It is a Streaming/Image Website
