In [1]:
pip install requests beautifulsoup4 mysql-connector-python

Note: you may need to restart the kernel to use updated packages.


In [12]:
import os

import mysql.connector
import requests
from bs4 import BeautifulSoup

In [13]:

def meta_info(url, timeout=10):
    """Download a page and return its title and meta description.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a server that never answers would block the
            notebook indefinitely.

    Returns:
        A ``(title, description)`` tuple of strings. An element is ``''``
        when the page has no ``<title>`` tag, no ``<meta name="description">``
        tag, or a description tag that lacks a ``content`` attribute.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Look every tag up once instead of searching the tree twice.
    title_tag = soup.find('title')
    title = title_tag.text if title_tag is not None else ''

    description_tag = soup.find('meta', attrs={'name': 'description'})
    # .get() avoids a KeyError for a description tag without "content".
    description = description_tag.get('content', '') if description_tag is not None else ''
    return title, description

def social_media_links_extraction(soup):
    """Return the social-network links found on a parsed page.

    Every ``<a>`` element with an ``href`` is inspected; an href is kept when
    it contains ``facebook.com``, ``twitter.com``, ``linkedin.com`` or
    ``instagram.com`` as a substring.

    Args:
        soup: Parsed page exposing ``find_all``.

    Returns:
        The matching hrefs joined with ``','`` in the order ``find_all``
        yields them (duplicates are kept), or ``''`` when nothing matches.
    """
    social_domains = ('facebook.com', 'twitter.com', 'linkedin.com', 'instagram.com')
    matches = [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if any(domain in anchor['href'] for domain in social_domains)
    ]
    return ','.join(matches)

def tech_stack_extraction(soup):
    """Detect front-end libraries from the page's external script URLs.

    Each ``<script>`` element with a ``src`` is checked for the substrings
    ``jquery`` and ``bootstrap``. Matching is case-insensitive, and every
    library is reported once no matter how many scripts reference it.

    Args:
        soup: Parsed page exposing ``find_all``.

    Returns:
        Detected library names (``'jQuery'``, ``'Bootstrap'``) joined with
        ``','`` in first-seen order, or ``''`` when none are found.
    """
    markers = (('jquery', 'jQuery'), ('bootstrap', 'Bootstrap'))
    tech_stack = []
    for script in soup.find_all('script', src=True):
        src = script['src'].lower()
        for needle, label in markers:
            if needle in src:
                # Several files of the same library (jquery.js, jquery-ui.js)
                # must not produce repeated entries.
                if label not in tech_stack:
                    tech_stack.append(label)
                # Only the first matching marker counts for a given script.
                break
    return ','.join(tech_stack)

def payment_gateways_extraction(soup):
    """Detect payment providers whose domain appears in the page text.

    Args:
        soup: Parsed page exposing a ``text`` attribute.

    Returns:
        Names of the detected providers (``'PayPal'``, ``'Stripe'``,
        ``'Razorpay'``) joined with ``','`` in that fixed order, or ``''``
        when none of their domains occur in the text.
    """
    # Read the text once; every access to soup.text re-extracts it.
    # NOTE(review): only soup.text is searched, so a gateway referenced solely
    # in an attribute (e.g. a script src) is presumably missed - confirm.
    page_text = soup.text
    known_gateways = (
        ('paypal.com', 'PayPal'),
        ('stripe.com', 'Stripe'),
        ('razorpay.com', 'Razorpay'),
    )
    return ','.join(label for domain, label in known_gateways if domain in page_text)

def scrape_website(url, timeout=10):
    """Download a page once and collect every scraped field from it.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot block the run indefinitely.

    Returns:
        A dict with the keys ``url``, ``meta_title``, ``meta_description``,
        ``tech_stack``, ``payment_gateways`` and ``social_media_links``.
        All values are strings; fields with nothing detected are ``''``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Title and description are read from the soup already in hand. Calling
    # meta_info(url) here would download the same page a second time.
    title_tag = soup.find('title')
    description_tag = soup.find('meta', attrs={'name': 'description'})
    meta_title = title_tag.text if title_tag is not None else ''
    # .get() avoids a KeyError for a description tag without "content".
    meta_description = description_tag.get('content', '') if description_tag is not None else ''

    return {
        'url': url,
        'meta_title': meta_title,
        'meta_description': meta_description,
        'tech_stack': tech_stack_extraction(soup),
        'payment_gateways': payment_gateways_extraction(soup),
        'social_media_links': social_media_links_extraction(soup),
    }


In [14]:
def store_data_in_db(data):
    """Insert one scraped record into the ``site_info`` MySQL table.

    Connection settings are read from the environment variables
    ``MYSQL_HOST``, ``MYSQL_USER``, ``MYSQL_PASSWORD`` and ``MYSQL_DATABASE``.
    A variable that is not set falls back to the value this notebook has
    always used, so existing local setups keep working unchanged.

    Args:
        data: Mapping with the keys ``url``, ``meta_title``,
            ``meta_description``, ``tech_stack``, ``payment_gateways`` and
            ``social_media_links``.

    Raises:
        KeyError: If ``data`` lacks one of the keys above.
        Exception: Whatever the connector raises while connecting, inserting
            or committing is propagated after the cursor and connection have
            been closed.
    """
    columns = ('url', 'meta_title', 'meta_description', 'tech_stack',
               'payment_gateways', 'social_media_links')
    # Build the row before connecting so a malformed record fails early.
    val = tuple(data[column] for column in columns)
    sql = """INSERT INTO site_info (url, meta_title, meta_description, tech_stack, payment_gateways, social_media_links)
             VALUES (%s, %s, %s, %s, %s, %s)"""

    conn = mysql.connector.connect(
        host=os.environ.get('MYSQL_HOST', 'localhost'),
        user=os.environ.get('MYSQL_USER', 'root'),
        # Prefer setting MYSQL_PASSWORD over relying on this fallback.
        password=os.environ.get('MYSQL_PASSWORD', 'mysql'),
        database=os.environ.get('MYSQL_DATABASE', 'scraping'),
    )
    # try/finally guarantees the cursor and connection are released even when
    # the insert or the commit raises.
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, val)
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()


In [15]:
def main(websites):
    """Scrape every website in ``websites`` and store each result.

    Args:
        websites: Iterable of page URLs. Entries that are ``None``, empty or
            whitespace-only are skipped, because a blank URL cannot be
            fetched and would abort the whole run.

    Returns:
        None. Each scraped record is passed to ``store_data_in_db``.
    """
    for website in websites:
        # Ignore blank placeholders such as '' left in the input list.
        if not website or not website.strip():
            continue
        data = scrape_website(website)
        store_data_in_db(data)


In [None]:
# Pages to scrape. Add one full URL (including the scheme) per entry.
# The list starts empty so the cell runs cleanly until real URLs are added;
# an empty-string placeholder would be treated as a URL and scraped.
sites = [
    # 'https://example.com',
]
main(sites)
        