In [None]:
pip install requests beautifulsoup4 mysql-connector-python selenium

In [None]:
!pip install webdriver_manager

In [None]:
pip install langcodes

In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver 
import mysql.connector
import re
import langcodes 

In [None]:

#____________________________________________________________________________________________________________________________
#
#                                               EXTRACTING WEBSITE INFO
#____________________________________________________________________________________________________________________________





#_______________________________________________Initializing web driver______________________________________________________

def extract_website_info(url):
    """Load *url* in headless Chrome and extract descriptive metadata from the page.

    Args:
        url: Website address. ``https://`` is prepended when no scheme is given.

    Returns:
        A dict with the keys ``url``, ``social_media_links``, ``tech_stack``,
        ``meta_title``, ``meta_description``, ``payment_gateways``,
        ``website_language`` and ``category``; or ``None`` when the page
        could not be loaded (the error is printed).
    """
    url = url.strip()
    if not url.startswith(('http://', 'https://')):       # Prepend 'https://' if not present in the URL
        url = f'https://{url}'

    soup = _load_page(url)
    if soup is None:
        return None

#________________________________________Extract Meta Title and Description__________________________________________________

    title_tag = soup.find('title')
    meta_title = title_tag.text if title_tag else None

    description_tag = soup.find('meta', attrs={'name': 'description'})
    # .get() instead of ['content']: a description tag without a content attribute must not crash.
    meta_description = description_tag.get('content') if description_tag else None

#______________________________________________Run the Detectors____________________________________________________________

    social_media_links = _extract_social_links(soup)
    tech_stack = _detect_tech_stack(soup)
    payment_gateways = _detect_payment_gateways(soup.get_text())
    website_language = _detect_language(soup)

    # <meta> tags have no text nodes, so the description is added explicitly.
    visible_text = ' '.join(tag.get_text() for tag in soup.find_all(['title', 'h1', 'h2', 'h3', 'p']))
    category = _classify_category(f"{visible_text} {meta_description or ''}")

#____________________________________________Returning All the Values_______________________________________________________

    return {
        'url': url,
        'social_media_links': '\n\n\n'.join(social_media_links),
        'tech_stack': ',\n\n'.join(tech_stack),
        'meta_title': meta_title,
        'meta_description': meta_description,
        'payment_gateways': ','.join(payment_gateways),
        'website_language': website_language,
        'category': category
    }


#_______________________________________________Loading the Page____________________________________________________________

def _load_page(url):
    """Fetch *url* with headless Chrome; return the parsed page, or None if loading fails."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    # Adjacent literals instead of a backslash continuation, which would embed
    # the next line's indentation inside the user-agent string.
    options.add_argument(
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    )

    driver = webdriver.Chrome(options=options)
    try:
        try:
            driver.get(url)
        except Exception as e:
            print(f"Error loading URL {url}: {e}")
            return None
        return BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        driver.quit()                                      # Always release the browser, even if parsing raises


#__________________________________________Extract Social Media Links________________________________________________________

_SOCIAL_PLATFORMS = (
    'facebook', 'twitter', 'instagram', 'linkedin', 'youtube', 'x.com',
    'whatsapp', 'reddit', 'play.google', 'telegram', 'tiktok', 'pinterest', 'github'
)


def _host_has_labels(host, platform):
    """Return True when the dot-separated labels of *platform* appear consecutively in *host*."""
    host_labels = host.split('.')
    wanted = platform.split('.')
    span = len(wanted)
    return any(host_labels[i:i + span] == wanted for i in range(len(host_labels) - span + 1))


def _extract_social_links(soup):
    """Return numbered links, at most one per platform, or ``["Not Specified"]`` if none are found."""
    from urllib.parse import urlparse

    links = []
    seen_platforms = set()                                 # Set to track added platforms

    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        try:
            host = urlparse(href.strip()).hostname or ''
        except ValueError:                                 # Malformed href (e.g. broken IPv6 literal)
            continue
        if not host:                                       # Relative / mailto / javascript links
            continue

        # Match on host labels rather than anywhere in the URL, so that
        # 'x.com' does not fire on 'box.com' or on a path such as '/blog/facebook-tips'.
        platform = next(
            (p for p in _SOCIAL_PLATFORMS if p not in seen_platforms and _host_has_labels(host, p)),
            None,
        )
        if platform is None:
            continue

        seen_platforms.add(platform)
        links.append(f"{len(links) + 1}. {href}")          # One entry per link, even if several keywords match

    return links or ["Not Specified"]


#____________________________________________Extract Tech Stack____________________________________________________________

_MVC_FRAMEWORKS = (
    'angular', 'react', 'vue', 'ember', 'backbone', 'knockout', 'svelte',
    'meteor', 'aurelia', 'marionette', 'mithril', 'riot', 'polymer',
    'alpine', 'next', 'nuxt', 'blazor', 'asp.net'
)

_CMS_KEYWORDS = {
    'WordPress': ['wp-content', 'wp-include', 'wp-json'],
    'Shopify': ['shopify', 'cdn.shopify'],
    'Joomla': ['joomla', 'joomla!'],
    # A bare 'mage' keyword would match the word "image" on virtually every page.
    'Magento': ['magento', 'mage/cookies', 'mage.cookies'],
    'Drupal': ['drupal', 'sites/default'],
    'Wix': ['wix.com', 'wixsite'],
    'Squarespace': ['squarespace', 'sqspcdn'],
    'PrestaShop': ['prestashop', 'ps_version'],
    'TYPO3': ['typo3', 't3lib'],
    'Blogger': ['blogger', 'blogspot'],
    'BigCommerce': ['bigcommerce', 'bc-sf-filter'],
    'Ghost': ['ghost', 'gh-content'],
    'HubSpot': ['hubspot', 'hs-scripts'],
    'ExpressionEngine': ['exp:channel', 'expressionengine'],
    'Craft CMS': ['craft', 'craftcms'],
    'Sitecore': ['sitecore', 'sc_webedit'],
    'Concrete5': ['concrete5', 'ccm_app']
}

_JS_LIBRARIES = (
    'jquery', 'bootstrap', 'd3', 'underscore', 'lodash', 'moment',
    'chart.js', 'three.js', 'anime.js', 'axios', 'rxjs', 'handlebars',
    'mustache', 'p5.js', 'paper.js', 'raphael', 'pixi.js', 'gsap',
    'velocity.js', 'dojo', 'ext.js', 'alpine'
)

_CSS_FRAMEWORKS = (
    'bootstrap', 'foundation', 'bulma', 'tailwind', 'materialize', 'semantic ui',
    'pure', 'milligram', 'uikit', 'spectre.css', 'skeleton', 'basscss', 'siimple',
    'susy', 'water.css', 'tachyons', 'primer', 'chota', 'min.css', 'blaze', 'wing',
    'rscss', 'yaml', 'ink', 'cutestrap', 'baseline', 'metroui', 'topcoat', 'solarcss'
)

_BACKEND_TECH = (
    'django', 'flask', 'express', 'laravel', 'spring', 'ruby on rails', 'node.js',
    'php', 'java', 'python', 'ruby', 'asp.net', 'dotnet', 'go', 'scala', 'elixir',
    'perl', 'rust', 'haskell', 'swift', 'typescript', 'clojure', 'r', 'kotlin'
)


def _has_token(text, token):
    """Return True when *token* occurs in *text* without a letter or digit on either side."""
    return re.search(rf'(?<![a-z0-9]){re.escape(token)}(?![a-z0-9])', text) is not None


def _detect_tech_stack(soup):
    """Return human-readable tech-stack entries guessed from script/stylesheet URLs and page markup."""
    # Collect each source once instead of re-walking the document for every keyword.
    script_srcs = [tag['src'].lower() for tag in soup.find_all('script', src=True)]
    stylesheet_hrefs = [
        tag['href'].lower() for tag in soup.find_all('link', rel='stylesheet', href=True)
    ]
    html = str(soup).lower()

    def named_in(names, sources):
        return [name.capitalize() for name in names if any(name in source for source in sources)]

    mvc_frameworks = named_in(_MVC_FRAMEWORKS, script_srcs)
    cms_platforms = [
        cms for cms, keywords in _CMS_KEYWORDS.items()
        if any(keyword in html for keyword in keywords)
    ]
    js_libraries = named_in(_JS_LIBRARIES, script_srcs)
    css_frameworks = named_in(_CSS_FRAMEWORKS, stylesheet_hrefs)
    # Whole-token match: a plain substring test makes 'r', 'go' or 'java'
    # fire on almost every script URL ('script', 'google', 'javascript').
    backend_tech = [
        tech.capitalize() for tech in _BACKEND_TECH
        if any(_has_token(src, tech) for src in script_srcs)
    ]

    tech_stack = []
    if mvc_frameworks:
        tech_stack.append(f"MVC Framework ({', '.join(mvc_frameworks)})")
    if cms_platforms:
        tech_stack.append(f'CMS ({", ".join(cms_platforms)})')
    if js_libraries:
        tech_stack.append(f"JavaScript Library ({', '.join(js_libraries)})")
    if css_frameworks:
        tech_stack.append(f"CSS Framework ({', '.join(css_frameworks)})")
    if backend_tech:
        tech_stack.append(f"Backend Technology ({', '.join(backend_tech)})")
    return tech_stack


#__________________________________________Extract Payment Gateways_________________________________________________________

_PAYMENT_GATEWAYS = (
    'paypal', 'stripe', 'razorpay', 'square', 'authorize.net',
    'worldpay', '2checkout', 'braintree', 'adyen', 'venmo',
    'apple pay', 'google pay', 'payoneer', 'skrill', 'wechat pay',
    'alipay', 'paytm', 'zelle', 'klarna', 'transferwise',
    'neteller', 'square cash'
)


def _detect_payment_gateways(page_text):
    """Return the gateways named in *page_text* (in list order), or ``['Not specified']``."""
    # re.escape keeps the '.' in 'authorize.net' literal; a list (not a set)
    # keeps the output order identical from run to run.
    detected = [
        name.capitalize() for name in _PAYMENT_GATEWAYS
        if re.search(rf'\b{re.escape(name)}\b', page_text, re.IGNORECASE)
    ]
    return detected or ['Not specified']


#___________________________________________Extract Website Language________________________________________________________

def _detect_language(soup):
    """Return the language name for ``<html lang=...>``, or None when the attribute is absent or empty."""
    html_tag = soup.find('html')
    lang_code = (html_tag.get('lang') or '').strip() if html_tag else ''
    if not lang_code:
        return None
    try:
        return langcodes.get(lang_code).language_name()
    except ValueError:                                     # Malformed tag: keep the raw code rather than fail
        return lang_code


#_______________________________________Website Category Based on Content___________________________________________________

_CATEGORY_KEYWORDS = {
    'E-commerce': ['shop', 'cart', 'checkout', 'product', 'sale', 'buy', 'ecommerce', 'online store', 'shopping', 'retail'],
    'Blog': ['blog', 'post', 'article', 'comment', 'author', 'subscribe'],
    'News': ['news', 'breaking', 'headline', 'report', 'journal', 'media'],
    'Education': ['course', 'lesson', 'education', 'school', 'university', 'college', 'study', 'lecture'],
    'Technology': ['tech', 'software', 'gadget', 'programming', 'developer', 'IT'],
    'Health': ['health', 'medicine', 'doctor', 'hospital', 'clinic', 'fitness', 'wellness'],
    'Finance': ['finance', 'bank', 'investment', 'loan', 'insurance', 'account', 'money', 'stock'],
    'Travel': ['travel', 'trip', 'tour', 'destination', 'flight', 'hotel', 'vacation'],
    'Food': ['food', 'recipe', 'restaurant', 'cooking', 'cuisine', 'dish'],
    'Real Estate': ['real estate', 'property', 'home', 'apartment', 'house', 'land', 'broker'],
    'Entertainment': ['entertainment', 'movie', 'film', 'music', 'game', 'concert', 'celebrity'],
    'Sports': ['sports', 'fitness', 'exercise', 'athlete', 'tournament', 'championship'],
    'Art': ['art', 'painting', 'sculpture', 'gallery', 'artist', 'creative', 'design'],
    'Government': ['government', 'official', 'public', 'policy', 'law', 'regulation'],
    'Automotive': ['automotive', 'car', 'vehicle', 'auto', 'truck', 'motor', 'parts'],
    'Charity': ['charity', 'nonprofit', 'donate', 'volunteer', 'foundation', 'cause'],
    'Music': ['music', 'band', 'concert', 'album', 'song', 'musician'],
    'Photography': ['photography', 'photo', 'photographer', 'image', 'picture', 'camera'],
    'Fashion': ['fashion', 'style', 'clothing', 'apparel', 'trend', 'designer'],
    'Gaming': ['gaming', 'video game', 'esports', 'gamer', 'console', 'streaming'],
    'Law': ['law', 'legal', 'attorney', 'court', 'justice', 'lawyer'],
    'Science': ['science', 'research', 'scientist', 'biology', 'physics', 'chemistry']
}


def _count_keyword(text, keyword):
    """Count whole-word occurrences of *keyword* (optionally pluralised with s/es) in *text*."""
    # All-caps keywords such as 'IT' are matched case-sensitively so they
    # do not collide with ordinary words ("it").
    flags = 0 if keyword.isupper() else re.IGNORECASE
    return len(re.findall(rf'\b{re.escape(keyword)}(?:e?s)?\b', text, flags))


def _classify_category(text):
    """Return the category whose keywords occur most often in *text*, or 'General' if none occur."""
    scores = {
        category: sum(_count_keyword(text, keyword) for keyword in keywords)
        for category, keywords in _CATEGORY_KEYWORDS.items()
    }
    best = max(scores, key=scores.get)                     # Ties resolve to the earliest category
    return best if scores[best] else 'General'

#____________________________________Connecting And Saving to MySQL Database _______________________________________________

def save_to_database(data):
    """Insert one scraped record into the ``website_info`` MySQL table.

    Args:
        data: Mapping with the keys ``url``, ``social_media_links``,
            ``tech_stack``, ``meta_title``, ``meta_description``,
            ``payment_gateways``, ``website_language`` and ``category``.

    Raises:
        KeyError: if *data* lacks one of the keys above.
        mysql.connector.Error: if connecting or inserting fails.

    Connection settings may be overridden with the environment variables
    ``WEB_SCRAPING_DB_HOST``, ``WEB_SCRAPING_DB_USER``,
    ``WEB_SCRAPING_DB_PASSWORD`` and ``WEB_SCRAPING_DB_NAME``.
    """
    import os

    columns = (
        'url', 'social_media_links', 'tech_stack', 'meta_title',
        'meta_description', 'payment_gateways', 'website_language', 'category',
    )
    # Build the row before connecting so a malformed record never opens a connection.
    row = tuple(data[column] for column in columns)

    # NOTE(review): the fallback password is kept only for backward compatibility;
    # credentials should not live in source control - prefer the environment variable.
    conn = mysql.connector.connect(
        host=os.environ.get('WEB_SCRAPING_DB_HOST', 'localhost'),
        user=os.environ.get('WEB_SCRAPING_DB_USER', 'root'),
        password=os.environ.get('WEB_SCRAPING_DB_PASSWORD', '@jaykumar_A04'),
        database=os.environ.get('WEB_SCRAPING_DB_NAME', 'web_scraping')
    )
    try:
        cursor = conn.cursor()
        try:
            # Values are passed as parameters; only the fixed column names are formatted in.
            placeholders = ', '.join(['%s'] * len(columns))
            cursor.execute(
                f"INSERT INTO website_info ({', '.join(columns)}) VALUES ({placeholders})",
                row,
            )
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()                                       # Release the connection even when the insert fails
    
#__________________________________________________Main Function_______________________________________________________________

def main(urls=None):
    """Scrape each URL and store the result in the database.

    Args:
        urls: Iterable of website addresses to process. Defaults to the
            built-in sample list when omitted.

    A failure on one URL is printed and does not stop the remaining URLs.
    """
    if urls is None:
        urls = ['www.udacity.com']        # Paste your urls
    for url in urls:
        try:
            data = extract_website_info(url)
            if data:                      # None means the page could not be loaded
                save_to_database(data)
        except Exception as e:
            print(f"Error processing {url}: {e}")

# Run the scraper only when this code is executed directly, not when it is imported.
if __name__ == '__main__':
    main()
    
#____________________________________________________________________________________________________________________________
#
#                                                    THE END
#____________________________________________________________________________________________________________________________


In [None]:
www.coursera.org
www.khanacademy.org
www.codecademy.com
ocw.mit.edu
www.duolingo.com
ed.ted.com
www.skillshare.com
www.linkedin.com/learning
finance.yahoo.com
www.investopedia.com
www.bloomberg.com
www.cnbc.com
www.fool.com
www.marketwatch.com
www.seekingalpha.com
www.morningstar.com
www.nerdwallet.com
www.ft.com
www.espn.com
www.cbssports.com
bleacherreport.com
www.si.com
www.foxsports.com
www.nbcsports.com
sports.yahoo.com
www.nfl.com
www.nba.com
www.mlb.com
www.cnn.com
www.bbc.com/news
www.reuters.com
www.nytimes.com
www.theguardian.com
www.washingtonpost.com
www.aljazeera.com
www.npr.org
www.apnews.com
www.usatoday.com
techcrunch.com
www.theverge.com
www.wired.com
www.cnet.com
mashable.com
gizmodo.com
www.engadget.com
arstechnica.com
slashdot.org
www.tomshardware.com
www.webmd.com
www.mayoclinic.org
www.cdc.gov
www.healthline.com
www.nih.gov
www.medicalnewstoday.com
www.everydayhealth.com
www.nhs.uk
www.health.com
www.who.int
www.spotify.com
www.apple.com/music
www.soundcloud.com
www.pandora.com
www.youtube.com/music
www.deezer.com
www.tidal.com
www.bandcamp.com
www.mixcloud.com
www.billboard.com/music
www.autotrader.com
www.cars.com
www.edmunds.com
www.kbb.com
www.caranddriver.com
www.motortrend.com
www.autoblog.com
www.thedrive.com
www.autocar.co.uk
www.hemmings.com
www.allrecipes.com
www.foodnetwork.com
www.epicurious.com
www.bonappetit.com
www.seriouseats.com
www.yummly.com
www.thekitchn.com
www.tasteofhome.com
www.delish.com
www.eater.com
www.vogue.com
www.elle.com
www.harpersbazaar.com
www.thecut.com
www.gq.com
www.glamour.com
www.refinery29.com
www.instyle.com
www.cosmopolitan.com
www.marieclaire.com