In [3]:
import pandas as pd
import numpy as np
import requests
import time
import urllib
from bs4 import BeautifulSoup

In [4]:
# Analytics drift website url
url = 'https://analyticsdrift.com/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36'}

In [5]:
def request_url(link):     
    time.sleep(3) 
    response = requests.get(link, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to fetch URL: {link}")
        return None

In [6]:
# function to parse the html
def parse_html(to_parse):
    """
    It takes a string, then parse it.
    Finally, it retuns a soup object.
    """
    soup = BeautifulSoup(to_parse, 'html.parser')
    return soup

In [7]:
def all_sections(main_url):
   soup = parse_html(request_url(url))
   section_list = []
   ul = soup.find('div' , class_ = "td_block_inner td-fix-index")
   for li in ul.find_all('li'):
       section_list.append(li.a.get('href'))
   # Remove the section which we will not consider
   # Like Services , Contact us, About us....
   section_list = section_list[1:5]    
   return section_list

In [8]:
# checking the sections url
all_section = all_sections(url)
all_section

['https://analyticsdrift.com/news/',
 'https://analyticsdrift.com/data-science/',
 'https://analyticsdrift.com/developer/',
 'https://analyticsdrift.com/miscellaneous/']

In [9]:
# Check for valid urls
def is_valid_url(url):
    try:
        urllib.request.urlopen(url)
        return True
    except Exception as e:
        return False

Every section has multiple pages with a different number of pages, so we have to extract article links for each section separately.

News Section: This section has 233 pages.

Data Science Section: This section has 28 pages.

Developer Section: This section has 5 pages.

Miscellaneous Section: This section has 12 pages.

In [51]:
def extract_urls_from_section(section_url, num_pages):
    all_urls = set()
    for i in range(1, num_pages + 1):
        page_url = f"{section_url}/page/{i}/" 
        html = request_url(page_url)
        if html:
            soup = parse_html(html)
            data = soup.find_all('div', class_='td-module-meta-info')
            for item in data:
                link = item.a.get('href')
                if link and is_valid_url(link):
                    all_urls.add(link)
        else:
            break
    return all_urls

In [52]:
def extract_article_urls():
    all_urls = set()

    # Extracting article URLs from News Section
    news_section_url = f"{url}/news"
    all_urls.update(extract_urls_from_section(news_section_url, 1))

    # Extracting article URLs from Data Science Section
    data_science_section_url = f"{url}/data-science"
    all_urls.update(extract_urls_from_section(data_science_section_url, 1))

    # Extracting article URLs from Developer Section
    developer_section_url = f"{url}/developer"
    all_urls.update(extract_urls_from_section(developer_section_url, 1))

    # Extracting article URLs from Miscellaneous Section
    miscellaneous_section_url = f"{url}/miscellaneous"
    all_urls.update(extract_urls_from_section(miscellaneous_section_url, 1))

    return all_urls

In [53]:
all_url = extract_article_urls()

Failed to fetch URL: https://analyticsdrift.com/mistral-ais-new-llm-model-outperforms-gpt-3-model//news/page/1/
Failed to fetch URL: https://analyticsdrift.com/mistral-ais-new-llm-model-outperforms-gpt-3-model//data-science/page/1/
Failed to fetch URL: https://analyticsdrift.com/mistral-ais-new-llm-model-outperforms-gpt-3-model//developer/page/1/
Failed to fetch URL: https://analyticsdrift.com/mistral-ais-new-llm-model-outperforms-gpt-3-model//miscellaneous/page/1/


In [13]:
# function for extracring article heading from each article
def article_heading(article_url):
    soup = parse_html(request_url(article_url))
    article_heading = soup.find('h1', class_='tdb-title-text').text.strip()
    return article_heading

In [14]:
# storing all the artcle heading in a list 
article_headings = []
for url in all_url:
    article_headings.append(article_heading(url))

In [15]:
print(len(article_headings))

60


In [22]:
# function for extracting internal links
def extract_internal_links(page_url):
    internal_links = set()  

    # Fetching the HTML content of the page
    soup = parse_html(request_url(page_url))

    # Find all <a> tags with href attribute
    for a_tag in soup.find_all('a', href=True):
        link = a_tag['href']

        # Skip empty links
        if not link:
            continue

        # Check if the link is internal
        if link.startswith('http://') or link.startswith('https://'):
            if link.startswith('https://analyticsdrift.com/'):
                internal_links.add(link)  

    return list(internal_links) 

In [17]:
# Function for extracting external links 
def extract_external_links(page_url):
    external_links = set()  

    # Fetching the HTML content of the page
    soup = parse_html(request_url(page_url))

    # Find all <a> tags with href attribute
    for a_tag in soup.find_all('a', href=True):
        link = a_tag['href']

        # Skip empty links
        if not link:
            continue

        # Check if the link is external
        if link.startswith('http://') or link.startswith('https://'):
            if not link.startswith('https://analyticsdrift.com/'):
                external_links.add(link)  

    return list(external_links)  



In [18]:
# storing all the external links and internal links into the list
external_links =[]
internal_links = []

for url in all_url:
    external_links.append(extract_external_links(url))
    internal_links.append(extract_internal_links(url))

print(len(external_links))
print(len(internal_links)) 

60
60


In [19]:
# Function to extract broken links from the articles
def find_broken_links(page_url):
    broken_links = []

    # Fetching the HTML content of the page
    soup = parse_html(request_url(page_url))    
    for a_tag in soup.find_all('a', href=True):
        link = a_tag['href']

        # Skip empty links
        if not link:
            continue

        # Check if the link is reachable
        try:
            response = requests.head(link)
            if response.status_code != 200:
                broken_links.append(link)
        except Exception as e:
            # If any exception occurs, consider the link broken
            broken_links.append(link)
    
    if len(broken_links)==0:
        broken_links.append(None)        

    return broken_links

In [23]:
# storing all the broken link into a  list variable
broken_links = []
for url in all_url:
    broken_links.append(find_broken_links(url)) 

In [24]:
# checking length of the broken link list variable
print(len(broken_links))

60


In [27]:
def not_new_tabs(page_url):
    not_new_tab = set()
    soup = parse_html(request_url(page_url))
    links = soup.find_all('a')
    
    # Loop through each link and check its attributes
    for link in links:
        href = link.get('href')
        target = link.get('target')
        
        # Check if the link has a target attribute set to "_blank"
        if target == '_blank':
            not_new_tab.add(f"Link: {href} opens in a new tab")
        else:
            not_new_tab.add(f"Link: {href} does not open in a new tab")

    return list(not_new_tab)             

In [32]:
not_new_tab = []
for url in all_url:
    not_new_tab.append(not_new_tabs(url))

In [33]:
#checking length of the vaiable
print(len(not_new_tab))

60


In [28]:
# funtion to find sponsored links
def find_sponserd_link(page_url):
    sponserd_link = set()
    soup = parse_html(request_url(page_url))
    
    for a_tag in soup.find_all('a', rel='sponsored'):
        link = a_tag['href']
        sponserd_link.add(link)
    
    if len(sponserd_link)==0:
        sponserd_link.add(None)
    
    return list(sponserd_link)    

In [29]:
# storing all sponserd links in a list variable
sponserd_links = []
for url in all_url:
    sponserd_links.append(find_sponserd_link(url))

In [30]:
print(len(sponserd_links))

60


In [31]:
sl_no = [i for i in range(len(all_url))]

In [41]:
# createing dictionary to store all data

data = {
    'SL No' : sl_no,    
    'Title' : article_headings,
    'External Link' : external_links,
    'Internal Link:' : internal_links,
    'Broken Links' : broken_links,
    'Not New Tab' : not_new_tab,
    'Sponsored' : sponserd_links
} 

In [42]:
# crating dataframe using dictionary variable
df = pd.DataFrame(data) 

In [44]:
df.set_index('SL No', inplace=True)

In [45]:
df.head()

Unnamed: 0_level_0,Title,External Link,Internal Link:,Broken Links,Not New Tab,Sponsored
SL No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Weis Wave: Revolutionizing Market Analysis,[https://api.whatsapp.com/send?text=Weis+Wave%...,"[https://analyticsdrift.com/developer/, https:...","[#, #, #login-form, https://www.linkedin.com/c...",[Link: https://analyticsdrift.com/wp-content/u...,[None]
1,"OpenAI Open Sources Triton 1.0, A GPU Programm...","[https://analyticsdrift.com, https://twitter.c...",[https://analyticsdrift.com/weis-wave-revoluti...,"[#, #, #login-form, https://www.linkedin.com/c...",[Link: /openai-open-sources-triton-1-0-a-gpu-p...,[None]
2,"Optimus Gen-2, Second Generation Humanoid Robo...","[https://discord.com/invite/GkKUqKZYaG, https:...",[https://analyticsdrift.com/is-grok-the-first-...,"[#, #, #login-form, https://www.linkedin.com/c...",[Link: https://analyticsdrift.com/swaayatt-rob...,[None]
3,Navigating UK Healthcare in the Digital Age,"[https://discord.com/invite/GkKUqKZYaG, https:...",[https://analyticsdrift.com/weis-wave-revoluti...,"[#, #, #login-form, https://www.linkedin.com/c...",[Link: https://analyticsdrift.com/author/ratan...,[None]
4,"Microsoft Unveils Phi-2, a Small Language Mode...","[https://discord.com/invite/GkKUqKZYaG, https:...",[https://analyticsdrift.com/meta-unveils-open-...,"[#, #, #login-form, https://www.linkedin.com/c...",[Link: https://analyticsdrift.com/swaayatt-rob...,[None]
