In [1]:
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup

In [2]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36'}

In [3]:
# function to request url
def request_url(url):
    time.sleep(2)
    try:
        response = requests.get(url, headers=headers, timeout=20)
        if response.status_code == 200:
            return response.text
        else:
            print(f'Failed to fetch {url}')
            return None
    except requests.exceptions.Timeout:
        print(f'Timeout error: {url}')
        return None

In [8]:
# function to parse html
def parse_html(to_parse):
    """
    It takes a string, then parse it.
    Finally, it retuns a soup object.
    """
    soup = BeautifulSoup(to_parse, 'html.parser')
    return soup 

In [9]:
all_urls = set()

# extracting data fron news section
for i in range(1,234):
    url = f'https://analyticsdrift.com/news/page/{i}/'
    soup = parse_html(request_url(url))
    data = soup.find_all('h3', class_ = 'entry-title td-module-title')
    for item in data:
        link = item.a.get('href')
        full_link = f'News {link}'
        all_urls.add(full_link)

In [10]:
 # extracting data from Data Science section
for i in range(1, 29):
    url = f'https://analyticsdrift.com/data-science/page/{i}/'
    soup = parse_html(request_url(url))
    data = soup.find_all('h3', class_ = 'entry-title td-module-title')
    for item in data:
        link = item.a.get('href')
        full_link = f'Datascience {link}'
        all_urls.add(full_link)

In [11]:
# extracting data from Developer section
for i in range(1, 6):
    url = f'https://analyticsdrift.com/developer/page/{i}/'
    soup = parse_html(request_url(url))
    data = soup.find_all('h3', class_ = 'entry-title td-module-title')
    for item in data:
        link = item.a.get('href')
        full_link = f'Developer {link}'
        all_urls.add(full_link)

In [12]:
# extracting data from miscellaneous section
for i in range(1, 13):
    url = f'https://analyticsdrift.com/miscellaneous/page/{i}/'
    soup = parse_html(request_url(url))
    data = soup.find_all('h3', class_ = 'entry-title td-module-title')
    for item in data:
        link = item.a.get('href')
        full_link = f'Miscellaneous {link}'
        all_urls.add(full_link)

In [13]:
# converting set to list because iteration not possible on set
print(len(all_urls))
all_urls = list(all_urls)

3358


In [14]:
# function to fetch article heading
def fetch_article_heading(url):
    try:
        html_text = request_url(url)
        if html_text:
            soup = parse_html(html_text)
            heading_element = soup.find('h1', class_='tdb-title-text')
            if heading_element:
                return heading_element.text.strip()
    except Exception as e:
        print(f"Error fetching or parsing heading from {url}: {e}")
    return None

In [15]:
# function to fetch article date
def fetch_article_date(url):
    try:
        html_text = request_url(url)
        if html_text:
            soup = parse_html(html_text)
            time_tag = soup.find('time', class_='entry-date updated td-module-date')
            if time_tag:
                return time_tag['datetime'].split('T')[0]
    except Exception as e:
        print(f"Error fetching or parsing date from {url}: {e}")
    return None

In [16]:
# function to fetch article author
def fetch_article_author(url):
    try:
        html_text = request_url(url)
        if html_text:
            soup = parse_html(html_text)
            author_tag = soup.find('a', class_='tdb-author-name')
            if author_tag:
                return author_tag.text.strip()
    except Exception as e:
        print(f"Error fetching or parsing author from {url}: {e}")
    return None 

In [17]:
# function to extrach all necessary links
def extract_links(url):
    base_url = 'https://analyticsdrift.com/'
    external_links = set()
    internal_links = set()
    broken_links = set()
    new_tab_links = set()
    same_tab_links = set()
    sponsored_links = set()

    html_text = request_url(url)
    if html_text:
        soup = parse_html(html_text)
        data = soup.find_all('div', class_='tdb-block-inner td-fix-index')
        
        for div in data:
            paragraphs = div.find_all('p')
            for paragraph in paragraphs:
                links = paragraph.find_all('a', href=True)
                for link in links:
                    href = link.get('href')
                    if href:
                        if base_url in href:
                            internal_links.add(href)
                        else:
                            external_links.add(href)

                        try:
                            response = requests.head(href)
                            if response.status_code != 200:
                                broken_links.add(href)
                                # If broken link found, exit the loop
                                break
                            else:
                                content_type = response.headers.get('Content-Type', '').lower()
                                if 'text' in content_type or 'html' in content_type:
                                    inner_html_text = requests.get(href).text
                                    inner_soup = parse_html(inner_html_text)
                                    if inner_soup.find('a', target='_blank'):
                                        new_tab_links.add(href)
                                    else:
                                        same_tab_links.add(href)
                                    if inner_soup.find('a', rel='nofollow'):
                                        sponsored_links.add(href)
                                else:
                                    print(f"Skipping binary content for {href}")

                        except requests.RequestException as e:
                            print(f"HTTP request error for {href}: {e}")
                            broken_links.add(href)

    return {
        'external_link': ",".join(external_links),
        'internal_link': ",".join(internal_links),
        'broken_link': ",".join(broken_links),
        'new_tab_link': ",".join(new_tab_links),
        'same_tab_link': ",".join(same_tab_links),
        'sponsored_link': ",".join(sponsored_links)
    }
                           

In [18]:
article_heading = []
article_url = [] 
article_date = []
external_links = []
internal_links = []
broken_links = []
new_tab_links = []
same_tab_links = []
sponsored_links = []
section_name = []
author_name = []

In [32]:
count = 200 
for url in all_urls[200 : 400]:
    section_name.append(url.split(' ')[0])
    article_url.append(url.split(' ')[1])
    article_heading.append(fetch_article_heading(url.split(' ')[1]))
    article_date.append(fetch_article_date(url.split(' ')[1]))
    author_name.append(fetch_article_author(url.split(' ')[1]))
    
    links_info = extract_links(url.split(' ')[1])
    external_links.append(links_info['external_link'])
    internal_links.append(links_info['internal_link'])
    broken_links.append(links_info['broken_link'])
    new_tab_links.append(links_info['new_tab_link'])
    same_tab_links.append(links_info['same_tab_link'])
    sponsored_links.append(links_info['sponsored_link'])
    print(f"Extracted links number {count} and remaining {len(all_urls) - count}")
    count +=1
    

Extracted links number 200 and remaining 3158
Extracted links number 201 and remaining 3157


In [28]:
print(len(article_heading))
print(len(article_url))
print(len(article_date))
print(len(external_links))
print(len(internal_links))
print(len(broken_links))
print(len(new_tab_links))
print(len(same_tab_links))
print(len(sponsored_links))
print(len(section_name))
print(len(author_name))

200
200
200
200
200
200
200
200
200
200
200


In [29]:
data = {
    'Article Heading': article_heading,
    'Article Url': article_url,
    'Article Section': section_name,
    'Article Date': article_date,
    'Author Name': author_name,
    'External Links': external_links,
    'Lnternal Llinks': internal_links,
    'Broken Links': broken_links,
    'Links Open in new tab': new_tab_links,
    'Links open in same tab' : same_tab_links,
    'Sponsored_Links': sponsored_links
   
}

In [30]:
df = pd.DataFrame(data)
df.head(4)

Unnamed: 0,Article Heading,Article Url,Article Section,Article Date,Author Name,External Links,Lnternal Llinks,Broken Links,Links Open in new tab,Links open in same tab,Sponsored_Links
0,Apple’s Ali Farhadi Appointed CEO of Allen Ins...,https://analyticsdrift.com/apples-ali-farhadi-...,News,2023-06-21,Sahil Pawar,https://allenai.org/,https://analyticsdrift.com/tag/artificial-inte...,,https://analyticsdrift.com/microsoft-announces...,"https://allenai.org/,https://analyticsdrift.co...",https://analyticsdrift.com/microsoft-announces...
1,Gupshup acquires Conversational AI provider As...,https://analyticsdrift.com/gupshup-acquires-co...,News,2022-04-21,Dipayan Mitra,https://www.gupshup.io/resources/press-release...,https://analyticsdrift.com/south-korea-telecom...,https://www.gupshup.io/resources/press-release...,https://analyticsdrift.com/south-korea-telecom...,,https://analyticsdrift.com/south-korea-telecom...
2,Amazon launches Free AWS Builder Online Series...,https://analyticsdrift.com/amazon-launches-fre...,News,2021-12-23,Dipayan Mitra,https://aws.amazon.com/events/builders-online-...,https://analyticsdrift.com/meta-develops-ai-th...,,https://analyticsdrift.com/meta-develops-ai-th...,,https://analyticsdrift.com/meta-develops-ai-th...
3,Elon Musk might Start his own Social Media wit...,https://analyticsdrift.com/elon-musk-might-sta...,News,2022-04-01,Dipayan Mitra,https://t.co/aPS9ycji37,https://analyticsdrift.com/tesla-to-accept-dog...,https://t.co/aPS9ycji37,https://analyticsdrift.com/tesla-to-accept-dog...,,https://analyticsdrift.com/tesla-to-accept-dog...


In [31]:
df.to_csv('analyticsdrift.csv', index=False)