## The pipeline:
- Fetch the input data which is available from GSoC and parse it into a pandas dataframe.
- Classify the ideas url into Google Doc, GitHub Issues, single web page, and others.
- Using one of the three methods, classify the webpages type into single webpage that contain the ideas or a webpage with links to the ideas details.
- extract the ideas details for each type

In [2]:
import requests
import pandas as pd
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
from difflib import SequenceMatcher
from requests_html import HTMLSession
import warnings
import os

warnings.filterwarnings("ignore")



In [5]:
# Fetch the input data which is available from GSoC and parse it into a pandas dataframe.
def process_organizations(organizations):
    data = []
    for org in organizations:
        organization_data = {
            'name': org.get('name', ''),
            'description': org.get('description', ''),
            'tagline': org.get('tagline', ''),
            'website_url': org.get('website_url', ''),
            'source_code': org.get('source_code', ''),
            'ideas_link': org.get('ideas_link', ''),
            'contributor_guidance_url': org.get('contributor_guidance_url', ''),
            'license': org.get('license', ''),
            'logo_url': org.get('logo_url', ''),
            'categories': ', '.join(org.get('categories', [])),
            'tech_tags': ', '.join(org.get('tech_tags', [])),
            'topic_tags': ', '.join(org.get('topic_tags', [])),
            'contact_links': ', '.join([link['value'] for link in org.get('contact_links', [])]),
            'direct_comm_methods': ', '.join([method['value'] for method in org.get('direct_comm_methods', [])]),
            'social_comm_methods': ', '.join([method['value'] for method in org.get('social_comm_methods', [])])
        }
        data.append(organization_data)
    return pd.DataFrame(data)

# URL for GSoC 2024 organizations
url = "https://summerofcode.withgoogle.com/api/program/2024/organizations/"
response = requests.get(url)
if response.status_code == 200:
    organizations = response.json()
else:
    raise requests.exceptions.RequestException("Failed to fetch data")

gsoc_df = process_organizations(organizations)

gsoc_df.head()

In [6]:
gsoc_df.head()

Unnamed: 0,name,description,tagline,website_url,source_code,ideas_link,contributor_guidance_url,license,logo_url,categories,tech_tags,topic_tags,contact_links,direct_comm_methods,social_comm_methods
0,mlpack,"mlpack is an intuitive, fast, and flexible C++...","a fast, flexible machine learning library",https://www.mlpack.org,https://github.com/mlpack/mlpack,https://github.com/mlpack/mlpack/wiki/SummerOf...,https://github.com/mlpack/mlpack/wiki/Google-S...,BSD-3-Clause,https://summerofcode.withgoogle.com/media/org/...,"Science and medicine, Artificial Intelligence","c++, C++ template metaprogramming","machine learning, embedded, ai, deep learning,...",https://www.mlpack.org/community.html#page-cha...,https://www.mlpack.org/community.html#page-cha...,https://www.mlpack.org/
1,Uramaki LAB,The RUXAILAB is an open-source organization de...,The User Experience LAB based on IA,https://github.com/ruxailab,https://github.com/ruxailab,https://github.com/ruxailab/gsoc/blob/main/ide...,https://github.com/ruxailab/gsoc/tree/main,MIT,https://summerofcode.withgoogle.com/media/org/...,"End user applications, Artificial Intelligence","python, javascript, html, css, Firebase","Usability, User Evaluation, Heuristic Evaluati...","https://github.com/ruxailab/RUXAILAB/issues, u...","https://github.com/ruxailab/RUXAILAB/issues, u...",https://github.com/ruxailab/RUXAILAB/discussions
2,DBpedia,DBpedia is a crowd-sourced community effort to...,Global and Unified Access to Knowledge Graphs.,https://www.dbpedia.org/,https://github.com/dbpedia/,https://forum.dbpedia.org/tag/gsoc2024-ideas,https://docs.google.com/document/d/e/2PACX-1vQ...,Apache-2.0,https://summerofcode.withgoogle.com/media/org/...,"Science and medicine, Data","python, javascript, java, scala, rdf","semantic web, linked data, knowledge graph, da...","https://dbpedia.slack.com/, https://forum.dbpe...","https://dbpedia.slack.com/, https://forum.dbpe...","https://www.dbpedia.org/blog/, https://twitter..."
3,ArduPilot,ArduPilot is the world's most widely used open...,World's most advanced autonomous vehicle software,https://ardupilot.org/,https://github.com/ArduPilot,https://ardupilot.org/dev/docs/gsoc-ideas-list...,https://ardupilot.org/dev/docs/gsoc.html,GPL-3.0,https://summerofcode.withgoogle.com/media/org/...,"Science and medicine, End user applications","python, lua, c++, pixhawk","robotics, Drone, autonomous vehicle, UGV, unma...",https://discord.com/channels/67403967856286106...,https://discord.com/channels/67403967856286106...,https://www.facebook.com/groups/ArduPilot.org/...
4,libvirt,Libvirt is a library and toolkit providing abs...,Virtualization abstraction library,https://libvirt.org/,https://gitlab.com/libvirt/libvirt,https://wiki.libvirt.org/Google_Summer_of_Code...,https://libvirt.org/hacking.html,LGPL-2.1,https://summerofcode.withgoogle.com/media/org/...,"End user applications, Infrastructure and cloud","kvm, hypervisor, qemu, lxc","virtualization, library","https://libvirt.org/contact.html#email, https:...","https://libvirt.org/contact.html#email, https:...",


In [4]:
gsoc_df.describe()

Unnamed: 0,name,description,tagline,website_url,source_code,ideas_link,contributor_guidance_url,license,logo_url,categories,tech_tags,topic_tags,contact_links,direct_comm_methods,social_comm_methods
count,195,195,195,195,195,195,184,195,195,195,195,195,195,195,195.0
unique,195,195,195,195,195,195,184,21,195,64,192,195,195,195,178.0
top,Synfig,Synfig is a 2D open-source animation software....,Open-source 2D animation software,https://synfig.org,https://github.com/synfig/synfig,https://synfig-docs-dev.readthedocs.io/en/late...,https://synfig-docs-dev.readthedocs.io/en/late...,Apache-2.0,https://summerofcode.withgoogle.com/media/org/...,"Science and medicine, Data","python, c++","2d/3d graphics, animation, vector graphics",https://forums.synfig.org/t/gsoc-2024-google-s...,https://forums.synfig.org/t/gsoc-2024-google-s...,
freq,1,1,1,1,1,1,1,46,1,13,2,1,1,1,18.0


In [5]:
gsoc_df.to_csv('./sheets/gsoc_organizations.csv', index=False)

In [4]:
gsoc_df_copy = gsoc_df.copy()
# Classify the ideas url into Google Doc, GitHub Issues, single web page, and others.
def classify_ideas_link(ideas_link):
    parsed_url = urlparse(ideas_link)
    if "docs.google.com" in parsed_url.netloc:
        return "Google Doc"
    elif "github.com" in parsed_url.netloc and "/issues" in parsed_url.path:
        return "GitHub Issues"
    elif parsed_url.scheme in ["http", "https"]:
        return "Webpage"
    else:
        return "Other"
    
gsoc_df_copy['ideas_link_type'] = gsoc_df_copy['ideas_link'].apply(classify_ideas_link)

gsoc_df_copy.value_counts('ideas_link_type')

ideas_link_type
Webpage          177
Google Doc        15
GitHub Issues      3
Name: count, dtype: int64

In [4]:
def get_webpage_info(url):
    session = HTMLSession()
    response = session.get(url)
    if response.status_code == 200:
        try:
            # count number of words
            words = response.html.text
            word_count = len(re.findall(r'\w+', words))
            # count number of links
            links = [link for link in response.html.absolute_links]
            # count number of links with the same domain as the ideas_link
            domain = urlparse(url).netloc
            same_domain_links = [link for link in links if urlparse(link).netloc == domain]
            same_domain_link_count = len(same_domain_links)
            return word_count, len(links), same_domain_link_count, links
        except Exception as e:
            print(f"Error parsing HTML: {e}")
            return 0, 0, 0, []
    else:
        print(f"Failed to fetch webpage: {response.status_code}")
        return 0, 0, 0, []

def process_ideas_links(ideas_link):
    if ideas_link and ("http" in ideas_link or "https" in ideas_link):
        word_count, link_count, same_domain_link_count, links = get_webpage_info(ideas_link)
        return word_count, link_count, same_domain_link_count, links
    else:
        print(f"Invalid ideas link: {ideas_link}")
        return 0, 0, 0, []


gsoc_df_copy['ideas_word_count'], gsoc_df_copy['ideas_link_count'], gsoc_df_copy['same_domain_link_count'], gsoc_df_copy['ideas_links'] = zip(*gsoc_df_copy['ideas_link'].apply(process_ideas_links))

gsoc_df_copy[['name', 'ideas_link', 'ideas_link_type', 'ideas_word_count', 'ideas_link_count', 'same_domain_link_count', 'ideas_links']].head()

Failed to fetch webpage: 403


Unnamed: 0,name,ideas_link,ideas_link_type,ideas_word_count,ideas_link_count,same_domain_link_count,ideas_links
0,Synfig,https://synfig-docs-dev.readthedocs.io/en/late...,Webpage,1005,35,24,"[https://www.sphinx-doc.org/, https://synfig-d..."
1,Eclipse Foundation,https://gitlab.eclipse.org/eclipsefdn/emo-team...,Webpage,248,10,7,[https://gitlab.eclipse.org/users/sign_in?redi...
2,Purr Data,https://git.purrdata.net/jwilkes/summer-of-cod...,Webpage,450,23,19,[https://git.purrdata.net/jwilkes/summer-of-co...
3,National Resource for Network Biology (NRNB),https://github.com/nrnb/GoogleSummerOfCode/issues,GitHub Issues,892,163,147,[https://github.com/nrnb/GoogleSummerOfCode/is...
4,PostgreSQL,https://wiki.postgresql.org/wiki/GSoC_2024,Webpage,2298,40,20,"[https://github.com/urfave/cli, https://wiki.p..."


In [6]:
gsoc_df_copy[['name', 'ideas_link', 'ideas_link_type', 'ideas_word_count', 'ideas_link_count', 'same_domain_link_count', 'ideas_links']].sort_values('ideas_word_count', ascending=True).head(10)

Unnamed: 0,name,ideas_link,ideas_link_type,ideas_word_count,ideas_link_count,same_domain_link_count,ideas_links
102,Drupal Association,https://www.drupal.org/project/issues/gsoc?cat...,Webpage,0,0,0,[]
171,DatenLord,https://datenlord.github.io/xline-home/#/GSoC,Webpage,1,0,0,[]
38,BRL-CAD,https://opencax.github.io/project-proposals/,Webpage,38,0,0,[]
99,freifunk,https://projects.freifunk.net,Webpage,76,7,1,"[https://blog.freifunk.net, https://projects.f..."
27,Jitsi,https://github.com/jitsi/gsoc-ideas/blob/maste...,Webpage,104,9,9,[https://github.com/jitsi/gsoc-ideas/blob/mast...
182,NumFOCUS,https://github.com/numfocus/gsoc/blob/master/2...,Webpage,112,26,26,[https://github.com/numfocus/gsoc/blob/master/...
154,QC-Devs,https://qcdevs.org/join/qcdevs_gsoc/#project-i...,Webpage,144,17,7,"[https://jekyllrb.com, https://qcdevs.org/, ht..."
83,HumanAI,https://humanai.foundation/,Webpage,168,14,4,[https://matrix.to/#/#humanai-foundation:gitte...
31,MBDyn,https://public.gitlab.polimi.it/DAER/mbdyn/-/w...,Webpage,228,20,20,[https://public.gitlab.polimi.it/DAER/mbdyn/-/...
112,Machine Learning for Science (ML4SCI),https://ml4sci.org/,Webpage,240,23,11,"[https://ml4sci.org/activities/gsoc2023.html, ..."


In [10]:
gsoc_df_copy.to_csv('./sheets/gsoc_organizations_ideas.csv', index=False)

In [7]:
gsoc_df_exclusions = gsoc_df.copy()
# try with excluding some elements and focus on the main content
def get_webpage_info_excluding_elements(url):
    session = HTMLSession()
    response = session.get(url)
    if response.status_code == 200:
        try:
            soup = BeautifulSoup(response.html.html, 'html.parser')

            # Remove header, nav, footer, and aside elements
            for element in soup.select('header, nav, footer, aside'):
                element.decompose()

            words = soup.get_text()
            word_count = len(re.findall(r'\w+', words))

            # Extract links only from remaining elements
            links = [link.get('href') for link in soup.find_all('a', href=True)]

            # count number of links with the same domain as the ideas_link
            domain = urlparse(url).netloc
            same_domain_links = [link for link in links if urlparse(link).netloc == domain]
            same_domain_link_count = len(same_domain_links)

            return word_count, len(links), same_domain_link_count, links

        except Exception as e:
            print(f"Error parsing HTML: {e}")
            return 0, 0, 0, []
    else:
        print(f"Failed to fetch webpage: {response.status_code}")
        return 0, 0, 0, []
    
def process_ideas_links_excluding_elements(ideas_link):
    if ideas_link and ("http" in ideas_link or "https" in ideas_link):
        word_count, link_count, same_domain_link_count, links = get_webpage_info_excluding_elements(ideas_link)
        return word_count, link_count, same_domain_link_count, links
    else:
        print(f"Invalid ideas link: {ideas_link}")
        return 0, 0, 0, []
    
gsoc_df_exclusions['ideas_word_count'], gsoc_df_exclusions['ideas_link_count'], gsoc_df_exclusions['same_domain_link_count'], gsoc_df_exclusions['ideas_links'] = zip(*gsoc_df_exclusions['ideas_link'].apply(process_ideas_links_excluding_elements))

gsoc_df_exclusions[['name', 'ideas_link', 'ideas_word_count', 'ideas_link_count', 'same_domain_link_count', 'ideas_links']].sort_values('ideas_word_count', ascending=True).head(10)

Failed to fetch webpage: 403


Unnamed: 0,name,ideas_link,ideas_word_count,ideas_link_count,same_domain_link_count,ideas_links
38,BRL-CAD,https://opencax.github.io/project-proposals/,0,0,0,[]
102,Drupal Association,https://www.drupal.org/project/issues/gsoc?cat...,0,0,0,[]
171,DatenLord,https://datenlord.github.io/xline-home/#/GSoC,1,0,0,[]
99,freifunk,https://projects.freifunk.net,8,1,0,[https://github.com/freifunk/projects]
131,OpenAstronomy,https://openastronomy.org/gsoc/gsoc2024/#/proj...,11,0,0,[]
56,Electron,https://electronhq.notion.site/Electron-Google...,13,0,0,[]
31,MBDyn,https://public.gitlab.polimi.it/DAER/mbdyn/-/w...,18,1,1,[https://public.gitlab.polimi.it/10260632]
1,Eclipse Foundation,https://gitlab.eclipse.org/eclipsefdn/emo-team...,24,3,0,"[https://www.eclipse.org/legal/privacy.php, ht..."
43,Open Climate Fix,https://docs.google.com/document/d/1vawwkgROZj...,38,4,1,[https://docs.google.com/document/?usp=docs_we...
189,Zendalona,https://docs.google.com/document/d/1oVDcIq1_Tx...,40,4,1,[https://docs.google.com/document/?usp=docs_we...


In [9]:
gsoc_df_exclusions.to_csv('./sheets/gsoc_organizations_exclusions.csv', index=False)

In [6]:
gsoc_df_diff = gsoc_df.copy()
# try with diff mechanisim
def get_webpage_info_differ_links(ideas_url, main_url):
    try:
        session = HTMLSession()
        ideas_response = session.get(ideas_url, verify=False)
        main_response = session.get(main_url, verify=False)

        if ideas_response.status_code == 200 and main_response.status_code == 200:
            try:
                # Get links from both pages
                ideas_links = set(link for link in ideas_response.html.absolute_links)
                main_links = set(link for link in main_response.html.absolute_links)
                
                # Get the difference between the two sets of links
                unique_ideas_links = ideas_links - main_links

                # Count number of words
                words = ideas_response.html.text
                word_count = len(re.findall(r'\w+', words))

                # Count number of links with the same domain as the ideas_link
                domain = urlparse(ideas_url).netloc
                same_domain_links = [link for link in unique_ideas_links if urlparse(link).netloc == domain]
                same_domain_link_count = len(same_domain_links)
                return word_count, len(unique_ideas_links), same_domain_link_count, list(unique_ideas_links)
            except Exception as e:
                print(f"Error parsing HTML: {e}")
                return 0, 0, 0, []
        else:
            print(f"Failed to fetch webpages: {ideas_response.status_code}, {main_response.status_code}")
            return 0, 0, 0, []
    except Exception as e:
        print(f"Failed to fetch webpages: {e}")
        return 0, 0, 0, []
    
def process_ideas_links_differ_links(row):
    ideas_link = row['ideas_link']
    if ideas_link and ("http" in ideas_link or "https" in ideas_link):
        word_count, link_count, same_domain_link_count, links = get_webpage_info_differ_links(ideas_link, row['website_url'])
        return word_count, link_count, same_domain_link_count, links
    else:
        print(f"Invalid ideas link: {ideas_link}")
        return 0, 0, 0, []
    
gsoc_df_diff['ideas_word_count'], gsoc_df_diff['ideas_link_count'], gsoc_df_diff['same_domain_link_count'], gsoc_df_diff['ideas_links'] = zip(*gsoc_df_diff.apply(process_ideas_links_differ_links, axis=1))

gsoc_df_diff[['name', 'ideas_link', 'ideas_word_count', 'ideas_link_count', 'same_domain_link_count', 'ideas_links']].sort_values('ideas_word_count', ascending=True).head(10)

Failed to fetch webpages: 403, 403
Failed to fetch webpages: 200, 400
Failed to fetch webpages: HTTPSConnectionPool(host='datenlord.io', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)')))


Unnamed: 0,name,ideas_link,ideas_word_count,ideas_link_count,same_domain_link_count,ideas_links
162,Micro Electronics Research Lab - UITU,https://github.com/merledu/Google-Summer-of-Co...,0,0,0,[]
102,Drupal Association,https://www.drupal.org/project/issues/gsoc?cat...,0,0,0,[]
171,DatenLord,https://datenlord.github.io/xline-home/#/GSoC,0,0,0,[]
38,BRL-CAD,https://opencax.github.io/project-proposals/,38,0,0,[]
99,freifunk,https://projects.freifunk.net,76,6,1,[https://github.com/freifunk/MoU/blob/master/F...
27,Jitsi,https://github.com/jitsi/gsoc-ideas/blob/maste...,104,9,9,[https://github.com/jitsi/gsoc-ideas/blob/mast...
182,NumFOCUS,https://github.com/numfocus/gsoc/blob/master/2...,112,26,26,[https://github.com/numfocus/gsoc/blob/master/...
154,QC-Devs,https://qcdevs.org/join/qcdevs_gsoc/#project-i...,144,8,1,"[https://github.com/theochem/AtomDB/issues/21,..."
83,HumanAI,https://humanai.foundation/,168,0,0,[]
31,MBDyn,https://public.gitlab.polimi.it/DAER/mbdyn/-/w...,228,19,19,[https://public.gitlab.polimi.it/DAER/mbdyn/-/...


In [7]:
gsoc_df_diff.to_csv('./sheets/gsoc_organizations_diff.csv', index=False)

In [8]:
gsoc_df_diff.describe(include='all')

Unnamed: 0,name,description,tagline,website_url,source_code,ideas_link,contributor_guidance_url,license,logo_url,categories,tech_tags,topic_tags,contact_links,direct_comm_methods,social_comm_methods,ideas_word_count,ideas_link_count,same_domain_link_count,ideas_links
count,195,195,195,195,195,195,184,195,195,195,195,195,195,195,195.0,195.0,195.0,195.0,195
unique,195,195,195,195,195,195,184,21,195,64,192,195,195,195,178.0,,,,187
top,Synfig,Synfig is a 2D open-source animation software....,Open-source 2D animation software,https://synfig.org,https://github.com/synfig/synfig,https://synfig-docs-dev.readthedocs.io/en/late...,https://synfig-docs-dev.readthedocs.io/en/late...,Apache-2.0,https://summerofcode.withgoogle.com/media/org/...,"Science and medicine, Data","python, c++","2d/3d graphics, animation, vector graphics",https://forums.synfig.org/t/gsoc-2024-google-s...,https://forums.synfig.org/t/gsoc-2024-google-s...,,,,,[]
freq,1,1,1,1,1,1,1,46,1,13,2,1,1,1,18.0,,,,9
mean,,,,,,,,,,,,,,,,10369.230769,82.061538,58.415385,
std,,,,,,,,,,,,,,,,24087.158829,137.598498,129.954737,
min,,,,,,,,,,,,,,,,0.0,0.0,0.0,
25%,,,,,,,,,,,,,,,,1192.5,19.0,2.0,
50%,,,,,,,,,,,,,,,,2348.0,42.0,25.0,
75%,,,,,,,,,,,,,,,,5103.5,101.0,64.5,


In [20]:

# List to store failed links
failed_links = []

def download_google_docs_as_pdf(data, directory):
    # Create the directory if it does not exist
    if not os.path.exists(directory):
        os.makedirs(directory)

    for i in data.index:
        url, name = data.loc[i]
        # Extract the document ID from the URL
        doc_id = url.split("/d/")[1].split("/")[0]

        # Determine whether the document is a Google Doc or a Google Slide
        if "document" in url:
            # Check if the document is published to the web
            if "/pub" in url:
                export_url = f"https://docs.google.com/document/d/e/{doc_id}/pub?export=pdf"
            else:
                export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=pdf"
        elif "presentation" in url:
            export_url = f"https://docs.google.com/presentation/d/{doc_id}/export/pdf"
        else:
            print(f"Unsupported URL format: {url}")
            failed_links.append(url)
            continue

        # Send a GET request to the export URL
        response = requests.get(export_url)

        # Check if the request was successful
        if response.status_code == 200:
            # Write the content to a PDF file
            file_name = os.path.join(directory, f'{name}_{i + 1}.pdf')
            with open(file_name, 'wb') as file:
                file.write(response.content)
            print(f'Downloaded: {file_name}')
        else:
            print(f'Failed to download document with URL: {url}')
            failed_links.append(url)


data = gsoc_df_copy[['ideas_link', 'name']][gsoc_df_copy['ideas_link_type'] == "Google Doc"]
directory = "./pdfs"
# Download the Google Docs as PDF files
download_google_docs_as_pdf(data, directory)

# Print the failed links
print("Failed to download the following links:")
for link in failed_links:
    print(link)


Downloaded: ./pdfs/Robolectric_4.pdf
Downloaded: ./pdfs/Zendalona_5.pdf
Downloaded: ./pdfs/webpack_27.pdf
Failed to download document with URL: https://docs.google.com/document/d/e/2PACX-1vQRqIN0nL7NP13agF9fJiaaAN9o_TuHDldwKw9XvjJR2Cu_kHy0I6Htz6nb_5IGIRpCYeMZeoyDsOwG/pub
Downloaded: ./pdfs/Mautic_55.pdf
Downloaded: ./pdfs/INCF_57.pdf
Failed to download document with URL: https://docs.google.com/document/d/1vWnJhxWJU4oNsZNheKrP6sx5ZPkOzumwdnL6IIRbDP4/edit?usp=sharing
Failed to download document with URL: https://docs.google.com/document/d/e/2PACX-1vSdJAq5vzu2JOiB_nmrLtOMItZ0LPa4botgyr7RPLziNJ888anpfV6no12Vw8QHSFxHp5nsIazbQF5N/pub
Downloaded: ./pdfs/Learning Equality_81.pdf
Downloaded: ./pdfs/Open Climate Fix_99.pdf
Downloaded: ./pdfs/AnkiDroid_108.pdf
Downloaded: ./pdfs/Internet Archive_118.pdf
Downloaded: ./pdfs/Global Alliance for Genomics and Health_122.pdf
Downloaded: ./pdfs/Chromium_125.pdf
Downloaded: ./pdfs/FOSSASIA_141.pdf
Failed to download the following links:
https://docs.goo

In [26]:
# for the failed links, parse the content as html and extract the text and save it into text file. this is just for the failed links 
# and name the file with the first word in the title of the page
from bs4 import BeautifulSoup
import requests

def extract_text_from_html(url):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract text from the HTML, removing extra whitespace and newlines
        text = '\n'.join([line.strip() for line in soup.get_text().splitlines() if line.strip()])

        return text
    else:
        print(f'Failed to fetch the webpage: {response.status_code}')
        return None

# Example usage
for i,url in enumerate(failed_links):
    text = extract_text_from_html(url)
    if text:
        with open(f'page_content{i}.txt', 'w', encoding='utf-8') as file:
            file.write(text)
        print('Text extracted and saved to page_content.txt')



Text extracted and saved to page_content.txt
Text extracted and saved to page_content.txt
Text extracted and saved to page_content.txt
