<a href="https://colab.research.google.com/github/AvanBoeckel/extract-working-links/blob/main/Extract_working_links.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import requests


In [17]:
def extract_links_and_dois(text):
    url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    doi_pattern = r'\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b'
    urls = re.findall(url_pattern, text, re.IGNORECASE)
    dois = re.findall(doi_pattern, text, re.IGNORECASE)
    doi_urls = [f"https://doi.org/{doi}" for doi in dois]
    return list(set(urls + doi_urls))

def check_link_availability(url):
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/114.0.0.0 Safari/537.36"
        )
    }

    try:
        with requests.Session() as session:
            response = session.get(url, headers=headers, allow_redirects=False, timeout=5)
            status_code = response.status_code
            if 200 <= status_code < 400:
                redirect_info = response.headers.get("Location")
                return True, f"Status: {status_code}" + (f", Redirect: {redirect_info}" if redirect_info else "")
            else:
                return False, f"Status: {status_code}"
    except requests.exceptions.RequestException as e:
        return False, f"RequestException: {type(e).__name__} - {e}"

def main():
    filepath = "file.txt"

    with open(filepath, "r", encoding="utf-8") as file:
        content = file.read()

    links = extract_links_and_dois(content)
    results = {}

    for link in links:
        status, detail = check_link_availability(link)
        results[link] = (status, detail)

    print("\n📄 Results:\n")
    for link, (status, detail) in results.items():
        status_str = "✅ Available" if status else "❌ Unavailable"
        print(f"{link}\n→ {status_str} ({detail})\n")

if __name__ == "__main__":
    main()


📄 Results:

https://doi.org/,
→ ❌ Unavailable (Status: 400)

https://doi.org/
→ ✅ Available (Status: 301, Redirect: https://www.doi.org/)

https://doi.org/10.11648/j.pbs.20160506.13
→ ❌ Unavailable (Status: 404)

https://doi.org/10.57539/telesjournal.44.0_101
→ ✅ Available (Status: 302, Redirect: https://www.jstage.jst.go.jp/article/telesjournal/44/0/44_440009/_article)

https://doi.org/10.1177/…
→ ❌ Unavailable (Status: 404)

https://doi.org/10.1080/13670050.2016.1248373
→ ✅ Available (Status: 302, Redirect: https://www.tandfonline.com/doi/full/10.1080/13670050.2016.1248373)

https://doi.org/10.1080/01434632.2020.1749644
→ ✅ Available (Status: 302, Redirect: https://www.tandfonline.com/doi/full/10.1080/01434632.2020.1749644)

https://doi.org/10.3389/fpsyg.2024.1376076
→ ✅ Available (Status: 302, Redirect: https://www.frontiersin.org/articles/10.3389/fpsyg.2024.1376076/full)

