# In [79]:
import csv
import re
from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup

# Validate Wikipedia URL
def validate_wiki_link(link):
    """Check that *link* looks like an English Wikiwand article URL.

    Despite the "Wikipedia" wording of the error message, only URLs of the
    form ``http(s)://[www.]wikiwand.com/en/<something-without-spaces>`` are
    accepted.

    Args:
        link: The URL string to check.

    Returns:
        None when the link is acceptable.

    Raises:
        ValueError: If *link* does not match the expected URL shape.
    """
    wikiwand_pattern = r'^https?://(www\.)?wikiwand\.com/en/[^ ]+$'
    if re.match(wikiwand_pattern, link) is None:
        raise ValueError("The provided link is not a valid Wikipedia link.")

# Extract Wikipedia links from the page
def extract_wiki_links(url, visited_links, *, max_links=10, timeout=10):
    """Download *url* and return new links found on that page.

    Every ``<a>`` tag with an ``href`` is considered in document order.
    Hrefs containing a ``:`` are ignored, which skips anything carrying a
    scheme (absolute URLs, ``mailto:``) as well as any path with a colon in
    it. The remaining hrefs are resolved against *url*, have their fragment
    removed, and are kept if they have not been seen before.

    Args:
        url: Address of the page to fetch.
        visited_links: Set of URLs collected so far. It is updated in place
            with every link this call returns.
        max_links: Maximum number of links to return for this page.
        timeout: Seconds to wait for the server before ``requests`` raises.

    Returns:
        A list of at most *max_links* absolute URLs, in document order.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    # Without a timeout a stalled server would block the crawl forever.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    page_url = urldefrag(url).url
    wiki_links = []

    for anchor in soup.find_all('a', href=True):
        if len(wiki_links) >= max_links:
            break

        href = anchor['href']
        if ':' in href:
            continue

        # Resolve the href the way a browser would. Plain concatenation
        # (url + href) turned "/en/Ottawa" on ".../en/Canada" into
        # ".../en/Canada/en/Ottawa". Dropping the fragment makes
        # "Page#Section" and "Page" count as the same page.
        full_url = urldefrag(urljoin(url, href)).url

        # Skip links back to the page itself (e.g. "#History" or "").
        if full_url == page_url or full_url in visited_links:
            continue

        wiki_links.append(full_url)
        visited_links.add(full_url)

    return wiki_links

# Scrape the Wikipedia links up to n cycles
def scrape_wiki_links(start_link, n):
    """Collect links breadth-first from *start_link* for *n* cycles.

    The first cycle extracts links from *start_link* itself; each later
    cycle extracts links from every page found in the previous cycle.

    Args:
        start_link: URL to start from; checked with ``validate_wiki_link``.
        n: Number of cycles. The first extraction always runs, and
            ``n - 1`` further cycles follow it.

    Returns:
        A list of every link returned by ``extract_wiki_links``, in the
        order the links were discovered.

    Raises:
        ValueError: If *start_link* fails validation.
    """
    validate_wiki_link(start_link)

    seen = set()

    # Cycle 1: links found directly on the starting page.
    frontier = extract_wiki_links(start_link, seen)
    collected = list(frontier)

    # Remaining cycles: expand every page discovered in the previous cycle.
    for _ in range(n - 1):
        discovered = []
        for page in frontier:
            discovered += extract_wiki_links(page, seen)
        collected += discovered
        frontier = discovered

    return collected

# Save results to a CSV file
def save_to_csv(data, filename):
    """Write *data* to *filename* as a one-column CSV file.

    The file is overwritten. Its first row is the header ``Link`` and each
    item of *data* follows on its own row.

    Args:
        data: Iterable of values to write, one per row.
        filename: Path of the CSV file to create.
    """
    with open(filename, mode='w', newline='', encoding='utf-8') as out:
        rows = csv.writer(out)
        rows.writerow(['Link'])
        rows.writerows([entry] for entry in data)


# Main function
def main():
    """Prompt for a start link and cycle count, crawl, and save the result.

    Reads the start link and the number of cycles (1-3) from standard
    input, collects links with ``scrape_wiki_links``, prints the total and
    unique counts, and writes the links to ``wiki_links.csv``. Invalid
    input and network failures are reported with a message instead of a
    traceback.
    """
    # Pasted links often carry stray whitespace, which the validator rejects.
    start_link = input("Enter a valid Wikipedia link: ").strip()
    try:
        n = int(input("Enter the number of cycles (1-3): "))
        if not 1 <= n <= 3:
            raise ValueError("The number of cycles must be between 1 and 3.")
    except ValueError as e:
        print(e)
        return

    try:
        all_links = scrape_wiki_links(start_link, n)
    except ValueError as e:
        # Raised when the start link fails validation.
        print(e)
        return
    except requests.RequestException as e:
        # Connection errors, timeouts, etc. while fetching a page.
        print(f"Failed to fetch links: {e}")
        return

    print(f"Total links found: {len(all_links)}")
    print(f"Unique links found: {len(set(all_links))}")

    save_to_csv(all_links, 'wiki_links.csv')
    print("Links have been saved to wiki_links.csv")

# Run the interactive crawler only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


# Example session:
#   Enter a valid Wikipedia link: https://www.wikiwand.com/en/Canada
#   Enter the number of cycles (1-3): 2
#   Total links found: 110
#   Unique links found: 110
#   Links have been saved to wiki_links.csv
