In [8]:
from bs4 import BeautifulSoup
import requests
from urllib.request import urlretrieve

In [9]:
def get_libgen_url(book_name:str):
    """
    Build the libgen.is search URL for a book query.

    Args:
        book_name (str): Free-text search phrase (e.g. title and/or author).

    Returns:
        str: The search.php URL with the phrase URL-encoded into the ``req``
            parameter and the fixed parameters res=100, phrase=1, view=simple,
            column=def, sort=year, sortmode=DESC.
    """
    # Local import keeps this cell runnable on its own.
    from urllib.parse import quote_plus

    # Encode the phrase so characters such as '&', '#', ',' or spaces cannot
    # break or truncate the query string.
    query = quote_plus(book_name)
    url = f"https://libgen.is/search.php?&res=100&req={query}&phrase=1&view=simple&column=def&sort=year&sortmode=DESC"
    return url

In [16]:
def extract_pdf_links(html_content):
    """
    Extract download links for PDF versions of books from HTML table.

    Only rows carrying a ``bgcolor`` attribute are inspected. A row is kept
    when the text of its 9th cell is exactly 'pdf'. For each kept row, the
    10th and 11th cells are scanned in order and the first anchor whose href
    contains "books.ms" is recorded; scanning then stops for that row.

    Args:
        html_content (str): HTML content containing the book table

    Returns:
        list: List of dictionaries with keys 'title' (text of the 3rd cell),
            'extension' (always 'pdf') and 'links' (a list holding at most
            one books.ms URL; empty when no such link was found).
    """
    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Data rows are identified by the presence of a bgcolor attribute
    rows = soup.find_all('tr', bgcolor=True)

    # Store PDF links
    pdf_links = []

    for row in rows:
        # Collect the cells once instead of re-querying the row per column
        cells = row.find_all('td')

        # Rows too short to have an extension column cannot be book rows;
        # skip them instead of raising IndexError
        if len(cells) < 9:
            continue

        # 9th column has the extension
        extension = cells[8].text.strip()

        # Only process PDF files
        if extension != 'pdf':
            continue

        # 3rd column has the title
        title = cells[2].text.strip()

        # 10th and 11th columns have the download links
        links = []
        for link_cell in cells[9:11]:
            a_tag = link_cell.find('a')
            if a_tag and 'href' in a_tag.attrs and "books.ms" in a_tag['href']:
                links.append(a_tag['href'])
                break  # one books.ms mirror per book is enough

        # Add to our results
        pdf_links.append({
            'title': title,
            'extension': extension,
            'links': links
        })

    return pdf_links

In [17]:
def download_books(books_to_download:list):
    """
    Download each listed book as "<title>.pdf" into the working directory.

    For every entry, the first URL in its 'links' list is fetched, the anchor
    whose text is exactly "GET" is located on that page, and the file behind
    its href is retrieved. Entries with no links, or whose page has no usable
    "GET" anchor, are skipped.

    Args:
        books_to_download (list): Dictionaries with a 'title' string and a
            'links' list of page URLs, in the shape returned by
            extract_pdf_links.

    Raises:
        requests.HTTPError: If a mirror page responds with an error status.
    """
    for book in books_to_download:
        # Entries are dicts, so use key access rather than attributes.
        book_links = book.get('links') or []
        if not book_links:
            continue

        # A timeout stops one unresponsive mirror from hanging the whole batch.
        page = requests.get(book_links[0], timeout=30)
        page.raise_for_status()

        soup = BeautifulSoup(page.content, 'html.parser')
        get_anchor = soup.find("a", string="GET")
        if get_anchor is None or not get_anchor.get("href"):
            continue

        # A '/' in the title would be read as a directory separator.
        file_name = book['title'].replace("/", "_") + ".pdf"
        urlretrieve(get_anchor["href"], file_name)
        
        

In [18]:
# Build the search URL for the book we want to look up.
url = get_libgen_url("Cosmos, Carl Sagan")

In [19]:
# Fetch the search results page. The timeout keeps the kernel from hanging on
# an unresponsive server, and raise_for_status surfaces HTTP errors here
# rather than as an empty result list further down.
search_page = requests.get(url, timeout=30)
search_page.raise_for_status()

In [20]:
# Parse the raw response body into a list of PDF entries.
links = extract_pdf_links(html_content=search_page.content)

In [21]:
# Display the extracted PDF entries (title, extension, links).
links

[{'title': 'Cosmos 9789731114712, 9731114718',
  'extension': 'pdf',
  'links': ['http://books.ms/main/2A2963E0CADEED180A0EDA77C4540829',
   'http://libgen.li/ads.php?md5=2A2963E0CADEED180A0EDA77C4540829']},
 {'title': 'Star Stuff: Carl Sagan and the Mysteries of the Cosmos 1596439602, 9781596439603',
  'extension': 'pdf',
  'links': ['http://books.ms/main/EA1AC2464E06E6366A35C628F3DE7A97',
   'http://libgen.li/ads.php?md5=EA1AC2464E06E6366A35C628F3DE7A97']},
 {'title': 'Cosmos (Chinese Edition) 7206072070, 9787206072079',
  'extension': 'pdf',
  'links': ['http://books.ms/main/5E852C295701C0CC19A687F2C1F92859',
   'http://libgen.li/ads.php?md5=5E852C295701C0CC19A687F2C1F92859']},
 {'title': 'Cosmos',
  'extension': 'pdf',
  'links': ['http://books.ms/main/421FC6E79FE0BB58D0AAA4C476AA9057',
   'http://libgen.li/ads.php?md5=421FC6E79FE0BB58D0AAA4C476AA9057']},
 {'title': 'Cosmos 9780345331359, 0345331354',
  'extension': 'pdf',
  'links': ['http://books.ms/main/494AF965C8F206D07ACECFA10

In [1]:
from urllib.request import urlretrieve
import requests

In [29]:
# Retrieve the file behind get_link and save it as physics_of_the_impossible.pdf.
# NOTE(review): get_link is only assigned in a later cell of this notebook, so
# this cell fails on a fresh kernel when cells run top to bottom. The get_link
# value recorded later points to a "Cosmos" PDF, so this file name may be
# stale — confirm.
urlretrieve(get_link, "physics_of_the_impossible" + ".pdf")

('physics_of_the_impossible.pdf', <http.client.HTTPMessage at 0x7b88ff708320>)

In [18]:
# Show the first recorded link of the first extracted entry.
# NOTE(review): the recorded output below does not match the links[0] entry
# displayed earlier in the notebook; it appears to come from a different
# search run — re-run to confirm.
links[0]['links'][0]

'http://books.ms/main/18030181E6D4F40D8FED29D77979FA04'

In [2]:
# Fetch the mirror page for one book. The timeout keeps the kernel from
# hanging on an unresponsive mirror, and raise_for_status makes an HTTP error
# fail here instead of producing a page with no "GET" anchor.
d_page = requests.get('http://books.ms/main/421FC6E79FE0BB58D0AAA4C476AA9057', timeout=30)
d_page.raise_for_status()

In [5]:
# Parse the mirror page with the built-in html.parser backend.
soup = BeautifulSoup(d_page.content, features='html.parser')

In [6]:
# The direct download URL is the href of the anchor whose text is exactly "GET".
get_link = soup.find(name="a", string="GET")["href"]

In [7]:
# Display the direct download URL taken from the mirror page.
get_link

'https://download.books.ms/main/1157000/421fc6e79fe0bb58d0aaa4c476aa9057/Carl%20Sagan%20-%20Cosmos-Gradiva%20%282001%29.pdf'