In [1]:
from pathlib import Path

# Read the URL list: one entry per line, surrounding whitespace removed,
# blank lines dropped.
input_file = Path('../input.txt')
with input_file.open('r') as file:
    urls = list(filter(None, map(str.strip, file)))
print(f"Number of URLs loaded from input file: {len(urls)}")

Number of URLs loaded from input file: 1062


In [2]:
from pathlib import Path
import re

# Collect the IDs of files that are already present in the output directory.
output_dir = Path('../output')
# Captures the digits of a ".t<digits>." segment in a file name.
file_pattern = re.compile(r'.*\.t(\d+)\..*')

existing_ids = set()
# The output directory is only created by the download cell, so on a first
# run it may not exist yet; iterdir() would raise FileNotFoundError. A missing
# directory simply means nothing has been downloaded so far.
if output_dir.is_dir():
    for file in output_dir.iterdir():
        if file.name.startswith('.'):
            continue  # ignore hidden (dot-prefixed) entries
        match = file_pattern.match(file.name)
        if match:
            existing_ids.add(match.group(1))

print(f"Number of existing files: {len(existing_ids)}")

Number of existing files: 151


In [3]:
import re

# Regex that extracts the numeric ID from a `t=<digits>` parameter in the URL.
# The lazy prefix and the lookbehind make it pick the first standalone `t`
# parameter. A greedy `.*t=(\d+)` latches onto the LAST "t=" in the string, so
# a URL such as "...?t=123&start=30" would yield "30" (from "start=30").
url_pattern = re.compile(r'.*?(?<![A-Za-z0-9_])t=(\d+).*')

# Filter out URLs that have already been downloaded and remove duplicates.
# Keying by ID (rather than by URL text) also collapses different spellings of
# the same ID, and a dict keeps the order of the input file, unlike a set.
pending_by_id = {}
for url in urls:
    match = url_pattern.match(url)
    if match:
        url_id = match.group(1)
        if url_id not in existing_ids:
            # Keep the first URL seen for each ID.
            pending_by_id.setdefault(url_id, url)

# Update the urls variable with the filtered list
urls = list(pending_by_id.values())

print(f"Number of unique URLs to be processed: {len(urls)}")


Number of unique URLs to be processed: 788


In [4]:
# Stop here when nothing is left to process, so the browser is never launched
# for an empty work list.
assert urls, "No URLs left to process: the filtered URL list is empty."

In [5]:
from playwright.async_api import async_playwright
from dotenv import load_dotenv
import os
import random
import asyncio
import time
from tqdm.notebook import tqdm
import aiofiles

# Load credentials from the .env file. By default load_dotenv() leaves
# variables that already exist in the process environment untouched, and
# USERNAME is commonly predefined by the operating system (always on Windows),
# which would silently replace the forum login with the OS account name.
# override=True makes the .env values win.
load_dotenv(override=True)
# Each lookup raises KeyError when the variable is missing, so a misconfigured
# environment fails here rather than during login.
DOMAIN = os.environ['DOMAIN']
USERNAME = os.environ['USERNAME']
PASSWORD = os.environ['PASSWORD']

class DailyDownloadLimitReached(Exception):
    """Signals that the daily download limit has been hit and downloading must stop."""

async def random_wait(min_seconds=0.2, max_seconds=1.4):
    """
    Suspend the calling coroutine for a randomly chosen duration.

    The delay is drawn with random.uniform from the range spanned by
    min_seconds and max_seconds, then awaited via asyncio.sleep so other
    tasks keep running in the meantime.
    """
    await asyncio.sleep(random.uniform(min_seconds, max_seconds))


async def login(page):
    """
    Log in through the forum index page using the configured credentials.

    Opens the index page on DOMAIN, fills the username and password inputs,
    and clicks the login submit button, pausing for a random interval after
    every step.

    Args:
        page: the browser page to drive; it must provide awaitable
            goto(), fill() and click() methods.
    """
    await page.goto(f"https://{DOMAIN}/forum/index.php")
    # Longer pause after the initial page load than between form actions.
    await random_wait(min_seconds=3, max_seconds=4)

    form_fields = (
        ('input[name="login_username"]', USERNAME),
        ('input[name="login_password"]', PASSWORD),
    )
    for selector, value in form_fields:
        await page.fill(selector, value)
        await random_wait()

    await page.click('input[type="submit"][name="login"][value="Вход"]')
    await random_wait()

async with async_playwright() as p:
    browser = await p.chromium.launch(headless=False)
    page = await browser.new_page()
    await login(page)
    
    try:
        for url in tqdm(urls, desc="Downloading files", unit="file"):
            await page.goto(url)
            await random_wait(0.5, 3)
            
            try:
                download_link = page.locator('a.dl-stub.dl-link')
                
                # Start waiting for the download
                async with page.expect_download() as download_info:
                    await download_link.click()
                    await random_wait(1, 2)
                    # Check if the daily download limit has been reached after clicking
                    if await page.locator('text="Вы уже исчерпали суточный лимит скачиваний торрент-файлов"').count() > 0:
                        raise DailyDownloadLimitReached("Daily download limit reached")
                    
                download = await download_info.value

                # Create output directory if it doesn't exist
                output_dir = Path('../output')
                output_dir.mkdir(parents=True, exist_ok=True)

                # Wait for the download process to complete and save the downloaded file
                file_path = output_dir / download.suggested_filename
                await download.save_as(file_path)
                
                id = url.split('/')[-1]
                await random_wait()
            except DailyDownloadLimitReached as e:
                raise  # Re-raise the exception to stop the entire program
            except Exception as e:
                id = url.split('/')[-1]
                tqdm.write(f"Failed to download file with ID: {id}. Error: {str(e)}")
            
            await random_wait()
    except DailyDownloadLimitReached:
        print("Daily download limit reached. Stopping the program.")
    finally:
        time.sleep(3)
        await browser.close()



Downloading files:   0%|          | 0/788 [00:00<?, ?file/s]

Daily download limit reached. Stopping the program.
