In [1]:
from pathlib import Path

In [2]:
def load_urls_from_file(file_path):
    """Read the unique, non-comment entries from a text file.

    Each line is stripped of surrounding whitespace. Blank lines and lines
    whose first non-blank character is ``#`` are ignored.

    Args:
        file_path: Path to the input file (``str`` or ``Path``).

    Returns:
        set[str]: The distinct stripped lines that remain.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    urls = set()
    with Path(file_path).open('r') as file:
        for line in file:
            entry = line.strip()
            # Test the *stripped* text so that indented "#" comments are
            # skipped as well instead of being loaded as URLs.
            if entry and not entry.startswith('#'):
                urls.add(entry)
    return urls

# Load the candidate URLs. The path is relative, so it is resolved against
# the directory the notebook kernel is running in.
input_file_path = '../input.txt'
# `urls` is a notebook-global set at this point; later cells read and rebind it.
urls = load_urls_from_file(input_file_path)
print(f"Number of unique URLs loaded from input file: {len(urls)}")

Number of unique URLs loaded from input file: 301


In [3]:
from pathlib import Path
import re

def get_existing_ids(output_dir):
    """Collect the numeric IDs embedded in the file names of ``output_dir``.

    A name contributes an ID when it contains a ``.t<digits>.`` segment; the
    digits are the ID. If a name contains several such segments, the last one
    wins (the leading ``.*`` is greedy). Entries whose name starts with ``.``
    are ignored.

    Args:
        output_dir: Directory to scan (``str`` or ``Path``).

    Returns:
        set[str]: The IDs found, as strings. Empty when the directory does
        not exist yet, i.e. nothing has been saved so far.
    """
    file_pattern = re.compile(r'.*\.t(\d+)\..*')
    directory = Path(output_dir)

    # On a first run the output directory has not been created yet; treat
    # that as "no existing files" rather than failing with FileNotFoundError.
    if not directory.exists():
        return set()

    existing_ids = set()
    for file in directory.iterdir():
        if file.name.startswith('.'):
            continue  # skip dotfiles
        match = file_pattern.match(file.name)
        if match:
            existing_ids.add(match.group(1))
    return existing_ids

# Scan the download directory (relative to the kernel's working directory)
# for IDs that are already present.
output_dir = Path('../output')
existing_ids = get_existing_ids(output_dir)
# The value printed is the number of distinct IDs found, which can be lower
# than the number of files if several file names carry the same ID.
print(f"Number of existing files: {len(existing_ids)}")

Number of existing files: 200


In [4]:
import re

def filter_urls(urls, existing_ids=None):
    """Return the URLs whose topic ID has not been downloaded yet.

    The ID is the digit run of the URL's ``t=<digits>`` parameter. URLs
    without such a parameter are dropped. Duplicate URLs are collapsed.

    Args:
        urls: Iterable of URL strings.
        existing_ids: Optional collection of ID strings to exclude. When
            ``None``, every URL that carries an ID is kept.

    Returns:
        list[str]: The distinct URLs that remain, in arbitrary order.
    """
    # ``\b`` anchors on the standalone ``t`` parameter. The previous pattern
    # (``.*t=(\d+).*`` with ``match``) was greedy and picked the LAST
    # ``t=<digits>`` anywhere in the URL, so ``...?t=123&start=30`` yielded
    # the ID ``30`` and already-downloaded topics slipped through.
    url_pattern = re.compile(r'\bt=(\d+)')

    filtered_urls = set()
    for url in urls:
        match = url_pattern.search(url)
        if match is None:
            continue  # no recognisable ID in this URL
        if existing_ids is None or match.group(1) not in existing_ids:
            filtered_urls.add(url)

    return list(filtered_urls)

# Keep only the URLs whose ID is not among `existing_ids`.
# NOTE: this rebinds the notebook-global `urls` (a set loaded from the input
# file) to the list returned by filter_urls; the download cell iterates it.
urls = filter_urls(urls, existing_ids)

print(f"Number of unique URLs to be processed: {len(urls)}")


Number of unique URLs to be processed: 101


In [5]:
# Guard cell: an empty `urls` is falsy, so this raises AssertionError and
# halts a "Run All" before the browser session is started.
assert urls

In [6]:
from playwright.async_api import async_playwright
from dotenv import load_dotenv
import os
import random
import asyncio
import time
from tqdm.notebook import tqdm

# Populate os.environ from a .env file before reading the settings below.
# NOTE(review): load_dotenv() by default does not override variables that are
# already set in the environment, and USERNAME is commonly predefined by the
# OS (e.g. on Windows) — confirm the .env value is the one actually used.
load_dotenv()
# Required settings: a missing variable raises KeyError right here.
DOMAIN = os.environ['DOMAIN']
USERNAME = os.environ['USERNAME']
PASSWORD = os.environ['PASSWORD']

# Not referenced by any code visible in this notebook.
DETAILED_LOGGING = True

class DailyDownloadLimitReached(Exception):
    """Signals that the daily download limit has been reached."""

async def random_wait(min_seconds=0.2, max_seconds=1.4):
    """Suspend the calling coroutine for a randomly chosen duration.

    Args:
        min_seconds: One bound of the delay, in seconds.
        max_seconds: The other bound of the delay, in seconds.

    The delay is drawn uniformly between the two bounds on every call.
    """
    await asyncio.sleep(random.uniform(min_seconds, max_seconds))

async def login(page):
    """Sign in through the login form on the forum index page.

    Opens ``https://{DOMAIN}/forum/index.php``, fills the username and
    password inputs from the module-level ``USERNAME`` / ``PASSWORD`` and
    clicks the submit button, pausing for a random interval after each step.

    Args:
        page: Playwright page used to drive the browser.
    """
    await page.goto(f"https://{DOMAIN}/forum/index.php")
    # Longer pause after the initial navigation than between the other steps.
    await random_wait(min_seconds=3, max_seconds=4)

    form_values = (
        ('input[name="login_username"]', USERNAME),
        ('input[name="login_password"]', PASSWORD),
    )
    for selector, value in form_values:
        await page.fill(selector, value)
        await random_wait()

    await page.click('input[type="submit"][name="login"][value="Вход"]')
    await random_wait()

async with async_playwright() as p:
    browser = await p.chromium.launch(headless=False)
    page = await browser.new_page()
    await login(page)
    await asyncio.sleep(5)
    
    try:
        for url in tqdm(urls, desc="Downloading files", unit="file"):
            url = url.replace('.cc', '.net')
            await page.goto(url, wait_until='domcontentloaded')
            await random_wait(0.5, 3)
            
            try:
                download_link = page.locator('a.dl-stub.dl-link')
                
                async with page.expect_download() as download_info:
                    await download_link.click()
                    await random_wait(1, 2)
                    if await page.locator('text="Вы уже исчерпали суточный лимит скачиваний торрент-файлов"').count() > 0:
                        raise DailyDownloadLimitReached("Daily download limit reached")
                    
                download = await download_info.value

                output_dir = Path('../output')
                output_dir.mkdir(parents=True, exist_ok=True)

                file_path = output_dir / download.suggested_filename
                await download.save_as(file_path)
                
                id = url.split('/')[-1]
                await random_wait()
            except DailyDownloadLimitReached:
                raise
            except Exception as e:
                id_ = url.split('/')[-1]
                print(f"Error occurred while processing ID {id_}: {str(e)}")
            
            await random_wait()
    except DailyDownloadLimitReached:
        print("Daily limit reached, stopping.")
    finally:
        time.sleep(3)
        await browser.close()


Downloading files:   0%|          | 0/101 [00:00<?, ?file/s]

Daily limit reached, stopping.
