<a href="https://colab.research.google.com/github/AmrRagab0/-SAAT-Semi-Automated-Annotation-Technique-/blob/main/Scraping_for_matches.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook scrapes websites such as YouTube and https://www.sport-video.org.ua/football1.html for matches and automates the process of downloading them

In [None]:
import os
from urllib.parse import unquote, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Root of the site being scraped; listing-page URLs are built from it
base_url = 'https://www.sport-video.org.ua'

# Request headers carrying a desktop Chrome User-Agent string
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

# Function to extract match links from a football page
def get_match_links(page_url, timeout=30):
    """Collect absolute URLs of candidate match pages linked from a listing page.

    Args:
        page_url: URL of the football listing page to fetch.
        timeout: Seconds to wait for the server. requests applies no timeout
            by default, so without one a stalled connection would block the
            notebook indefinitely.

    Returns:
        A list of absolute URLs, in page order and without duplicates, one
        for every anchor whose href contains 'html'. The list is empty when
        the server answers with an HTTP error status.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    response = requests.get(page_url, headers=headers, timeout=timeout)
    if not response.ok:
        # Do not scrape links out of an error page
        print(f'Skipping {page_url}: HTTP {response.status_code}')
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # dict keys keep insertion order, which gives an ordered de-duplication
    match_links = {}
    for link in soup.find_all('a', href=True):
        href = link['href']
        if 'html' in href:  # Assuming match pages have .html in their URLs
            # Resolve relative hrefs against the page they appear on
            match_links[urljoin(page_url, href)] = None

    return list(match_links)

# Function to extract torrent links from a match page
def get_torrent_link(match_url, timeout=30):
    """Return the absolute URL of the first torrent link on a match page.

    Args:
        match_url: URL of the match page to fetch.
        timeout: Seconds to wait for the server. requests applies no timeout
            by default, so without one a stalled connection would block the
            notebook indefinitely.

    Returns:
        The absolute URL of the first anchor whose href contains 'torrent',
        or None when the page has no such anchor or the server answers with
        an HTTP error status.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    response = requests.get(match_url, headers=headers, timeout=timeout)
    if not response.ok:
        # Do not scrape links out of an error page
        print(f'Skipping {match_url}: HTTP {response.status_code}')
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the torrent link
    for link in soup.find_all('a', href=True):
        href = link['href']
        if 'torrent' in href:
            # Resolve relative hrefs against the page they appear on
            return urljoin(match_url, href)

    return None

# Function to download a torrent file
def download_torrent(torrent_url, download_dir, timeout=60):
    """Download a torrent file into download_dir.

    Args:
        torrent_url: Absolute URL of the torrent file.
        download_dir: Directory to save into; created if it does not exist.
        timeout: Seconds to wait for the server. requests applies no timeout
            by default, so without one a stalled connection would block the
            notebook indefinitely.

    Returns:
        None. Prints the saved path on success, or a failure message when
        the server answers with an HTTP error status (nothing is written in
        that case).

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    response = requests.get(torrent_url, headers=headers, timeout=timeout)
    if not response.ok:
        # Never store an HTML error page under a .torrent name
        print(f'Failed to download {torrent_url}: HTTP {response.status_code}')
        return

    # Take the name from the URL path only, so query strings, fragments and
    # percent-encoding do not leak into the file name
    name = os.path.basename(unquote(urlparse(torrent_url).path))
    if name in ('', '.', '..'):
        # URL path ended in '/' or decoded to a directory reference
        name = 'download.torrent'

    os.makedirs(download_dir, exist_ok=True)
    filename = os.path.join(download_dir, name)

    # Save the torrent file
    with open(filename, 'wb') as file:
        file.write(response.content)

    print(f'Downloaded: {filename}')

# Main script
def main(download_dir='torrents', first_page=1, last_page=20, max_matches=100):
    """Scrape the football listing pages and download the torrents they lead to.

    Args:
        download_dir: Directory to save the torrent files; created if missing.
        first_page: Number of the first listing page (football<N>.html).
        last_page: Number of the last listing page, inclusive.
        max_matches: Upper bound on the number of distinct match pages visited.

    Returns:
        None. Progress and failures are reported with print().
    """
    # Directory to save the torrent files
    os.makedirs(download_dir, exist_ok=True)

    # Step 1: Generate football page URLs (football<first>.html to football<last>.html)
    football_pages = [f'{base_url}/football{i}.html' for i in range(first_page, last_page + 1)]
    listing_pages = set(football_pages)

    # Step 2: Scrape match links from each football page.
    # A dict is used as an ordered set: links repeated on several pages
    # would otherwise use up the max_matches budget.
    unique_match_links = {}
    for page_url in football_pages:
        if len(unique_match_links) >= max_matches:
            # Enough candidates collected; spare the remaining requests
            break

        print(f'Processing football page: {page_url}')
        try:
            page_links = get_match_links(page_url)
        except requests.RequestException as exc:
            # One unreachable page must not abort the whole run
            print(f'Failed to fetch {page_url}: {exc}')
            continue

        for link in page_links:
            # Links between listing pages are not match pages
            if link not in listing_pages:
                unique_match_links[link] = None

    # Keep at most max_matches distinct match pages
    all_match_links = list(unique_match_links)[:max_matches]

    # Step 3: Extract torrent links and download the files
    for match_link in all_match_links:
        print(f'Processing match page: {match_link}')
        try:
            torrent_link = get_torrent_link(match_link)

            if torrent_link:
                download_torrent(torrent_link, download_dir)
            else:
                print(f'No torrent link found for: {match_link}')
        except requests.RequestException as exc:
            # Report and move on to the next match
            print(f'Failed to process {match_link}: {exc}')

# Run the script. The guard is true when this code is executed directly, which
# includes running it as a notebook cell, so executing the cell starts the
# scrape immediately.
if __name__ == '__main__':
    main()

Processing football page: https://www.sport-video.org.ua/football1.html
Processing football page: https://www.sport-video.org.ua/football2.html
Processing football page: https://www.sport-video.org.ua/football3.html
Processing football page: https://www.sport-video.org.ua/football4.html
Processing football page: https://www.sport-video.org.ua/football5.html
Processing football page: https://www.sport-video.org.ua/football6.html
Processing football page: https://www.sport-video.org.ua/football7.html
Processing football page: https://www.sport-video.org.ua/football8.html
Processing football page: https://www.sport-video.org.ua/football9.html
Processing football page: https://www.sport-video.org.ua/football10.html
Processing football page: https://www.sport-video.org.ua/football11.html
Processing football page: https://www.sport-video.org.ua/football12.html
Processing football page: https://www.sport-video.org.ua/football13.html
Processing football page: https://www.sport-video.org.ua/foo

In [None]:
# Delete the downloaded torrent files. main() saves into the relative
# 'torrents' directory; this absolute path is the same directory only when the
# working directory is /content (presumably the Colab default; verify before
# running elsewhere).
!rm -rf "/content/torrents"

In [None]:
# Prints rmdir's usage text only; nothing is removed by this cell.
!rmdir --help

Usage: rmdir [OPTION]... DIRECTORY...
Remove the DIRECTORY(ies), if they are empty.

      --ignore-fail-on-non-empty
                  ignore each failure that is solely because a directory
                    is non-empty
  -p, --parents   remove DIRECTORY and its ancestors; e.g., 'rmdir -p a/b/c' is
                    similar to 'rmdir a/b/c a/b a'
  -v, --verbose   output a diagnostic for every directory processed
      --help     display this help and exit
      --version  output version information and exit

GNU coreutils online help: <https://www.gnu.org/software/coreutils/>
Full documentation <https://www.gnu.org/software/coreutils/rmdir>
or available locally via: info '(coreutils) rmdir invocation'
