In [None]:
import json
import os
import random
import time
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

# Scrape one government web page for links to document files, download each
# file into `output_dir`, and write a JSON manifest describing what was saved.

# URL of the webpage
url = "https://www.srbija.gov.rs/dokument/844348/dokumenta-ministarstva-gradjevinarstva-saobracaja-i-infrastrukture-koja-se-ticu-moguceg-izvrsenja-krivicnog-dela-povodom-pada-nadstresnice-na-zeleznickoj-stanici-u-novom-sadu-1-novembra-2024-godine-22.php"

# Directory to save downloaded files
output_dir = "str-2-downloaded_documents"
os.makedirs(output_dir, exist_ok=True)

# (connect, read) timeouts in seconds. requests has no default timeout, so
# without this a stalled server would block the notebook indefinitely.
request_timeout = (10, 60)

# List to store downloaded links
downloaded_links = []

# Send HTTP GET request to the URL
response = requests.get(url, timeout=request_timeout)
response.raise_for_status()

# Parse the webpage content
soup = BeautifulSoup(response.content, "html.parser")

# File extensions to look for
file_extensions = [".pdf", ".zip", ".rar", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".txt", ".csv"]

# Find all links
for link in soup.find_all("a", href=True):
    href = link["href"]
    link_title = link.get_text(strip=True)

    try:
        file_url = urljoin(url, href)
        # Test the extension on the URL path only, so a trailing query string
        # or fragment ("report.pdf?v=2", "report.pdf#page=3") does not hide it.
        url_path = urlsplit(file_url).path
    except ValueError:
        # Malformed href that urllib cannot parse; nothing to download.
        continue

    if not url_path.lower().endswith(tuple(file_extensions)):
        continue

    # Name the local file after the path component, never the query/fragment.
    file_name = os.path.join(output_dir, os.path.basename(url_path))

    # Download the file
    try:
        print(f"Downloading {file_url}...")
        file_response = requests.get(file_url, timeout=request_timeout)
        file_response.raise_for_status()
        with open(file_name, "wb") as f:
            f.write(file_response.content)
        print(f"Saved to {file_name}")

        # Save the link to the list
        downloaded_links.append({"file_name": file_name, "file_url": file_url, "link_title": link_title})
        print("------------------LINK TITLE-----------------------------")
        print(link_title)
        print("---------------------------------------------------------")
    except (requests.RequestException, OSError) as e:
        # OSError covers local write failures (bad file name, full disk) so one
        # bad file does not abort the run before the manifest is written.
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
        print(f">>>>>>>>>>>>>>>>>>>>>>>> Failed to download {file_url}: {e} >>>>>>>>>>>>>>>>>>>>>")
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")

    # Add a random delay between downloads
    time_delay = random.uniform(2, 5)  # Random delay between 2 to 5 seconds
    print(f"Waiting for {time_delay:.2f} seconds before the next download...")
    time.sleep(time_delay)

# Save the links to a JSON file
json_path = os.path.join(output_dir, "str-2-downloaded_documents.json")
with open(json_path, "w", encoding="utf-8") as json_file:
    json.dump(downloaded_links, json_file, indent=4, ensure_ascii=False)

print("Download completed.")

In [1]:
import json

# Re-save a JSON document so that non-ASCII characters are stored literally.
# NOTE(review): the source file presumably holds \uXXXX-escaped text — confirm.

# Where to read the original JSON from, and where to put the readable copy
input_file = "/home/datatab/Documents/NADSTRESNICA/nadstresnica/dokumentacija/linkovi_ka_preuzetim_dokumentima.json"
output_file = "/home/datatab/Documents/NADSTRESNICA/nadstresnica/dokumentacija/decoded_linkovi_ka_preuzetim_dokumentima.json"

# Parse the source document into Python objects
with open(input_file, encoding="utf-8") as src:
    data = json.loads(src.read())

# Serialize again with ensure_ascii=False, so characters outside ASCII are
# written as-is instead of as \u escapes, indented by four spaces
with open(output_file, "w", encoding="utf-8") as dst:
    dst.write(json.dumps(data, ensure_ascii=False, indent=4))

print(f"Decoded JSON saved to '{output_file}'")

Decoded JSON saved to '/home/datatab/Documents/NADSTRESNICA/nadstresnica/dokumentacija/decoded_linkovi_ka_preuzetim_dokumentima.json'
