https://mjoc.uitm.edu.my/main/

In [13]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json


# The page is fetched with verify=False below; silence the resulting
# InsecureRequestWarning so the cell output stays readable.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

# Page whose "Publication" menu is scraped, and the base URL the menu hrefs
# are resolved against.
base_url = 'https://mjoc.uitm.edu.my/main/#'
publication_base_url = 'https://mjoc.uitm.edu.my/main/index.php/publication-2/'


session = requests.Session()


# Scrape the "Publication" dropdown and store every link it contains as a JSON
# list. Each failure mode prints its own message instead of raising.
try:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    # requests never times out unless told to; without a timeout an
    # unresponsive server would block this cell indefinitely.
    response = session.get(base_url, headers=headers, verify=False, timeout=30)
    print("Status Code:", response.status_code)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        publication_menu = soup.find('a', string='Publication')
        if publication_menu:
            print("Publication menu found.")
            parent_li = publication_menu.find_parent('li', class_='sp-menu-item')
            if parent_li:
                dropdown_items = parent_li.find('div', class_='sp-dropdown-inner')
                if dropdown_items:
                    print("Dropdown items found.")
                    # href=True skips anchors that carry no href attribute;
                    # indexing link['href'] on such an anchor raises KeyError
                    # and would discard every link collected so far.
                    volume_links = dropdown_items.find_all('a', href=True)
                    publication_links = [urljoin(publication_base_url, link['href']) for link in volume_links]
                    with open('D:\\opensource-projects\\mesolitica\\mjoc\\publication_links.json', 'w') as f:
                        json.dump(publication_links, f)
                    print(f"Found {len(publication_links)} publication links.")
                else:
                    print("Dropdown items not found.")
            else:
                print("Parent 'li' not found.")
        else:
            print("Publication menu not found.")
    else:
        print(f"Failed to fetch the main page, status code: {response.status_code}")
except Exception as e:
    print(f"An error occurred: {e}")

Status Code: 200
Publication menu found.
Dropdown items found.
Found 15 publication links.


In [18]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json


# Requests in this cell are sent with verify=False; mute the warning that
# urllib3 would otherwise print for each of them.
requests.packages.urllib3.disable_warnings(
    requests.packages.urllib3.exceptions.InsecureRequestWarning
)


# A single session is shared by every request made from this cell.
session = requests.Session()


# Browser-like headers sent with each request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
}


def extract_pdf_links(volume_url, timeout=30):
    """Return the PDF links found on a single volume page.

    The page is fetched through the module-level ``session`` with the
    module-level ``headers`` and certificate verification disabled. Every
    anchor whose ``href`` ends in ``.pdf`` is resolved against
    ``publication_base_url``.

    NOTE: ``publication_base_url`` is not assigned in this cell; it has to
    exist in the kernel namespace already (the menu-scraping cell defines it).

    Args:
        volume_url: URL of the volume page to scrape.
        timeout: Seconds to wait on the server before giving up. requests
            has no default timeout, so omitting it could block forever.

    Returns:
        A list of PDF URLs. The list is empty when the page does not answer
        with HTTP 200 or when any exception occurs; in both cases the problem
        is printed rather than raised.
    """
    try:
        volume_response = session.get(volume_url, headers=headers, verify=False, timeout=timeout)
        if volume_response.status_code != 200:
            print(f"Failed to fetch volume page {volume_url}, status code: {volume_response.status_code}")
            return []

        volume_soup = BeautifulSoup(volume_response.content, 'html.parser')
        # The attribute selector only matches anchors that have an href, so
        # indexing a['href'] is safe here.
        return [urljoin(publication_base_url, a['href']) for a in volume_soup.select('a[href$=".pdf"]')]
    except Exception as e:
        # Best effort: one bad volume page must not abort the whole crawl.
        print(f"An error occurred while processing volume {volume_url}: {e}")
        return []


# Read back the volume page URLs saved by the menu-scraping step.
links_path = 'D:\\opensource-projects\\mesolitica\\mjoc\\publication_links.json'
with open(links_path, 'r') as links_file:
    publication_links = json.load(links_file)

# Gather the PDF links of every volume page into one flat list.
full_pdf_urls = []
for volume_link in publication_links:
    full_pdf_urls += extract_pdf_links(volume_link)

summary = f"Total PDF URLs found: {len(full_pdf_urls)}" if full_pdf_urls else "No PDF URLs were found."
print(summary)

# Persist the list (even when empty) for the download step.
output_file = 'D:\\opensource-projects\\mesolitica\\mjoc\\full_pdf_urls.json'
try:
    with open(output_file, 'w') as urls_file:
        json.dump(full_pdf_urls, urls_file)
        print(f"PDF URLs saved to {output_file}")
except Exception as e:
    print(f"An error occurred while saving PDF URLs to {output_file}: {e}")


Total PDF URLs found: 139
PDF URLs saved to D:\opensource-projects\mesolitica\mjoc\pdfs\full_pdf_urls.json


In [3]:
from tqdm.notebook import tqdm
import fitz
import os
import requests
import json
import re

# The downloads below use verify=False; mute the warning urllib3 would
# otherwise emit for each request.
requests.packages.urllib3.disable_warnings(
    requests.packages.urllib3.exceptions.InsecureRequestWarning
)

# Folder that receives the downloaded PDFs; created if it does not exist.
pdf_directory = 'D:\\opensource-projects\\mesolitica\\mjoc\\pdfs'
os.makedirs(pdf_directory, exist_ok=True)

# Browser-like User-Agent sent with every download.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    )
}
session = requests.Session()

# URL list produced by the link-collection step.
urls_path = 'D:\\opensource-projects\\mesolitica\\mjoc\\full_pdf_urls.json'
with open(urls_path, 'r') as urls_file:
    full_pdf_urls = json.load(urls_file)

# One record per successfully processed PDF: url, title, authors, text.
extracted_data = []

# Heuristic patterns, compiled once instead of on every iteration.
# title:   first run of uppercase letters/whitespace that is followed by a newline.
# authors: the text between such a run and the next newline.
title_pattern = re.compile(r'([A-Z\s]+)\n')
authors_pattern = re.compile(r'([A-Z\s]+)\n(.+?)\n', re.DOTALL)

for pdf_url in tqdm(full_pdf_urls):
    try:
        # The last URL path segment doubles as the local file name.
        filename = pdf_url.split('/')[-1]
        pdf_path = os.path.join(pdf_directory, filename)

        # timeout: requests waits forever by default, so one stalled download
        # would freeze the whole loop. stream=True was dropped because the
        # body is read in full through .content right away.
        pdf_response = session.get(pdf_url, headers=headers, verify=False, timeout=60)
        # Stop here on 4xx/5xx so an HTML error page is never written to disk
        # as a ".pdf" and later reported as "cannot open broken document".
        pdf_response.raise_for_status()
        with open(pdf_path, 'wb') as f:
            f.write(pdf_response.content)

        with fitz.open(pdf_path) as doc:
            full_text = "".join(page.get_text() for page in doc)

        title_search = title_pattern.search(full_text)
        authors_search = authors_pattern.search(full_text)

        title = title_search.group(1).strip() if title_search else 'Title not found'
        authors = authors_search.group(2).strip() if authors_search else 'Authors not found'

        # \s+ already covers newlines, so one substitution collapses every
        # whitespace run (line breaks included) to a single space.
        clean_full_text = re.sub(r'\s+', ' ', full_text).strip()

        extracted_data.append({
            'url': pdf_url,
            'title': title,
            'authors': authors,
            'text': clean_full_text
        })
    except Exception as e:
        # Best effort: report the failing URL and carry on with the next one.
        print(f"An error occurred with {pdf_url}: {e}")

output_json_path = 'D:\\opensource-projects\\mesolitica\\mjoc\\mjoc_pdf_text.json'

# Write one JSON object per line (JSON Lines layout).
with open(output_json_path, 'w', encoding='utf-8') as f:
    for entry in extracted_data:
        # Serialise with json.dumps instead of hand-building the line: quotes,
        # backslashes and control characters in the extracted text must be
        # escaped or the line is not valid JSON. ensure_ascii=False keeps
        # non-ASCII characters readable in the UTF-8 file.
        f.write(json.dumps(entry, ensure_ascii=False) + '\n')

print(f"Extracted data saved to {output_json_path}")


  0%|          | 0/139 [00:00<?, ?it/s]

An error occurred with https://mjoc.uitm.edu.my/main/images/journal/vol8-1-2023/4_DESIGN_BUILD_AND_FLY_THE_UITM’S_VERTICAL_TAKE-OFF_AND_LANDING_(VTOL)_AIRCRAFT.pdf: cannot open broken document
An error occurred with https://mjoc.uitm.edu.my/main/images/journal/vol8-1-2023/6_A SYSTEMATIC_LITERATURE_REVIEW_OF_MOBILE_APPLICATIONS_TO_ASSIST_PEOPLE_WITH_MILD_TO_MODERATE_DEMENTIA_IN_THEIR_DAILY_LIVES.pdf: cannot open broken document
Extracted data saved to D:\opensource-projects\mesolitica\mjoc\mjoc_pdf_text.json
