# Scraping Data

In [14]:
import warnings
warnings.filterwarnings("ignore")
import csv
import requests
from bs4 import BeautifulSoup

In [37]:
# Input path: text file of article URLs (the Selenium scraping code reads it
# with read().splitlines(), i.e. one URL per line).
articles = 'data/bcae_articles_urls_cn.txt'
# Output path: CSV file that receives the scraped article fields.
dataset = "data/bcae_dataset_cn.csv"

In [None]:
def get_element(soup, selector):
    """Return the text of the first node matching ``selector``.

    Looks the node up with ``soup.select_one`` and, when one is found,
    returns ``get_text(strip=True, separator=",")`` for it. Returns an empty
    string when nothing matches.
    """
    match = soup.select_one(selector)
    if not match:
        return ""
    return match.get_text(strip=True, separator=",")

In [23]:
def get_element_text(soup, selector):
    """Collect the text of every node matching a CSS selector.

    Calls ``soup.select(selector)`` and returns a list holding
    ``get_text(strip=True)`` for each match, in match order. Returns an empty
    list when nothing matches.
    """
    matches = soup.select(selector)
    if not matches:
        return []

    texts = []
    for node in matches:
        texts.append(node.get_text(strip=True))
    return texts

def extract_text_from_webpage(url, timeout=30):
    """Download an article page and extract its metadata fields.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without a timeout a stalled server would block the whole
            scraping run indefinitely.

    Returns:
        A dict with the keys ``url``, ``title_cn``, ``org_cn``,
        ``abstract_cn``, ``keywords_cn``, ``fund_project``, ``date``,
        ``views`` and ``downloads``. Each scraped field holds the text of the
        first element matching its CSS selector, or ``""`` when nothing
        matches. Returns ``None`` when the page cannot be fetched or parsed;
        the error is printed and not re-raised, so one bad URL does not abort
        a batch.
    """
    # Output field -> CSS selector of the element carrying that field.
    # Insertion order is the key order of the returned dict.
    selectors = {
        "title_cn": 'p.articleTitle',
        "org_cn": 'div.noname',
        "abstract_cn": 'div.neirong',
        "keywords_cn": 'p.neirong',
        "fund_project": 'div.noname2',
        "date": 'p.qikanhao',
        "views": 'a.fir i.count',
        "downloads": 'a.sec i.count',
    }

    try:
        # NOTE(review): verify=False disables TLS certificate verification;
        # confirm this is still required for the target site.
        response = requests.get(url, verify=False, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        record = {"url": url}
        for field, selector in selectors.items():
            matches = get_element_text(soup, selector)
            # Only the first match is kept for each field.
            record[field] = matches[0] if matches else ""
        return record

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller skip it.
        print(f"An error occurred while processing {url}: {e}")
        return None

# Scrape every article URL and write one "|"-delimited CSV row per page that
# was fetched successfully.
with open(dataset, mode="w", newline="", encoding="utf-8") as csv_file:
    fieldnames = [
        "url", "title_cn", "org_cn",
        "abstract_cn", "keywords_cn", "fund_project",
        "date", "views", "downloads"
    ]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter="|")
    writer.writeheader()

    # The URL list path is bound to `articles` (defined next to `dataset`);
    # the previous name `article_urls` is not defined anywhere in this file.
    with open(articles, "r", encoding="utf-8") as urls_file:
        urls = urls_file.read().splitlines()

    for url in urls:
        # Blank lines in the URL file are not URLs; skip them instead of
        # sending them to the scraper.
        if not url.strip():
            continue
        data = extract_text_from_webpage(url)
        # extract_text_from_webpage returns None on failure; skip those pages.
        if data:
            writer.writerow(data)

print(f"Data has been extracted and saved to {dataset}")

KeyboardInterrupt: 

In [42]:
import csv
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Define file paths
# Output: CSV with one row per article URL and its author names.
authors_csv = 'data/bcae_authors.csv'
# Input: text file of article URLs, read further down with read().splitlines().
articles = 'data/bcae_articles_urls_cn.txt'

# Create (or truncate) the authors CSV and write only its header row here.
# The data rows are appended later, one per URL, by re-opening the file in
# append mode inside the scraping loop.
with open(authors_csv, mode="w", newline="", encoding="utf-8") as csv_file:
    fieldnames = ["url", "author_cn"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=",")
    writer.writeheader()

def initialize_driver():
    """Build a Firefox WebDriver whose options carry the ``--headless`` argument.

    Returns:
        The newly created ``webdriver.Firefox`` instance.
    """
    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")
    browser = webdriver.Firefox(options=options)
    return browser

driver = initialize_driver()

data = []

# Read URLs from the articles file
with open(articles, 'r', encoding='utf-8') as file:
    urls = file.read().splitlines()

# Number of fresh-driver attempts made after a WebDriverException.
max_retries = 3


def _scrape_authors(browser, page_url):
    """Load ``page_url`` and return the author names shown on the page.

    Waits up to 2 seconds for the element with id ``jdauthor`` to be present,
    then returns the stripped text of every element with class ``author``
    inside it. Selenium exceptions (including ``TimeoutException`` from the
    wait) propagate to the caller.
    """
    browser.get(page_url)
    WebDriverWait(browser, 2).until(
        EC.presence_of_element_located((By.ID, 'jdauthor'))
    )
    authors_paragraph = browser.find_element(By.ID, 'jdauthor')
    return [
        a.text.strip()
        for a in authors_paragraph.find_elements(By.CLASS_NAME, 'author')
    ]


try:
    # Iterate over the URLs
    for url in urls:
        try:
            authors = _scrape_authors(driver, url)

        except TimeoutException:
            # The authors element did not appear within the wait window.
            authors = ['time_error']

        except WebDriverException:
            # The browser session may be broken: restart it and try again,
            # up to max_retries times.
            for _ in range(max_retries):
                try:
                    time.sleep(1)
                    driver.quit()
                    driver = initialize_driver()
                    authors = _scrape_authors(driver, url)
                    break
                except (WebDriverException, TimeoutException):
                    continue
            else:
                # Every retry failed.
                authors = ['web_error']

        # The row is recorded only after the URL was handled above. An
        # unexpected exception or interrupt therefore never writes the
        # previous URL's authors (or an unbound name) against this URL.
        data.append({'url': url, 'author_cn': authors})

        # Append immediately so progress survives an aborted run.
        with open(authors_csv, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=['url', 'author_cn'], delimiter=",")
            # Join authors and remove any newlines
            cleaned_authors = ', '.join(authors).replace('\n', '').strip()
            writer.writerow({'url': url, 'author_cn': cleaned_authors})
finally:
    # Always release the browser, even when the loop is interrupted.
    driver.quit()

print(f"Extraction and saving to {authors_csv} completed.")

KeyboardInterrupt: 

In [None]:
# The URL list path is bound to `articles`; the previous name `article_urls`
# is not defined anywhere in this file.
with open(articles, 'r', encoding='utf-8') as file:
    urls = file.read().splitlines()

driver = initialize_driver()

data = []

# Bound before the loop so the final message also works when `urls` is empty.
output_file = 'data/bcae_similar.csv'
fieldnames = ['url', 'similar']

# The "相关研究" title paragraph that is clicked to reveal the related list.
related_button_xpath = (
    '//div[@class="xiangguanyanjiu duo relatedresearch_display"]'
    '/p[@class="title" and contains(text(), "相关研究")]'
)

try:
    for url in urls:
        try:
            driver.get(url)

            # Fixed pause after navigation before looking for the button.
            time.sleep(1)

            button = driver.find_element(By.XPATH, related_button_xpath)
            button.click()
            # Fixed pause after the click before reading the page source.
            time.sleep(3)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Find all <p> elements with class "itemTitle"
            links = soup.find_all('p', class_='itemTitle')

            similar_list = [link.text.strip() for link in links]

        except (TimeoutException, NoSuchElementException):
            similar_list = ['error']

        # The row is recorded only after the URL was handled above, so an
        # unexpected exception never writes the previous URL's list (or an
        # unbound name) against this URL.
        data.append({'url': url, 'similar': similar_list})

        # utf-8 is set explicitly because the scraped titles are written
        # verbatim; the platform default encoding may not represent them.
        with open(output_file, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow({'url': url, 'similar': similar_list})

        print(f"Data for {url} has been extracted and saved to {output_file}")
finally:
    # Always release the browser, even when the loop is interrupted.
    driver.quit()

print(f"Extraction and saving to {output_file} completed.")

In [None]:
# fix broken lines in the csv
def fix_lines(dataset_csv, dataset_csv_fixed,
              url_prefix='http://old2022.bulletin.cas.cn/'):
    """Rewrite a CSV so that each record starts on its own line.

    All line breaks in the input are removed, then a single line break is
    inserted before every occurrence of ``url_prefix``. Text preceding the
    first occurrence (e.g. a header row) stays on its own first line.

    Args:
        dataset_csv: Path of the CSV file to read (UTF-8).
        dataset_csv_fixed: Path the repaired content is written to (UTF-8).
        url_prefix: Text that marks the start of every record. Defaults to
            the site prefix this function was originally written for.

    Raises:
        ValueError: If ``url_prefix`` is empty, since that would insert a
            line break between every character.
    """
    if not url_prefix:
        raise ValueError("url_prefix must be a non-empty string")

    with open(dataset_csv, mode='r', encoding='utf-8') as file:
        content = file.read()

    # Replace all line endings with empty string to remove them
    content = content.replace('\n', '')

    # Add a new line before each record prefix to separate records
    content = content.replace(url_prefix, '\n' + url_prefix)

    # When the file starts directly with a record (no header), the step above
    # leaves a leading blank line; drop it.
    content = content.lstrip('\n')

    with open(dataset_csv_fixed, mode='w', encoding='utf-8') as outfile:
        outfile.write(content)


# Repair the scraped dataset CSV and write the result to a second file.
# NOTE(review): `dataset_fixed` is not assigned in any cell visible in this
# file — confirm it is defined before this call runs.
fix_lines(dataset, dataset_fixed)

# Data Cleaning

In [1]:
import pandas as pd


# Load the dataset into a DataFrame with pandas' default parsing options.
# NOTE(review): the scraping cells write 'data/bcae_dataset_cn.csv' with a "|"
# delimiter, while this reads a different path with the default delimiter —
# confirm which file and separator are intended.
df = pd.read_csv('data/bcae_dataset.csv')