# Analysing Website Structure

First we should investigate the structure of the website. Article data we are looking for is in the section 'Past Issues' (过刊目录). 

Each issue page contains links to the full article  in PDF or HTML, and abstracts (摘要) which we will explore next.
Article pages contain a lot of data about publications:
- title
- date
- issue
- authors
- affiliations
- abstracts
- keywords
- associated fund projects
- views and downloads statistics.

Recent text data is presented in both Chinese and English, but older articles include only Chinese.

# Scraping Data

The old version of the BCAS website doesn't use JavaScript animations, so the classic `BeautifulSoup` library is enough at this stage.

In [4]:
from bs4 import BeautifulSoup
import requests
import csv

## Issues

Our final goal is to retrieve the data on the articles. The website structure shows that the links to the articles can be found inside issues. So, our first move is to get the links to the issues.

In [2]:
# the webpage with links to all issues
url = "http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/issue_browser.aspx"
# path to save URLs
issues = 'data/bcas_issues.txt'

In [3]:
def get_urls(url):
    """
    Extracts all URLs from a given webpage.
    This function sends a GET request to the specified URL, parses the HTML content,
    and extracts all hyperlinks (href attributes of <a> tags) from the page.

    Parameters:
    url (str): The URL of the webpage to scrape.

    Returns:
    list or None: A list of URLs found on the page if successful, None if an error occurs.

    Raises:
    Any exceptions from the requests or BeautifulSoup libraries are caught and printed.

    Example:
    >>> urls = get_urls('https://example.com')
    >>> if urls:
    ...     print(f"Found {len(urls)} URLs")
    ... else:
    ...     print("Failed to retrieve URLs")
    """
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        urls = []
        links = soup.find_all('a')

        for link in links:
            href = link.get('href')
            if href:
                urls.append(href)

        return urls

    except Exception as e:
        print("Error:", str(e))
        return None

In [4]:
# function to save URLs to a txt
def save_urls(urls, txt):
    with open(txt, 'w') as file:
        for url in urls:
            if url.startswith('issue_list.aspx?year_id='):
                file.write(
                    'http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/' + url + '\n')

    print("Issues URLs saved to", txt)

In [None]:
urls = get_urls(url)
save_urls(urls, issues)

## Articles

Once we get the links to the issues, we can itearate through them and retrieve the links to the desired articles.

In [118]:
articles = 'data/bcas_articles.txt'

In [None]:
with open(articles, "a") as output:
    # Read the list of webpages from issues.txt
    with open(issues, "r") as file:
        issue_urls = file.read().splitlines()

    # iterate through each URL
    for url in issue_urls:
        article_urls = get_urls(url)
        if article_urls:
            # Save article URLs by appending to the file
            for article_url in article_urls:
                if article_url.startswith('view_abstract.aspx?file_no='):
                    output.write(
                        'http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/' + article_url + '\n')

print("Article URLs saved to", articles)

In [None]:
def remove_duplicates(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Remove duplicates while preserving order
    unique_lines = []
    seen = set()
    for line in lines:
        if line not in seen:
            seen.add(line)
            unique_lines.append(line)

    # Write the unique lines back to the file
    with open(file_path, 'w') as file:
        file.writelines(unique_lines)

    print(f"Duplicates removed. Check {file_path}")


remove_duplicates(articles)

## Article Data

Now we can scrape the data for each BCAS article.
To do so we need to analyze the HTML structure of the pages and determine CSS selectors for the desired elements. 

### Define Elements

In [220]:
# get text of an element if it exists
def get_element(soup, selector):
    element = soup.select_one(selector)
    return element.get_text(strip=True, separator=",") if element else ""

In [225]:
# function to extract text using BeautifulSoup
def get_data(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # extracting data using CSS selectors
        title_cn = get_element(soup, 'span#FileTitle')
        title_en = get_element(soup, 'span#EnTitle')
        author_cn = get_element(soup, 'div.cn_author')
        author_en = get_element(soup, 'div.en_author')
        org_cn = get_element(soup, 'div.cn_unit')
        org_en = get_element(soup, 'div.en_unit')
        abstract_cn = get_element(soup, 'div.zw_zhaiyao')
        abstract_en = get_element(soup, 'div.yw_zhaiyao')
        keywords_cn = get_element(soup, 'div.zw_gjc')
        keywords_en = get_element(soup, 'div.yw_gjc')
        fund_project = get_element(soup, 'div.jjxm')
        date = get_element(soup, 'div.d_deta.fr')
        views = get_element(soup, 'span#ClickNum')
        downloads = get_element(soup, 'span#PDFClickNum')

        return {
            "url": url,
            "title_cn": title_cn,
            "title_en": title_en,
            "author_cn": author_cn,
            "author_en": author_en,
            "org_cn": org_cn,
            "org_en": org_en,
            "abstract_cn": abstract_cn,
            "abstract_en": abstract_en,
            "keywords_cn": keywords_cn,
            "keywords_en": keywords_en,
            "fund_project": fund_project,
            "date": date,
            "views": views,
            "downloads": downloads,
        }

    except Exception as e:
        print(f"An error occurred while processing {url}: {e}")
        return None

In [6]:
# dataset csv path
dataset = "data/bcas_dataset.csv"

In [None]:
with open(dataset, mode="w", newline="") as csv_file:
    fieldnames = [
        "url", "date", "views", "downloads",
        "author_cn", "author_en",
        "title_cn", "title_en",
        "org_cn", "org_en",
        "abstract_cn", "abstract_en",
        "keywords_cn", "keywords_en",
        "fund_project"
    ]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter="|")
    writer.writeheader()

    # read the list of URLs from the file
    with open(articles, "r") as file:
        article_urls = file.read().splitlines()

        # iterate through each url, extract data, and write to csv
        for url in article_urls:
            data = get_data(url)
            if data:
                writer.writerow(data)

print(f"Data has been extracted and saved to {dataset}")

In [1]:
dataset_fixed = "data/bcas_dataset_fixed.csv"

In [59]:
# fix broken lines in the csv
def fix_lines(dataset_csv, dataset_csv_fixed):
    with open(dataset_csv, mode='r', encoding='utf-8') as file:
        content = file.read()

    # Replace all line endings with empty string to remove them
    content = content.replace('\n', '')

    # Add a new line before each 'http' to separate URLs
    content = content.replace(
        'http://old2022.bulletin.cas.cn/', '\nhttp://old2022.bulletin.cas.cn/')

    with open(dataset_csv_fixed, mode='w', encoding='utf-8') as outfile:
        outfile.write(content)


fix_lines(dataset, dataset_fixed)

### Similar Articles

Our goal is to clasterize the articles using available texts -- so the more useful texts we have, the better. On the pages there is a section called "Similar Articles" (相似文献). The titles of similar publications may be useful for topic modeling, for it increases the chances for articles with similar referencies to appear in the same cluster.

Accessing this data requires clicking a "Similar Articles" button, otherwise the text is not present on the page. We can simulate clicking (and a do lot of other useful stuff) using Selenium library. 

In [25]:
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

In [2]:
similar_csv = 'data/bcas_similar.csv'

In [None]:
with open(similar_csv, mode="w", newline="", encoding="utf-8") as csv_file:
    fieldnames = ["url", "similar"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter="|")
    writer.writeheader()


def initialize_driver():
    firefox_options = webdriver.FirefoxOptions()
    firefox_options.add_argument("--headless")
    return webdriver.Firefox(options=firefox_options)


driver = initialize_driver()

data = []

with open(articles, 'r') as file:
    urls = file.read().splitlines()

# Iterate over the URLs
for url in urls:
    try:
        driver.get(url)

        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
            (By.XPATH, '//div[text()="相似文献"]'))).click()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "ArticleList")))

        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # find all <a> with class "ArticleList"
        links = soup.find_all('a', class_='ArticleList')

        # extract the text from each element
        similar_list = [link.text.strip() for link in links]

    except TimeoutException:
        similar_list = ['time_error']

    except WebDriverException:
        max_retries = 5
        retry_count = 0
        while retry_count < max_retries:
            try:
                time.sleep(3)
                driver.quit()
                driver = initialize_driver()
                driver.get(url)
                WebDriverWait(driver, 2).until(EC.element_to_be_clickable(
                    (By.XPATH, '//div[text()="相似文献"]'))).click()
                WebDriverWait(driver, 2).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "ArticleList")))
                html = driver.page_source
                soup = BeautifulSoup(html, 'html.parser')
                links = soup.find_all('a', class_='ArticleList')
                similar_list = [link.text.strip() for link in links]
                break

            except (WebDriverException, TimeoutException):
                retry_count += 1
                if retry_count == max_retries:
                    similar_list = ['web_error']

    finally:
        data.append({'url': url, 'similar': similar_list})

        with open(similar_csv, 'a', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=[
                                    'url', 'similar'], delimiter="|")
            writer.writerow({'url': url, 'similar': similar_list})

driver.quit()

print(f"Extraction and saving to {similar_csv} completed.")

### Merge

In [3]:
import pandas as pd
df = pd.read_csv(dataset_fixed, sep='|')
df.shape

(7211, 15)

In [4]:
df.head()

Unnamed: 0,url,date,views,downloads,author_cn,author_en,title_cn,title_en,org_cn,org_en,abstract_cn,abstract_en,keywords_cn,keywords_en,fund_project
0,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,"中国科学院院刊:2024,39(1):0-0",320,438,,,目录,,,,中文摘要:,Abstract:,中文关键词:,keywords:,基金项目:
1,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,"中国科学院院刊:2024,39(1):1-9",975,998,李国杰,LI Guojie,智能化科研（AI4R）：第五科研范式,AI4R: The fifth scientific research paradigm,(中国科学院计算技术研究所 北京 100190),"(Institute of Computing Technology, Chinese Ac...","中文摘要:,文章将“智能化科研”（AI4R）称为第五科研范式，概括它的一系列特征包括：（1）...","Abstract:,This article refers to “AI for Resea...","中文关键词:,智能化科研,涌现,组合爆炸问题,非确定计算,大科学模型,科研大平台","keywords:,AI4R,emergence,combinatorial explosi...",基金项目:
2,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,"中国科学院院刊:2024,39(1):10-16",630,542,"鄂维南,1,2","E Weinan,1,2",AI助力打造科学研究新范式,AI helps to establish a new paradigm for scien...,(1.北京大学 北京 100871;2.北京科学智能研究院 北京 100084),"(1.Perking University, Beijing 100871, China;2...","中文摘要:,科学研究的目的是发现基本原理和解决实际问题。尽管人类在发现基本原理和解决实际问题...","Abstract:,The main purpose of scientific resea...","中文关键词:,人工智能驱动的科学研究,科学计算,安卓模式","keywords:,scientific research driven by AI,sci...",基金项目:
3,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,"中国科学院院刊:2024,39(1):17-26",470,448,"谭光明,,,贾伟乐,,,王展,,,元国军,,,邵恩,,,孙凝晖*","TAN Guangming,,,JIA Weile,,,WANG Zhan,,,YUAN G...",面向模拟智能的计算系统,Computing system for simulation intelligence,(中国科学院计算技术研究所 北京 100190),"(Institute of Computing Technology, Chinese Ac...","中文摘要:,科学研究中的计算机模拟称为科学模拟（scientific simulation）...","Abstract:,This study refers computer simulatio...","中文关键词:,科学模拟,模拟智能,人工智能,计算系统,Z级计算","keywords:,scientific simulation,simulation int...","基金项目:,国家杰出青年科学基金（T2125013）"
4,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,"中国科学院院刊:2024,39(1):27-33",579,632,"王飞跃,1,2,,,王雨桐,1,3","WANG Fei-Yue,1,2,,,WANG Yutong,1,3",数字科学家与平行科学：AI4S和S4AI的本源与目标,Digital scientists and parallel sciences: The ...,(1.中国科学院自动化研究所 复杂系统管理与控制国家重点实验室 北京 100190;2.澳门...,(1.State Key Laboratory for Management and Con...,"中文摘要:,围绕人工智能（AI）大模型技术的最新进展，从AI4S （人工智能驱动的科学研究）...","Abstract:,Based on recent development in found...","中文关键词:,人工智能,智能科技,平行智能,基础模型,数字科学家,平行科技,分布式自主科学","keywords:,AI,intelligent science,parallel inte...","基金项目:,澳门科学技术发展基金（0093/2023/RIA2），国家自然科学基金（6153..."


In [5]:
similar_df = pd.read_csv(similar_csv)

In [6]:
similar_df.head()

Unnamed: 0,url,similar
0,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,[]
1,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,"['科研信息化发展态势和思考', '数据科学与计算智能：内涵、范式与机遇', '人工智能驱动..."
2,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,"['人工智能驱动的科学研究新范式：从AI4S到智能科学', 'GPT技术变革对基础科学研究的..."
3,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,"['信息化:从计算机科学到计算科学', '科学大数据智能分析软件的现状与趋势', '中国高通..."
4,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,"['人工智能驱动的科学研究新范式：从AI4S到智能科学', '适度超前推动科研基础平台建设 ..."


In [7]:
df = pd.merge(df, similar_df, on='url', how='left')

In [8]:
df.head()

Unnamed: 0,url,date,views,downloads,author_cn,author_en,title_cn,title_en,org_cn,org_en,abstract_cn,abstract_en,keywords_cn,keywords_en,fund_project,similar
0,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,"中国科学院院刊:2024,39(1):0-0",320,438,,,目录,,,,中文摘要:,Abstract:,中文关键词:,keywords:,基金项目:,[]
1,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,"中国科学院院刊:2024,39(1):1-9",975,998,李国杰,LI Guojie,智能化科研（AI4R）：第五科研范式,AI4R: The fifth scientific research paradigm,(中国科学院计算技术研究所 北京 100190),"(Institute of Computing Technology, Chinese Ac...","中文摘要:,文章将“智能化科研”（AI4R）称为第五科研范式，概括它的一系列特征包括：（1）...","Abstract:,This article refers to “AI for Resea...","中文关键词:,智能化科研,涌现,组合爆炸问题,非确定计算,大科学模型,科研大平台","keywords:,AI4R,emergence,combinatorial explosi...",基金项目:,"['科研信息化发展态势和思考', '数据科学与计算智能：内涵、范式与机遇', '人工智能驱动..."
2,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,"中国科学院院刊:2024,39(1):10-16",630,542,"鄂维南,1,2","E Weinan,1,2",AI助力打造科学研究新范式,AI helps to establish a new paradigm for scien...,(1.北京大学 北京 100871;2.北京科学智能研究院 北京 100084),"(1.Perking University, Beijing 100871, China;2...","中文摘要:,科学研究的目的是发现基本原理和解决实际问题。尽管人类在发现基本原理和解决实际问题...","Abstract:,The main purpose of scientific resea...","中文关键词:,人工智能驱动的科学研究,科学计算,安卓模式","keywords:,scientific research driven by AI,sci...",基金项目:,"['人工智能驱动的科学研究新范式：从AI4S到智能科学', 'GPT技术变革对基础科学研究的..."
3,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,"中国科学院院刊:2024,39(1):17-26",470,448,"谭光明,,,贾伟乐,,,王展,,,元国军,,,邵恩,,,孙凝晖*","TAN Guangming,,,JIA Weile,,,WANG Zhan,,,YUAN G...",面向模拟智能的计算系统,Computing system for simulation intelligence,(中国科学院计算技术研究所 北京 100190),"(Institute of Computing Technology, Chinese Ac...","中文摘要:,科学研究中的计算机模拟称为科学模拟（scientific simulation）...","Abstract:,This study refers computer simulatio...","中文关键词:,科学模拟,模拟智能,人工智能,计算系统,Z级计算","keywords:,scientific simulation,simulation int...","基金项目:,国家杰出青年科学基金（T2125013）","['信息化:从计算机科学到计算科学', '科学大数据智能分析软件的现状与趋势', '中国高通..."
4,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,"中国科学院院刊:2024,39(1):27-33",579,632,"王飞跃,1,2,,,王雨桐,1,3","WANG Fei-Yue,1,2,,,WANG Yutong,1,3",数字科学家与平行科学：AI4S和S4AI的本源与目标,Digital scientists and parallel sciences: The ...,(1.中国科学院自动化研究所 复杂系统管理与控制国家重点实验室 北京 100190;2.澳门...,(1.State Key Laboratory for Management and Con...,"中文摘要:,围绕人工智能（AI）大模型技术的最新进展，从AI4S （人工智能驱动的科学研究）...","Abstract:,Based on recent development in found...","中文关键词:,人工智能,智能科技,平行智能,基础模型,数字科学家,平行科技,分布式自主科学","keywords:,AI,intelligent science,parallel inte...","基金项目:,澳门科学技术发展基金（0093/2023/RIA2），国家自然科学基金（6153...","['人工智能驱动的科学研究新范式：从AI4S到智能科学', '适度超前推动科研基础平台建设 ..."


# Data Cleaning

The data we got is quite messy with lots of missing values, extra comas and other punctuation signs, some columns can be split into two to make more sense.

In this cases we can do some cleaning with Pandas and regular expressions.

In [22]:
import pandas as pd


dataset_fixed = "data/bcas_dataset_fixed.csv"
df = pd.read_csv(dataset_fixed, sep='|')
similar_csv = 'data/bcas_similar.csv'
similar_df = pd.read_csv(similar_csv)

df = pd.merge(df, similar_df, on='url', how='left')

## Date & Issue

First, we should transform strings like this '2024,39(1):0-0' into more meaningful form.

This string contains three features -- year, issue, and pages. The pattern for separating these features is consistent throughout the dataset, so by a simple split by coma and colon  we can create columns 'date', 'issue', and 'pages'.

In [23]:
# 中国科学院院刊:2024,39(1):0-0
import regex as re

df['date'] = df['date'].str.replace('中国科学院院刊:', '', regex=True)

In [24]:
df.head(1)

Unnamed: 0,url,date,views,downloads,author_cn,author_en,title_cn,title_en,org_cn,org_en,abstract_cn,abstract_en,keywords_cn,keywords_en,fund_project,similar
0,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,"2024,39(1):0-0",320,438,,,目录,,,,中文摘要:,Abstract:,中文关键词:,keywords:,基金项目:,[]


In [25]:
df[['date', 'issue']] = df['date'].str.split(',', n=1, expand=True)

In [26]:
df.head()

Unnamed: 0,url,date,views,downloads,author_cn,author_en,title_cn,title_en,org_cn,org_en,abstract_cn,abstract_en,keywords_cn,keywords_en,fund_project,similar,issue
0,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,2024,320,438,,,目录,,,,中文摘要:,Abstract:,中文关键词:,keywords:,基金项目:,[],39(1):0-0
1,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,2024,975,998,李国杰,LI Guojie,智能化科研（AI4R）：第五科研范式,AI4R: The fifth scientific research paradigm,(中国科学院计算技术研究所 北京 100190),"(Institute of Computing Technology, Chinese Ac...","中文摘要:,文章将“智能化科研”（AI4R）称为第五科研范式，概括它的一系列特征包括：（1）...","Abstract:,This article refers to “AI for Resea...","中文关键词:,智能化科研,涌现,组合爆炸问题,非确定计算,大科学模型,科研大平台","keywords:,AI4R,emergence,combinatorial explosi...",基金项目:,"['科研信息化发展态势和思考', '数据科学与计算智能：内涵、范式与机遇', '人工智能驱动...",39(1):1-9
2,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,2024,630,542,"鄂维南,1,2","E Weinan,1,2",AI助力打造科学研究新范式,AI helps to establish a new paradigm for scien...,(1.北京大学 北京 100871;2.北京科学智能研究院 北京 100084),"(1.Perking University, Beijing 100871, China;2...","中文摘要:,科学研究的目的是发现基本原理和解决实际问题。尽管人类在发现基本原理和解决实际问题...","Abstract:,The main purpose of scientific resea...","中文关键词:,人工智能驱动的科学研究,科学计算,安卓模式","keywords:,scientific research driven by AI,sci...",基金项目:,"['人工智能驱动的科学研究新范式：从AI4S到智能科学', 'GPT技术变革对基础科学研究的...",39(1):10-16
3,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,2024,470,448,"谭光明,,,贾伟乐,,,王展,,,元国军,,,邵恩,,,孙凝晖*","TAN Guangming,,,JIA Weile,,,WANG Zhan,,,YUAN G...",面向模拟智能的计算系统,Computing system for simulation intelligence,(中国科学院计算技术研究所 北京 100190),"(Institute of Computing Technology, Chinese Ac...","中文摘要:,科学研究中的计算机模拟称为科学模拟（scientific simulation）...","Abstract:,This study refers computer simulatio...","中文关键词:,科学模拟,模拟智能,人工智能,计算系统,Z级计算","keywords:,scientific simulation,simulation int...","基金项目:,国家杰出青年科学基金（T2125013）","['信息化:从计算机科学到计算科学', '科学大数据智能分析软件的现状与趋势', '中国高通...",39(1):17-26
4,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,2024,579,632,"王飞跃,1,2,,,王雨桐,1,3","WANG Fei-Yue,1,2,,,WANG Yutong,1,3",数字科学家与平行科学：AI4S和S4AI的本源与目标,Digital scientists and parallel sciences: The ...,(1.中国科学院自动化研究所 复杂系统管理与控制国家重点实验室 北京 100190;2.澳门...,(1.State Key Laboratory for Management and Con...,"中文摘要:,围绕人工智能（AI）大模型技术的最新进展，从AI4S （人工智能驱动的科学研究）...","Abstract:,Based on recent development in found...","中文关键词:,人工智能,智能科技,平行智能,基础模型,数字科学家,平行科技,分布式自主科学","keywords:,AI,intelligent science,parallel inte...","基金项目:,澳门科学技术发展基金（0093/2023/RIA2），国家自然科学基金（6153...","['人工智能驱动的科学研究新范式：从AI4S到智能科学', '适度超前推动科研基础平台建设 ...",39(1):27-33


In [27]:
df[['issue', 'page']] = df['issue'].str.split(':', n=1, expand=True)

In [38]:
df = df.rename(columns={'date':'year'})

In [39]:
df.head()

Unnamed: 0,url,year,views,downloads,author_cn,author_en,title_cn,title_en,org_cn,org_en,abstract_cn,abstract_en,keywords_cn,keywords_en,fund_project,similar,issue,page
0,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,2024,320,438,not_specified,not_specified,目录,,,,中文摘要:,Abstract:,中文关键词:,keywords:,基金项目:,[],39(1),0-0
1,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,2024,975,998,李国杰,LI Guojie,智能化科研（AI4R）：第五科研范式,AI4R: The fifth scientific research paradigm,(中国科学院计算技术研究所 北京 100190),"(Institute of Computing Technology, Chinese Ac...","中文摘要:,文章将“智能化科研”（AI4R）称为第五科研范式，概括它的一系列特征包括：（1）...","Abstract:,This article refers to “AI for Resea...","中文关键词:,智能化科研,涌现,组合爆炸问题,非确定计算,大科学模型,科研大平台","keywords:,AI4R,emergence,combinatorial explosi...",基金项目:,"['科研信息化发展态势和思考', '数据科学与计算智能：内涵、范式与机遇', '人工智能驱动...",39(1),1-9
2,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,2024,630,542,鄂维南,E Weinan,AI助力打造科学研究新范式,AI helps to establish a new paradigm for scien...,(1.北京大学 北京 100871;2.北京科学智能研究院 北京 100084),"(1.Perking University, Beijing 100871, China;2...","中文摘要:,科学研究的目的是发现基本原理和解决实际问题。尽管人类在发现基本原理和解决实际问题...","Abstract:,The main purpose of scientific resea...","中文关键词:,人工智能驱动的科学研究,科学计算,安卓模式","keywords:,scientific research driven by AI,sci...",基金项目:,"['人工智能驱动的科学研究新范式：从AI4S到智能科学', 'GPT技术变革对基础科学研究的...",39(1),10-16
3,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,2024,470,448,"谭光明,贾伟乐,王展,元国军,邵恩,孙凝晖","TAN Guangming,JIA Weile,WANG Zhan,YUAN Guojun,...",面向模拟智能的计算系统,Computing system for simulation intelligence,(中国科学院计算技术研究所 北京 100190),"(Institute of Computing Technology, Chinese Ac...","中文摘要:,科学研究中的计算机模拟称为科学模拟（scientific simulation）...","Abstract:,This study refers computer simulatio...","中文关键词:,科学模拟,模拟智能,人工智能,计算系统,Z级计算","keywords:,scientific simulation,simulation int...","基金项目:,国家杰出青年科学基金（T2125013）","['信息化:从计算机科学到计算科学', '科学大数据智能分析软件的现状与趋势', '中国高通...",39(1),17-26
4,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,2024,579,632,"王飞跃,王雨桐","WANG Fei-Yue,WANG Yutong",数字科学家与平行科学：AI4S和S4AI的本源与目标,Digital scientists and parallel sciences: The ...,(1.中国科学院自动化研究所 复杂系统管理与控制国家重点实验室 北京 100190;2.澳门...,(1.State Key Laboratory for Management and Con...,"中文摘要:,围绕人工智能（AI）大模型技术的最新进展，从AI4S （人工智能驱动的科学研究）...","Abstract:,Based on recent development in found...","中文关键词:,人工智能,智能科技,平行智能,基础模型,数字科学家,平行科技,分布式自主科学","keywords:,AI,intelligent science,parallel inte...","基金项目:,澳门科学技术发展基金（0093/2023/RIA2），国家自然科学基金（6153...","['人工智能驱动的科学研究新范式：从AI4S到智能科学', '适度超前推动科研基础平台建设 ...",39(1),27-33


## Abstracts & Keywords

Next we can remove irrelevant words like '中文摘要:', 'Abstract:', '中文关键词:', 'keywords:', '基金项目:'. This can be done through regular expression replacement. 

In [40]:
# remove redundant text and strip comas
df['abstract_cn'] = df['abstract_cn'].str.replace(
    '中文摘要:', '', regex=True).str.lstrip(',')
df['abstract_en'] = df['abstract_en'].str.replace(
    'Abstract:', '', regex=True).str.lstrip(',')
df['keywords_cn'] = df['keywords_cn'].str.replace(
    '中文关键词:', '', regex=True).str.lstrip(',')
df['keywords_en'] = df['keywords_en'].str.replace(
    'keywords:', '', regex=True).str.lstrip(',')
df['fund_project'] = df['fund_project'].str.replace(
    '基金项目:', '', regex=True).str.lstrip(',')

In [41]:
df.head()

Unnamed: 0,url,year,views,downloads,author_cn,author_en,title_cn,title_en,org_cn,org_en,abstract_cn,abstract_en,keywords_cn,keywords_en,fund_project,similar,issue,page
0,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,2024,320,438,not_specified,not_specified,目录,,,,,,,,,[],39(1),0-0
1,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,2024,975,998,李国杰,LI Guojie,智能化科研（AI4R）：第五科研范式,AI4R: The fifth scientific research paradigm,(中国科学院计算技术研究所 北京 100190),"(Institute of Computing Technology, Chinese Ac...",文章将“智能化科研”（AI4R）称为第五科研范式，概括它的一系列特征包括：（1）人工智能（A...,This article refers to “AI for Research（AI4R）”...,"智能化科研,涌现,组合爆炸问题,非确定计算,大科学模型,科研大平台","AI4R,emergence,combinatorial explosion problem...",,"['科研信息化发展态势和思考', '数据科学与计算智能：内涵、范式与机遇', '人工智能驱动...",39(1),1-9
2,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,2024,630,542,鄂维南,E Weinan,AI助力打造科学研究新范式,AI helps to establish a new paradigm for scien...,(1.北京大学 北京 100871;2.北京科学智能研究院 北京 100084),"(1.Perking University, Beijing 100871, China;2...",科学研究的目的是发现基本原理和解决实际问题。尽管人类在发现基本原理和解决实际问题上已经取得了...,The main purpose of scientific research is to ...,"人工智能驱动的科学研究,科学计算,安卓模式","scientific research driven by AI,scientific co...",,"['人工智能驱动的科学研究新范式：从AI4S到智能科学', 'GPT技术变革对基础科学研究的...",39(1),10-16
3,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,2024,470,448,"谭光明,贾伟乐,王展,元国军,邵恩,孙凝晖","TAN Guangming,JIA Weile,WANG Zhan,YUAN Guojun,...",面向模拟智能的计算系统,Computing system for simulation intelligence,(中国科学院计算技术研究所 北京 100190),"(Institute of Computing Technology, Chinese Ac...",科学研究中的计算机模拟称为科学模拟（scientific simulation），文章从其狭...,This study refers computer simulation in scien...,"科学模拟,模拟智能,人工智能,计算系统,Z级计算","scientific simulation,simulation intelligence,...",国家杰出青年科学基金（T2125013）,"['信息化:从计算机科学到计算科学', '科学大数据智能分析软件的现状与趋势', '中国高通...",39(1),17-26
4,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,2024,579,632,"王飞跃,王雨桐","WANG Fei-Yue,WANG Yutong",数字科学家与平行科学：AI4S和S4AI的本源与目标,Digital scientists and parallel sciences: The ...,(1.中国科学院自动化研究所 复杂系统管理与控制国家重点实验室 北京 100190;2.澳门...,(1.State Key Laboratory for Management and Con...,围绕人工智能（AI）大模型技术的最新进展，从AI4S （人工智能驱动的科学研究）到S4AI ...,Based on recent development in foundation mode...,"人工智能,智能科技,平行智能,基础模型,数字科学家,平行科技,分布式自主科学","AI,intelligent science,parallel intelligence,f...",澳门科学技术发展基金（0093/2023/RIA2），国家自然科学基金（61533019）,"['人工智能驱动的科学研究新范式：从AI4S到智能科学', '适度超前推动科研基础平台建设 ...",39(1),27-33


## Authors 

We need to clean the author column as well and remove extra columns, numbers and other symbols like "*".

In [42]:
def normalize_text(text):
    # remove numbers
    text = re.sub(r'\d+', '', text)

    # replace multiple commas with one
    text = re.sub(r',+', ',', text)
    # remove asterisks
    text = re.sub(r'\*', '', text)
    # remove any leading or trailing commas and whitespace
    text = text.strip(',').strip()

    return text

In [43]:
df['author_en'] = df['author_en'].fillna('not_specified')
df['author_en'] = df['author_en'].apply(normalize_text)

In [44]:
# some articles do not have a specified name of the author
# but each publication is written by a person, so instead of NaN we fill missing values with "not_specified" text
df['author_cn'] = df['author_cn'].fillna('not_specified')
df['author_cn'] = df['author_cn'].apply(normalize_text)

In [45]:
df['author_cn'].head()

0            not_specified
1                      李国杰
2                      鄂维南
3    谭光明,贾伟乐,王展,元国军,邵恩,孙凝晖
4                  王飞跃,王雨桐
Name: author_cn, dtype: object

In [None]:
authors.to_csv('data/authors_flat.csv')

## Organizations

Cleaning organization data is a tricky part, beacuse this part is the most inconsistent and messy.
Example of the organization description: (1.北京大学 北京 100871;2.北京科学智能研究院 北京 100084). One articles can be written by several people from different instituitions. The affiliation info also includes data about city, postal codes and job titles in a lot of cases.

In [42]:
orgs = df[['url', 'org_cn']].dropna()

# Replace '!' with a space and split 'org_cn' on ';'
orgs['org_cn'] = orgs['org_cn'].str.replace(
    '!', ' ', regex=True).str.split(';')

# Explode the DataFrame and strip whitespace and parentheses
orgs_expld = orgs.explode('org_cn')
orgs_expld['org_cn'] = orgs_expld['org_cn'].str.strip().str.strip('()')

# Remove sequences of digits (4 to 8 digits long) at the end and strip again
orgs_expld['org_cn'] = orgs_expld['org_cn'].str.replace(
    r'\s*\d{4,8}\s*$', ' ', regex=True).str.strip()

# Remove leading sequences of digits with optional decimals and trailing spaces or commas
# Also remove digits and spaces or commas at any position and leading non-Chinese text followed by a space
orgs_expld['org_cn'] = orgs_expld['org_cn'].str.replace(
    r'^\d+\.\d*[\s,]*|\d+[\s,]*|^.+?\s+', '', regex=True)

We don't need the postal codes and job titles for our research, but the city data can be useful for geospatial analysis.

We can get the relevant cities by checking what city from the list of Chinese cities (get it from Baidu) appears in the string.

In [43]:
city_df = pd.read_csv('data/cities.csv')

In [44]:
# create a regex from the city list
cities_list = city_df['city_cn'].tolist()
cities_pattern = r'\s*(?<=)(\s*' + \
    '|'.join(map(re.escape, cities_list)) + r')\b'

# compile the regex pattern
city_regex = re.compile(cities_pattern)

# extract the city from a string


def extract_city(text):
    match = city_regex.search(text)
    if match:
        return match.group(0)  # get the match
    return None


orgs_expld['city_cn'] = orgs_expld['org_cn'].apply(extract_city)

In [45]:
orgs_expld['city_cn'] = orgs_expld['org_cn'].apply(extract_city)
orgs_expld['city_cn'] = orgs_expld['city_cn'].str.strip()

In [47]:
mapping_dict = dict(zip(city_df['city_cn'], city_df['city_en']))
orgs_expld['city_en'] = orgs_expld['city_cn'].map(mapping_dict)

In [48]:
orgs_expld['org_cn'] = orgs_expld['org_cn'].apply(
    lambda x: city_regex.sub('', x).strip())

Next we need to remove the job titles from the strings.

For this purpose we can create a list of job titles which appear in the dataset.

In [49]:
job_titles = [
    "所长", "研究员",
    "院长", "校长",
    "主席", "总经理",
    "教授", "院士",
    "博士", "学部委员",
    "委员长", "组长",
    "主任", "处长",
    "部长", "主任",
    "党委书记", "秘书",
    "局长", "总裁",
    "台长", "名誉",
    "特邀顾问", "执行",
    "主管", "工程师",
    "专利代理人", "导师",
    "助理", "书记", "理事长", "馆长"
]

job_titles_pattern = '|'.join([fr'\s*{title}\s*' for title in job_titles])


def remove_job_titles(text):
    return re.sub(job_titles_pattern, '', text).strip()


orgs_expld['org_cn'] = orgs_expld['org_cn'].apply(remove_job_titles)

In [50]:
# Replace all occurrences of the characters in the pattern with an empty string
orgs_expld['org_cn'] = orgs_expld['org_cn'].str.replace(
    r"[、《》副]", "", regex=True)

# Remove empty parentheses
orgs_expld['org_cn'] = orgs_expld['org_cn'].str.replace(
    r"\(\)", "", regex=True)

In [52]:
orgs_expld = orgs_expld[orgs_expld.org_cn != '']
orgs_expld.head()

Unnamed: 0,url,org_cn,city_cn,city_en
2,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,北京大学,北京,Beijing
2,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,北京科学智能研究院,北京,Beijing
4,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,中国科学院自动化研究所 复杂系统管理与控制国家重点实验室,北京,Beijing
4,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,澳门科技大学 创新工程学院,澳门,Macau
4,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,中国科学院自动化研究所 多模态人工智能系统全国重点实验室,北京,Beijing


The next important issue is the degree of detail about organizations we are interested in. The affiliation data includes in some cases the title of the head organization and a subdivision (lab, office, group, ). For now we will focus on the head organizations, e.g. if there is a string like '中国科学院自动化研究所 复杂系统管理与控制国家重点实验室' we count it like '中国科学院自动化研究所'.

This also can be achieved through regular exprssions.

In [53]:
# List of head organizations
head_org_endings = ["科学院", "研究所", "研究院", "大学", "学院"]

# Create regex
pattern = '|'.join(head_org_endings)

# Regex pattern to detect Chinese characters
chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]')


def extract_head_org(text):
    if not text.strip():
        return text  # Return the original text if it is empty or only whitespace

    # Check if the text contains any Chinese characters
    if not chinese_char_pattern.search(text):
        return text  # Return the original text if it contains no Chinese characters

    # Check specifically for "中国科学院大学" ignoring other symbols or characters
    if re.search(r"中国科学院大学", text):
        return "中国科学院大学"

    if "中国科学院院刊" in text:
        return "中国科学院院刊"

    if "上海天文台" in text:
        return "中国科学院上海天文台"

    if "北京天文台" in text:
        return "中国科学院北京天文台"

    if "南京天文台" in text:
        return "中国科学院南京天文台"

    if "国家天文台" in text:
        return "中国科学院国家天文台"

    # Special case for "中国科学院" followed by "研究所", "中心", or "研究院"
    zky_with_suffix_match = re.search(r"(中国科学院.*?(研究所|中心|研究院))", text)
    if zky_with_suffix_match:
        return zky_with_suffix_match.group(1)

    # Search for the first occurrence of any of the common endings in the full text
    match = re.search(fr"(.+?({pattern}))", text)
    if match:
        # Extract and return the head organization
        return match.group(1)
    else:
        # No common ending found, consider the first part as the head organization
        return text.split()[0] if ' ' in text else text


# Apply the function to the DataFrame
orgs_expld['org_cn_head'] = orgs_expld['org_cn'].apply(extract_head_org)

In [54]:
orgs_expld.org_cn_head.value_counts().reset_index().head(10)

Unnamed: 0,org_cn_head,count
0,中国科学院,689
1,中国科学院大学,484
2,中国科学院科技战略咨询研究院,217
3,中国科学院地理科学与资源研究所,162
4,中国科学院科技政策与管理科学研究所,112
5,北京大学,72
6,中国科学院西北生态环境资源研究院,49
7,中国科学院数学与系统科学研究院,48
8,清华大学,43
9,中国科学院生态环境研究中心,38


In [55]:
orgs_head = orgs_expld['org_cn_head'].unique()

# Create a df from the unique values and save for additional processing
orgs_head_df = pd.DataFrame(orgs_head, columns=['org_cn_head'])
orgs_head_df.to_csv('data/orgs_head.csv', index=False)

In [9]:
orgs_head_clean_df = pd.read_csv('data/orgs_head_clean.csv')
orgs_head_clean_df.head()

Unnamed: 0,org_cn_head,org_cn_head_clean
0,北京师范大学,北京师范大学
1,“万种园”项目首席科学家,“万种园”项目
2,“中国科学与人文论坛”长,中国科学与人文论坛
3,“论坛”处宣传,“论坛”处宣传
4,)中国科学院,中国科学院


The next step is to translate the data to English. Note that we need official English titles, so doing everything through machine translation is not the best fit.

We can try to retrieve some English titles from Baidu using Beautiful Soup again.

In [None]:
def fetch_english_title(org_cn):
    base_url = 'https://baike.baidu.com/item/'
    url = base_url + org_cn
    print(f"Processing: {org_cn}")
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Locate the divs with class "itemWrapper_yPk3z" or "itemWrapper_qgaYJ"
        divs = soup.find_all(
            'div', class_=['itemWrapper_yPk3z', 'itemWrapper_qgaYJ'])
        for div in divs:
            # Look for the dt element with the text "外文名"
            dt = div.find('dt', class_='basicInfoItem_hdTH0 itemName_iCg2R')
            if dt and '外文名' in dt.text:
                # Find the dd element with class "basicInfoItem_hdTH0 itemValue_rxziX"
                dd = div.find(
                    'dd', class_='basicInfoItem_hdTH0 itemValue_rxziX')
                if dd:
                    # Extract English title from the span element
                    span = dd.find('span', class_='text_v1llE')
                    if span:
                        english_title = span.text.strip()
                        print(
                            f"Found English title for {org_cn}: {english_title}")
                        return english_title
        print(f"No English title found for {org_cn}")
    except Exception as e:
        print(f"Error fetching data for {org_cn}: {e}")
    return None


# Assuming orgs_head_clean_df is your DataFrame
orgs_head_clean_df['org_cn_head_en'] = orgs_head_clean_df['org_cn_head_clean'].apply(
    fetch_english_title)

# Save the DataFrame with English titles to a CSV file
orgs_head_clean_df.to_csv('data/bcas_orgs_head_clean_en.csv', index=False)

Not all organizations are present on Baidu or have English translation. We can translate these with Google Translate through `deeptranslator`.

In [None]:
from deep_translator import GoogleTranslator
from tqdm import tqdm
import pandas as pd


def get_translation(text):
    try:
        return GoogleTranslator(source='auto', target='en').translate(str(text))
    except KeyboardInterrupt as e:
        raise e
    except Exception as e:
        print(f"Error translating text: {text}. Error: {e}")
        return 'error'


tqdm.pandas()

# Apply translation only to rows where 'org_cn_head_en' is null
orgs_head_clean_df.loc[orgs_head_clean_df['org_cn_head_en'].isnull(
), 'org_cn_head_en'] = orgs_head_clean_df.loc[orgs_head_clean_df['org_cn_head_en'].isnull(), 'org_cn_head_clean'].progress_apply(get_translation)

100%|██████████| 659/659 [19:41<00:00,  1.79s/it]


In [None]:
orgs_head_clean_df.head()

Unnamed: 0,org_cn_head,org_cn_head_clean,org_cn_head_en
0,北京师范大学,北京师范大学,Beijing Normal University
1,“万种园”项目首席科学家,“万种园”项目,"""Ten Thousand Plants Garden"" Project"
2,“中国科学与人文论坛”长,中国科学与人文论坛,China Science and Humanities Forum
3,“论坛”处宣传,“论坛”处宣传,Publicity at the Forum
4,)中国科学院,中国科学院,Chinese Academy of Sciences


Finally, we add some final text processing touches in Excel/Google Sheets and update the dataframe.

In [11]:
orgs_head_clean_df = pd.read_csv('data/bcas_orgs_head_clean_en_fin.csv')
orgs_head_clean_df.head()

Unnamed: 0,org_cn_head,org_cn_head_clean,org_cn_head_en
0,北京师范大学,北京师范大学,Beijing Normal University
1,“万种园”项目首席科学家,“万种园”项目,"""Ten Thousand Plants Garden"" Project"
2,“中国科学与人文论坛”长,中国科学与人文论坛,China Science and Humanities Forum
3,“论坛”处宣传,“论坛”处宣传,Publicity at the Forum
4,)中国科学院,中国科学院,CAS


In [58]:
mapping_dict = dict(
    zip(orgs_head_clean_df['org_cn_head'], orgs_head_clean_df['org_cn_head_en']))
orgs_expld['org_cn_head_en'] = orgs_expld['org_cn_head'].map(mapping_dict)

In [60]:
orgs_expld.head()

Unnamed: 0,url,org_cn,city_cn,city_en,org_cn_head,org_cn_head_en
2,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,北京大学,北京,Beijing,北京大学,Peking University
2,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,北京科学智能研究院,北京,Beijing,北京科学智能研究院,Beijing Institute of Scientific and Intelligen...
4,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,中国科学院自动化研究所 复杂系统管理与控制国家重点实验室,北京,Beijing,中国科学院自动化研究所,"Institute of Automation, CAS"
4,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,澳门科技大学 创新工程学院,澳门,Macau,澳门科技大学,Macau University of Science and Technology
4,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,中国科学院自动化研究所 多模态人工智能系统全国重点实验室,北京,Beijing,中国科学院自动化研究所,"Institute of Automation, CAS"


In [61]:
# Create mapping dictionaries for city_cn and city_en based on org_cn_head_en
city_cn_mapping = orgs_expld.dropna(subset=['city_cn']).drop_duplicates(
    subset=['org_cn_head_en']).set_index('org_cn_head_en')['city_cn'].to_dict()
city_en_mapping = orgs_expld.dropna(subset=['city_en']).drop_duplicates(
    subset=['org_cn_head_en']).set_index('org_cn_head_en')['city_en'].to_dict()

# Fill missing values using the mapping
orgs_expld['city_cn'] = orgs_expld.apply(
    lambda row: city_cn_mapping.get(row['org_cn_head_en'], row['city_cn']), axis=1)
orgs_expld['city_en'] = orgs_expld.apply(
    lambda row: city_en_mapping.get(row['org_cn_head_en'], row['city_en']), axis=1)

In [62]:
title_dict = df.set_index('url')['title_cn'].to_dict()
year_dict = df.set_index('url')['date'].to_dict()

orgs_expld['title_cn'] = orgs_expld['url'].map(title_dict)
orgs_expld['year'] = orgs_expld['url'].map(year_dict)

In [68]:
orgs_expld = orgs_expld.rename(columns={'org_cn_head_en': 'orgs_head'})
orgs_expld['orgs_head'] = orgs_expld['orgs_head'].str.title()
orgs_expld['orgs_head'] = orgs_expld['orgs_head'].str.replace(
    'Cas', 'CAS', regex=True)
orgs_expld['orgs_head'] = orgs_expld['orgs_head'].str.replace(
    'Of', 'of', regex=True)
orgs_expld['orgs_head'] = orgs_expld['orgs_head'].str.replace(
    'And', 'and', regex=True)

In [69]:
orgs_expld.head()

Unnamed: 0,url,org_cn,city_cn,city_en,org_cn_head,orgs_head,title_cn,year
2,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,北京大学,北京,Beijing,北京大学,Peking University,AI助力打造科学研究新范式,2024
2,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,北京科学智能研究院,北京,Beijing,北京科学智能研究院,Beijing Institute of Scientific and Intelligen...,AI助力打造科学研究新范式,2024
4,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,中国科学院自动化研究所 复杂系统管理与控制国家重点实验室,北京,Beijing,中国科学院自动化研究所,"Institute of Automation, CAS",数字科学家与平行科学：AI4S和S4AI的本源与目标,2024
4,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,澳门科技大学 创新工程学院,澳门,Macau,澳门科技大学,Macau University of Science and Technology,数字科学家与平行科学：AI4S和S4AI的本源与目标,2024
4,http://old2022.bulletin.cas.cn/zgkxyyk/ch/read...,中国科学院自动化研究所 多模态人工智能系统全国重点实验室,北京,Beijing,中国科学院自动化研究所,"Institute of Automation, CAS",数字科学家与平行科学：AI4S和S4AI的本源与目标,2024


In [70]:
orgs_expld.to_csv('data/orgs_flat.csv', index=False)

## Fund Projects

Finally, we need to clean the data about associated fund projects. 

In [12]:
fund = df[['date', 'title_cn', 'fund_project']]
fund.shape

(7216, 3)

In [104]:
fund = fund[fund['fund_project'].str.strip() != '']
fund.shape

(1054, 4)

In [105]:
fund['fund_project'] = fund['fund_project'].str.replace(
    '基金项目：', '', regex=True)

In [106]:
fund['fund_project'] = (fund['fund_project'].str.replace(',', '，', regex=False)
                                            .str.replace(';', '，', regex=False)
                                            .str.replace(r'\(', '（', regex=True)
                                            .str.replace(r'\)', '）', regex=True)
                                            .str.replace('!', '，', regex=False)
                                            .str.replace('；', '，', regex=False)
                        )

In [107]:
def replace_commas_in_brackets(text):
    return re.sub(r'[（(](.*?)[）)]', lambda m: re.sub(r'[!,;，；]', '、', m.group()), text)


fund['fund_project'] = fund['fund_project'].apply(replace_commas_in_brackets)

In [108]:
fund['fund_project'] = fund['fund_project'].str.split('，')

In [110]:
fund_expld = fund.explode('fund_project').reset_index()
fund_expld['fund_project'] = fund_expld['fund_project'].str.replace('）、', '），')
fund_expld['fund_project'] = fund_expld['fund_project'].str.split('，')
fund_expld = fund_expld.explode('fund_project')

In [112]:
# drop irrelevant rows
fund_expld = fund_expld.drop(index=[1643] + list(range(1654, 1667)))
fund_expld = fund_expld.drop(index=[1645, 1646, 1839])

In [None]:
fund_expld.to_csv('data/bcas_fund_projects.csv', index=False)

In [113]:
df.to_csv('data/bcas_dataset_fin.csv', index=False)