# 下载《社会学研究》期刊文章

## 导入所需的包

In [None]:
import os,sys
import time
import re
import json
import logging
from tqdm import tqdm
import pickle

import pandas as pd
import numpy as np

from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

## 给出文件存储目录、常用函数和webdriver的基本参数

In [None]:
# Default data directories. `outpath` is also used below as Chrome's download
# directory; both names are reassigned by later cells before each stage.
outpath = '../data/'
datapath = '../data/'

In [None]:
def save_dataframe(df, file, outpath,suffix=""):
    """Persist *df* as both an Excel workbook and a pickle file.

    The files are written as ``<outpath>/<file><suffix>.xlsx`` and
    ``<outpath>/<file><suffix>.pkl``.

    Args:
        df: DataFrame to save.
        file: Base file name without extension.
        outpath: Target directory; it is created when it does not exist.
        suffix: Optional text appended to ``file`` before the extension.
    """
    filename = f"{file}{suffix}"

    # Make sure the target directory exists before writing into it.
    if outpath:
        os.makedirs(outpath, exist_ok=True)

    # ``to_excel`` no longer accepts an ``encoding`` argument (removed in
    # pandas 2.0); .xlsx files are always stored as UTF-8 XML internally.
    excel_filepath = os.path.join(outpath, f"{filename}.xlsx")
    df.to_excel(excel_filepath, index=False)

    # Pickle is a binary format, so no text encoding applies here.
    # Protocol 5 requires Python 3.8 or newer.
    pickle_filepath = os.path.join(outpath, f"{filename}.pkl")
    df.to_pickle(pickle_filepath, protocol=5)

In [None]:
# Chrome options shared by every WebDriver created in this notebook:
# never block pop-ups and send browser downloads to the absolute form of `outpath`.
options = webdriver.ChromeOptions()
out_path = os.path.abspath(outpath)
prefs = {
    'profile.default_content_settings.popups': 0,
    'download.default_directory': out_path,
}
options.add_experimental_option('prefs', prefs)

## 下载指定年份的期刊目录

In [None]:
# Directory that receives the downloaded issue (table-of-contents) pages and this stage's log.
outpath = '../data/issue_page/'

In [None]:
def get_year(prompt):
    """Prompt for a year until a valid one is entered.

    A valid year is a whole number between 1986 and the current calendar
    year, inclusive. Whitespace around the typed value is ignored.

    Args:
        prompt: Text passed to ``input``.

    Returns:
        The validated year as an ``int``.

    Raises:
        SystemExit: After five invalid entries.
    """
    # The upper bound does not change between attempts, so compute it once.
    current_year = time.localtime().tm_year

    for _ in range(5):
        year = input(prompt).strip()
        if re.fullmatch(r"\d+", year) and 1986 <= int(year) <= current_year:
            return int(year)
        print("无效的年份！请重新输入：")

    print("输入年份超过五次错误，程序终止！")
    # Raise SystemExit directly: the bare ``exit()`` helper is injected by the
    # ``site`` module for interactive use and is not guaranteed to exist.
    raise SystemExit

def year_range():
    """Interactively collect the first and last year of the download range.

    Returns:
        A ``(start_year, end_year)`` tuple of ints with
        ``start_year <= end_year``.
    """
    this_year = time.strftime("%Y%m%d_%H%M%S")[:4]
    first = get_year("请输入起始年份（1986 <= start yr <=" + this_year + "）：")
    last = get_year("请输入结束年份（1986 <= end yr <=" + this_year + "）：")

    # Keep asking for the end year until the range is no longer inverted.
    while not last >= first:
        print("结束年份必须大于或等于起始年份！")
        last = get_year("请输入结束年份：")

    return first, last

    

In [None]:
# Ask the user for the range of years to download and echo the choice.
start_year, end_year = year_range()
print("起始年份：", start_year)
print("结束年份：", end_year)

In [None]:
# Download the table-of-contents page of every issue in the chosen year range.

# Send this stage's log to its own file. ``force=True`` replaces handlers left
# behind by an earlier ``basicConfig`` call in the same kernel; without it the
# call is silently ignored once the root logger has been configured.
os.makedirs(outpath, exist_ok=True)
logging.basicConfig(
    filename=os.path.join(outpath, f'download_{start_year}_{end_year}.log'),
    level=logging.INFO,
    format='%(asctime)s:%(levelname)s:%(message)s',
    force=True,
)

# Record when the script started.
start_script_time = time.time()
logging.info("Script started")

# Selenium 4 receives the driver binary through a Service object and the
# browser options through ``options=``.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

successful_pages = []
failed_pages = []

try:
    for yr in range(start_year, end_year + 1):
        # Issue numbers 1 through 6 are requested for every year.
        for issue_num in range(1, 7):
            issue_url = f"http://shxyj.ajcass.org/Magazine/?Year={yr}&Issue={issue_num}"

            try:
                driver.get(issue_url)
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'html')))

                # The page is saved whether or not it lists any article.
                issue_content = driver.page_source
                fl_name = f"{yr}_{issue_num}"
                with open(os.path.join(outpath, f'{fl_name}.html'), 'w', encoding='utf-8') as file:
                    file.write(issue_content)

                # Classify only after the file is written, so a URL can never
                # end up in both lists. ``find_elements`` returns an empty
                # list instead of raising when nothing matches.
                if driver.find_elements(By.CSS_SELECTOR, "table#tab tr"):
                    logging.info(f"Page for Year {yr}, Issue {issue_num} has content.")
                    successful_pages.append(issue_url)
                else:
                    logging.warning(f"Page for Year {yr}, Issue {issue_num} does not have content.")
                    failed_pages.append(issue_url)

            except Exception as e:
                logging.error(f"Failed to download the page for Year {yr} Issue {issue_num}. Error: {e}")
                failed_pages.append(issue_url)
                continue
            # Pause between successful requests.
            time.sleep(2)
finally:
    # Always close the browser, even when the loop is interrupted.
    driver.quit()

# Record when the script finished.
end_script_time = time.time()
logging.info("Script ended")

# Total run time of the script.
total_time = end_script_time - start_script_time
logging.info(f"Total script run time: {total_time} seconds")

# Pages that succeeded and pages that failed.
logging.info(f"Successful pages: {successful_pages}")
logging.info(f"Failed pages: {failed_pages}")

# Year range that was processed.
logging.info(f"Start year: {start_year}, End year: {end_year}")

In [None]:
# 缺少1996 issue-3

## 清理出期刊总目录

In [None]:
# Directory holding the saved issue pages; the master index is written here as well.
outpath = '../data/issue_page/'

In [None]:
def extract_articles_to_dataframe(html_file_path):
    """Parse one saved issue page into a DataFrame with one row per article.

    Only rows of ``table#tab`` whose second cell holds a ``<ul>`` with at
    least four direct ``<li>`` children are treated as articles.

    Args:
        html_file_path: Path of the UTF-8 encoded HTML file to parse.

    Returns:
        A DataFrame with the columns ``ID``, ``标题``, ``摘要``, ``作者``,
        ``浏览次数`` and ``文章URL``. ``ID`` is the run of digits at the end of
        the article link; ``文章URL`` is the link prefixed with the site root.
    """
    with open(html_file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    data = []
    for tr in soup.select('table#tab tr'):
        td = tr.find_all('td')
        if len(td) <= 1:
            continue
        ul = td[1].find('ul')
        if ul is None:
            continue
        li_items = ul.find_all('li', recursive=False)
        if len(li_items) < 4:
            continue

        title = li_items[0].text.strip()

        # Strip before testing the prefix so leading whitespace in the markup
        # does not cause an existing abstract to be discarded.
        abstract_text = li_items[1].text.strip()
        if abstract_text.startswith('[摘要]'):
            abstract = abstract_text[len('[摘要]'):].strip()
        else:
            abstract = '无摘要'

        author = li_items[2].text.strip().replace('作者：', '').replace('\n', ', ').strip()

        # The view count is the second-to-last space-separated token.
        # ``split(' ')`` never returns an empty list, so test the length
        # explicitly instead of relying on truthiness.
        view_tokens = li_items[3].text.strip().split(' ')
        views = view_tokens[-2] if len(view_tokens) >= 2 else '0'

        link = li_items[0].find('a', href=True)
        article_url = link['href'] if link is not None else '无文章链接'

        data.append([title, abstract, author, views, article_url])

    df = pd.DataFrame(data, columns=['标题', '摘要', '作者', '浏览次数', '文章URL'])

    # Take the trailing digits of the link as the ID, then make the link absolute.
    df['ID'] = df['文章URL'].str.extract(r'(\d+)$')
    df['文章URL'] = 'http://shxyj.ajcass.org/' + df['文章URL']

    # Move the ID column to the front.
    cols = ['ID'] + [col for col in df if col != 'ID']
    df = df[cols]

    return df

In [None]:
# Collect the names of all saved issue pages ("<year>_<issue>.html").
files = [f for f in os.listdir(outpath) if f.endswith('.html')]

# Sort numerically by (year, issue) rather than alphabetically.
files_sorted = sorted(files, key=lambda x: [int(part) for part in x.split('.')[0].split('_')])

# Parse every file and gather the per-issue frames. They are concatenated once
# at the end: growing a DataFrame with ``pd.concat`` inside the loop copies
# all previously collected rows on every iteration.
frames = []
for filename in files_sorted:
    file_path = os.path.join(outpath, filename)
    year, issue = filename.split('.')[0].split('_')
    df = extract_articles_to_dataframe(file_path)
    df.insert(1, '年份', year)
    df.insert(2, '期数', issue)
    frames.append(df)

# ``pd.concat`` rejects an empty list, so fall back to an empty frame.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


In [None]:
# Write the combined index to master_index.xlsx and master_index.pkl under `outpath`.
save_dataframe(all_data, "master_index", outpath)

## 添加每篇文章的信息

In [None]:
# Input: directory containing master_index.pkl. Output: directory for the article pages.
datapath = '../data/issue_page/'
outpath = '../data/article_page/'

In [None]:
# Load the master index produced by the previous stage and preview its first rows.
file_path = os.path.join(datapath, 'master_index.pkl')
df = pd.read_pickle(file_path)
print(df.head())

### 下载每篇文章的介绍页面

In [None]:
# Download the detail page of every article listed in `df`.

# Send this stage's log to its own file. ``force=True`` replaces handlers left
# behind by an earlier ``basicConfig`` call in the same kernel; without it the
# call is silently ignored and messages keep going to the previous log file.
os.makedirs(outpath, exist_ok=True)
logging.basicConfig(
    filename=os.path.join(outpath, 'article_info.log'),
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Record the start time.
start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
logging.info(f"开始执行下载任务，起始时间: {start_time}")

# Selenium 4 receives the driver binary through a Service object and the
# browser options through ``options=``.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# IDs of the articles whose page could not be downloaded.
failed_articles = []

try:
    for index, row in tqdm(df.iterrows(), total=len(df), desc="下载进度"):
        article_id = row['ID']
        article_url = row['文章URL']

        try:
            # Fetch the page with Selenium.
            driver.get(article_url)
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'html')))
            article_content = driver.page_source

            # Save the page as <ID>.html.
            with open(os.path.join(outpath, f'{article_id}.html'), 'w', encoding='utf-8') as file:
                file.write(article_content)

            logging.info(f"成功下载文章 {article_id}")

        except Exception as e:
            # Log the failure and remember the ID; the loop moves on to the next article.
            logging.error(f"下载文章 {article_id} 失败: {e}")
            failed_articles.append(article_id)
finally:
    # Always close the browser, even when the loop is interrupted.
    driver.quit()

# Record the end time.
end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
logging.info(f"下载任务执行完成，截止时间: {end_time}")

# Summary statistics.
logging.info(f"总文章数: {len(df)}")
logging.info(f"成功下载文章数: {len(df)-len(failed_articles)}")
logging.info(f"下载失败文章数: {len(failed_articles)}")

# List every failed ID.
if failed_articles:
    logging.warning(f"以下文章下载失败: {failed_articles}")
else:
    logging.info("所有文章下载成功。")

### 继续提取信息

In [None]:
# Output key -> CSS selector of the cell that follows the matching label cell.
# NOTE(review): soupsieve documents ``:contains`` as deprecated in favour of
# ``:-soup-contains``; the selectors are kept as they were so results do not
# depend on the installed soupsieve version.
_ARTICLE_FIELD_SELECTORS = {
    '英文标题': 'td:contains("英文标题") + td',
    '英文摘要': 'td:contains("英文摘要") + td',
    '作者单位': 'td:contains("作者单位") + td',
    '期刊': 'td:contains("期刊") + td a',
    '年.期:页码': 'td:contains("年.期:页码") + td',
    '中图分类号': 'td:contains("中图分类号") + td',
    '关键词': 'td:contains("关键词") + td',
    '英文关键词': 'td:contains("英文关键词") + td',
    '项目基金': 'td:contains("项目基金") + td',
}

# Links to the full-text file start with this path.
_DOWNLOAD_HREF_RE = re.compile(r'^/Admin/UploadFile/')

# Placeholder stored for every value that could not be extracted.
_MISSING_VALUE = '-1'


def extract_info(file_path):
    """Extract the metadata of one saved article page.

    Every field is looked up independently, so a field missing from the page
    only affects its own entry. Problems are logged, never raised.

    Args:
        file_path: Path of the UTF-8 encoded article HTML file.

    Returns:
        A dict with the keys ``英文标题``, ``英文摘要``, ``作者单位``, ``期刊``,
        ``年.期:页码``, ``中图分类号``, ``关键词``, ``英文关键词``, ``项目基金`` and
        ``下载链接``. Values that cannot be extracted are ``'-1'``.
    """
    info_dict = {key: _MISSING_VALUE for key in _ARTICLE_FIELD_SELECTORS}
    info_dict['下载链接'] = _MISSING_VALUE

    # An unreadable file yields a dict of placeholders, as before.
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file.read(), 'html.parser')
    except Exception as e:
        logging.error(f"Error extracting information from {file_path}: {str(e)}")
        return info_dict

    for key, selector in _ARTICLE_FIELD_SELECTORS.items():
        cell = soup.select_one(selector)
        if cell is None:
            logging.error(f"Error extracting information from {file_path}: field '{key}' not found")
            continue
        info_dict[key] = cell.text.strip()

    # The download link is the first anchor pointing at the upload directory.
    anchor = soup.find('a', href=_DOWNLOAD_HREF_RE)
    if anchor is None:
        logging.error(f"Error extracting information from {file_path}: download link not found")
    else:
        info_dict['下载链接'] = 'http://shxyj.ajcass.org' + anchor['href']

    return info_dict

In [None]:
# Send this stage's log to its own file. ``force=True`` replaces handlers left
# behind by an earlier ``basicConfig`` call in the same kernel; without it the
# call is silently ignored and messages keep going to the previous log file.
logging.basicConfig(
    filename=outpath+'Article_Index_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Register tqdm's progress helpers on pandas objects.
tqdm.pandas()

# One dict of extracted fields per article.
articles_info = []

# Parse the saved page of every article in the index.
for article_id in tqdm(df['ID'], desc='Processing Articles'):
    file_name = f'{article_id}.html'
    file_path = os.path.join(outpath, file_name)

    # extract_info always returns a dict, so every article gets a row.
    info_dict = extract_info(file_path)
    info_dict['ID']= article_id
    articles_info.append(info_dict)

# Turn the list of dicts into a DataFrame.
articles_info_df = pd.DataFrame(articles_info)

In [None]:
# Join the master index with the per-article details on the shared 'ID' column.
# how='right' keeps every row of articles_info_df, including IDs that are absent from df.
merged_df = pd.merge(df, articles_info_df, on='ID', how='right')

In [None]:
# Manually overwrite the fields of article 72349, whose extracted cells were wrong.
row_mask = merged_df['ID'] == '72349'
manual_fix = {
    '英文标题': "State , Market and Society: The multidimensional impetus of reducing the Qinhuai River's pollution",
    '英文摘要': "Since 1950, the Qinhuai River in Nanjing city has undergone five times of large-scale renovations. ... (以下是英文摘要的修改内容)",
    '作者单位': "南京大学社会学系 上海高校社会学E-研究院(上海大学)",
    '期刊': "社会学研究",
    '年.期:页码': "2008.1:143-164",
    '中图分类号': "X321",
    '文章编号': "",  # the article number is set to an empty string
    '关键词': "国家； 市场与社会； 污染治理； 中国特色；",
    '英文关键词': "",
    '项目基金': "教育部重大攻关课题“中国城市化理论重构与城市化发展战略”(课题项目批准号:05JZD0038)的成果之一;上海高校社会学E-研究院(上海大学)资助",
}
# Apply the corrections one column at a time, in the order listed above.
for column, value in manual_fix.items():
    merged_df.loc[row_mask, column] = value


In [None]:
# Mark missing values and empty strings with one sentinel, the string '-1',
# which is also what extract_info stores for fields it cannot find. Mixing
# the string '-1' with the integer -1 makes later equality checks miss rows.
merged_df = merged_df.fillna('-1').replace("", '-1')

In [None]:
# Write the merged table to Article_Index.xlsx and Article_Index.pkl under `outpath`.
save_dataframe(merged_df, "Article_Index", outpath)

## 下载文献原文

In [None]:
# Input: directory containing Article_Index.pkl. Output: directory for the downloaded full-text files.
datapath = '../data/article_page/'
outpath = '../data/pdf_file/'

In [None]:
# Load the article index produced by the previous stage and preview its first rows.
file_path = os.path.join(datapath, 'Article_Index.pkl')
df = pd.read_pickle(file_path)
print(df.head())

### 补救措施

In [None]:
## If the download stopped part-way, set `resume_id` to the article ID where it
## should pick up again, then re-run this cell and the cells after it.
resume_id = "84242"

# Index label of the first row carrying that ID.
start_index = df[df['ID'] == resume_id].index[0]

# Keep that row and every row after it. Slice by label: `start_index` is an
# index label, which equals the row position only while `df` still has its
# original 0..n-1 index (no longer true after this cell has run once).
df = df.loc[start_index:]
len(df)

In [None]:
# Read the raw bytes of the failed-ID list stored under `outpath`.
with open(outpath+'failed_ids.pkl', 'rb') as f:
    bytes_data = f.read()

# Deserialize the bytes back into a Python object.
# NOTE(review): unpickling can execute arbitrary code; only load files this notebook wrote itself.
failed_ids = pickle.loads(bytes_data)

In [None]:
##储存出错id
#bytes_data = pickle.dumps(failed_ids)
#with open(outpath+'failed_ids.pkl', 'wb') as f:
#    f.write(bytes_data)

### 下载

In [None]:
# Start the download stage with an empty failure list.
# NOTE: running this cell discards a list previously loaded from failed_ids.pkl.
failed_ids = []  # IDs whose download failed

In [None]:
# Download the full-text file of every article that has a download link.

def _record_failed_id(article_id):
    """Remember a failed ID and rewrite failed_ids.pkl with the updated list."""
    failed_ids.append(article_id)
    with open(outpath+'failed_ids.pkl', 'wb') as f:
        pickle.dump(failed_ids, f)


# Send this stage's log to its own file. ``force=True`` replaces handlers left
# behind by an earlier ``basicConfig`` call in the same kernel; without it the
# call is silently ignored and messages keep going to the previous log file.
os.makedirs(outpath, exist_ok=True)
logging.basicConfig(
    filename=outpath+'download_full_text.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Only the ID and the download link are needed here.
id_and_links = df[['ID', '下载链接']]

# Selenium 4 receives the driver binary through a Service object and the
# browser options through ``options=``.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

try:
    for index, row in tqdm(id_and_links.iterrows(), total=len(id_and_links), desc="Downloading"):
        article_id = row['ID']
        download_link = row['下载链接']

        # A missing link is stored as a sentinel. Accept both the integer and
        # the string form, because the index contains the string '-1'.
        if download_link in (-1, '-1'):
            continue

        try:
            # NOTE(review): nothing from this browser visit is used below and
            # requests does not share the browser session; this call looks
            # redundant - confirm before removing it.
            driver.get(download_link)

            # Keep the extension used in the link.
            file_extension = urlparse(download_link).path.split('.')[-1]

            # Stream the file with requests. The ``with`` block releases the
            # connection even when writing fails half-way.
            with requests.get(download_link, stream=True,timeout=40) as response:
                response.raise_for_status()

                # Name the file after the article ID.
                file_path = os.path.join(outpath, f'{article_id}.{file_extension}')

                with open(file_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)

            logging.info(f"Downloaded file for ID {article_id} and saved as {file_path}")
        except requests.exceptions.RequestException as e:
            logging.error(f"Error downloading file for ID {article_id}: {e}")
            _record_failed_id(article_id)
        except Exception as e:
            logging.error(f"Other error downloading file for ID {article_id}: {e}")
            _record_failed_id(article_id)
finally:
    # Always close the browser, even when the loop is interrupted.
    driver.quit()

# Summarize the failed IDs in the log.
if failed_ids:
    logging.warning(f"Download failed for the following IDs: {failed_ids}")


## 检查下载结果

In [None]:
# Input: directory containing Article_Index.pkl. `outpath` is the directory scanned for downloaded files.
datapath = '../data/article_page/'
outpath = '../data/pdf_file/'

In [None]:
# Reload the complete article index and preview its first rows.
file_path = os.path.join(datapath, 'Article_Index.pkl')
df = pd.read_pickle(file_path)
print(df.head())

In [None]:
# List every .pdf file found directly under `outpath`.
pdf_files = [f for f in os.listdir(outpath) if f.endswith('.pdf')]

# File names without the extension, i.e. the IDs that were downloaded.
succ = [os.path.splitext(f)[0] for f in pdf_files]

# Every ID in the article index.
total = df['ID'].tolist()

# IDs in the index that have no matching .pdf file. Membership is tested
# against a set so the check does not rescan the whole file list for each ID.
downloaded_ids = set(succ)
fail = [id_ for id_ in total if id_ not in downloaded_ids]

# Show the result.
print("fail:", fail)