#### API爬取文献信息

In [1]:
import os
import requests
import json
import time

# --- Configuration ---
# Contact address placed in the User-Agent header of every request.
EMAIL = "2224272392@ybu.edu.cn"
# Works endpoint that is paged through for each topic.
BASE_URL = "https://api.openalex.org/works"
# Root output folder, resolved relative to the working directory.
BASE_DIR = os.path.join(os.pardir, os.pardir, "data_origin", "OpenAlex_paper")

# Topic id -> folder name used for that topic's downloaded data.
TOPICS = dict(
    T10471="Climate_Change_Policy_and_Economics",
    T12013="Sustainable_Development_and_Env_Policy",
    T12656="Climate_Adaptation_and_Migration",
    T11488="Climate_Communication_and_Perception",
)

def fetch_raw_data(tid, tname, max_retries=5, retry_wait=3):
    """Download all works of one OpenAlex topic into ``Raw_Works.jsonl``.

    Pages through ``BASE_URL`` with cursor pagination (200 works per page,
    sorted by publication date, newest first) and writes every work as one
    JSON line to ``BASE_DIR/<tname>/Raw_Works.jsonl``. The file is opened in
    write mode, so an earlier download for the same topic is overwritten.

    Transient failures (connection errors, timeouts, undecodable bodies,
    HTTP 429 and 5xx) are retried on the same cursor, but only
    ``max_retries`` times in a row; any other non-200 status stops the
    download immediately. Records already written are kept in both cases
    and a warning is printed so a truncated file is not mistaken for a
    complete one.

    Args:
        tid: Topic id used in the ``topics.id`` filter.
        tname: Folder name for this topic's output.
        max_retries: Consecutive failed attempts tolerated for one page.
        retry_wait: Seconds to sleep before retrying a failed page.

    Returns:
        None. Progress and the final record count are printed.
    """
    print(f"\n[Start] {tname} ({tid})")

    # Create directory
    save_dir = os.path.join(BASE_DIR, tname)
    os.makedirs(save_dir, exist_ok=True)
    raw_path = os.path.join(save_dir, "Raw_Works.jsonl")

    # API Params ("*" asks the API to start a new cursor-paged result set)
    params = {
        "filter": f"topics.id:{tid}",
        "per_page": 200,
        "cursor": "*",
        "sort": "publication_date:desc"
    }

    headers = {
        "User-Agent": f"PythonScript/1.0 (mailto:{EMAIL})",
        "Accept-Encoding": "gzip, deflate"
    }

    count = 0
    total_count = None
    failures = 0      # consecutive failed attempts on the current page
    aborted = False   # True when the loop stops before the last page

    with open(raw_path, "w", encoding="utf-8") as f:
        while params["cursor"]:
            data = None
            error = None
            # Only the request/decoding step is guarded, so file-system or
            # programming errors surface instead of being retried forever.
            try:
                resp = requests.get(BASE_URL, params=params, headers=headers, timeout=60)
                if resp.status_code == 200:
                    data = resp.json()
                elif resp.status_code == 429 or resp.status_code >= 500:
                    error = f"HTTP {resp.status_code}"
                else:
                    print(f"Error: {resp.status_code}")
                    aborted = True
                    break
            except (requests.RequestException, ValueError) as e:
                error = str(e)

            if error is not None:
                failures += 1
                if failures > max_retries:
                    print(f"Network error: {error}, giving up after {max_retries} retries")
                    aborted = True
                    break
                print(f"Network error: {error}, retrying in {retry_wait}s...")
                time.sleep(retry_wait)
                continue  # same cursor, nothing was written for this page

            failures = 0
            meta = data.get("meta") or {}

            # Get estimation from the first page metadata
            if total_count is None:
                total_count = meta.get("count", 0)
                print(f"Total Estimation: {total_count}")

            results = data.get("results", [])
            if not results:
                break

            # Write to file
            for work in results:
                f.write(json.dumps(work, ensure_ascii=False) + "\n")

            count += len(results)

            # Print progress every 1000 items
            if count % 1000 == 0:
                pct = (count / total_count * 100) if total_count else 0
                print(f"   ...Fetched {count} / {total_count} ({pct:.1f}%)")

            # Update cursor; a missing/None cursor ends the loop
            params["cursor"] = meta.get("next_cursor")

    print(f"[Done] Saved {count} records to {raw_path}")
    if aborted:
        print(f"[Warning] Download of {tname} stopped early; {raw_path} is incomplete")

# --- Execution ---
# Report the resolved output root, then download the topics one by one.
print(f"Data Path: {os.path.abspath(BASE_DIR)}")
for tid in TOPICS:
    tname = TOPICS[tid]
    fetch_raw_data(tid, tname)

Data Path: f:\Desktop\科研项目\1.负责科研项目\Climate Policy\CAMPF_Supplementary_V2\data_origin\OpenAlex_paper

[Start] Climate_Change_Policy_and_Economics (T10471)
Total Estimation: 211646
   ...Fetched 1000 / 211646 (0.5%)
   ...Fetched 2000 / 211646 (0.9%)
   ...Fetched 3000 / 211646 (1.4%)
   ...Fetched 4000 / 211646 (1.9%)
   ...Fetched 5000 / 211646 (2.4%)
   ...Fetched 6000 / 211646 (2.8%)
   ...Fetched 7000 / 211646 (3.3%)
   ...Fetched 8000 / 211646 (3.8%)
   ...Fetched 9000 / 211646 (4.3%)
   ...Fetched 10000 / 211646 (4.7%)
   ...Fetched 11000 / 211646 (5.2%)
   ...Fetched 12000 / 211646 (5.7%)
   ...Fetched 13000 / 211646 (6.1%)
   ...Fetched 14000 / 211646 (6.6%)
   ...Fetched 15000 / 211646 (7.1%)
   ...Fetched 16000 / 211646 (7.6%)
   ...Fetched 17000 / 211646 (8.0%)
   ...Fetched 18000 / 211646 (8.5%)
   ...Fetched 19000 / 211646 (9.0%)
   ...Fetched 20000 / 211646 (9.4%)
   ...Fetched 21000 / 211646 (9.9%)
   ...Fetched 22000 / 211646 (10.4%)
   ...Fetched 23000 / 211646 (10.9%)

#### OD构建

In [1]:
import os
import json
import itertools # 用于生成两两组合
from collections import defaultdict

# --- Configuration ---
# Folder that holds one sub-folder of raw downloads per topic.
BASE_DIR = os.path.join(os.pardir, os.pardir, "data_origin", "OpenAlex_paper")
# Destination of the merged, cross-topic outputs.
TOTAL_DIR = os.path.join(BASE_DIR, "Total_Data_Analysis")

# Topic id -> name of that topic's folder under BASE_DIR.
TOPICS = dict(
    T10471="Climate_Change_Policy_and_Economics",
    T12013="Sustainable_Development_and_Env_Policy",
    T12656="Climate_Adaptation_and_Migration",
    T11488="Climate_Communication_and_Perception",
)

# --- Helpers ---
def get_region_label(is_south):
    """Translate an ``is_south`` flag into a region name.

    Only the exact objects ``True`` and ``False`` are recognised; every
    other value (including ``None``, ``0`` and ``1``) yields "Unknown".
    """
    if is_south is True:
        label = "Global South"
    elif is_south is False:
        label = "Global North"
    else:
        label = "Unknown"
    return label

def get_interaction_type(src_south, tgt_south):
    """Classify a country pair by how many of its two ends are in the South.

    Returns "Unknown" when either flag is ``None``; otherwise "North-North"
    for zero southern ends, "South-South" for two, and "North-South" for
    anything else.
    """
    both_known = src_south is not None and tgt_south is not None
    if not both_known:
        return "Unknown"

    south_ends = int(src_south) + int(tgt_south)
    same_side = {0: "North-North", 2: "South-South"}
    return same_side.get(south_ends, "North-South")

def _new_country_stats():
    """Create an empty per-country accumulator (score, region flag, count)."""
    return defaultdict(lambda: {"score": 0.0, "is_south": None, "count": 0})


def _add_country_scores(stats, countries, country_map):
    """Credit every country of one work with an equal 1/N share in ``stats``."""
    share = 1.0 / len(countries)
    for cc in countries:
        entry = stats[cc]
        entry["score"] += share
        entry["count"] += 1
        # Keep the first non-None region flag seen for this country.
        if entry["is_south"] is None:
            entry["is_south"] = country_map[cc]


def _write_baseline(path, stats):
    """Write per-country totals to ``path`` as JSONL, highest score first."""
    if not stats:
        return
    with open(path, "w", encoding="utf-8") as f_base:
        for cc, val in sorted(stats.items(), key=lambda x: -x[1]["score"]):
            f_base.write(json.dumps({
                "country": cc,
                "score": val["score"],
                "count": val["count"],
                "region": get_region_label(val["is_south"])
            }, ensure_ascii=False) + "\n")


def _build_rich_record(work):
    """Serialise the trimmed metadata of one work for Works_Enriched.jsonl."""
    sdgs = [s.get("display_name") for s in (work.get("sustainable_development_goals") or [])]
    concepts = [
        {"name": c.get("display_name"), "score": c.get("score")}
        for c in (work.get("concepts") or [])[:5]
    ]
    grants = [
        g.get("funder_display_name") or g.get("funder")
        for g in (work.get("grants") or [])
        if g.get("funder")
    ]

    # Each level may be present but null, so fall back to {} at every step.
    primary_loc = work.get("primary_location") or {}
    source = primary_loc.get("source") or {}
    open_access = work.get("open_access") or {}

    return json.dumps({
        "work_id": work.get("id"),
        "title": work.get("title"),
        "year": work.get("publication_year"),
        "doi": work.get("doi"),
        "cited_by": work.get("cited_by_count"),
        "fwci": work.get("fwci"),
        "is_oa": open_access.get("is_oa"),
        "journal": source.get("display_name"),
        "sdgs": sdgs,
        "concepts": concepts,
        "funders": grants
    }, ensure_ascii=False)


def _extract_country_map(work):
    """Map each country code of a work to its ``is_global_south`` flag.

    Only the first listed institution of each authorship is considered.
    """
    country_map = {}
    for auth in work.get("authorships") or []:
        insts = auth.get("institutions") or []
        if not insts:
            continue
        inst = insts[0]
        cc = inst.get("country_code")
        if cc:
            country_map[cc] = inst.get("is_global_south")
    return country_map


def _build_edges(wid, year, countries, country_map):
    """Return one JSON-encoded edge per unordered pair of ``countries``.

    ``countries`` must be sorted so that source/target order is stable.
    The weights of all edges of one work sum to 1.0.
    """
    n = len(countries)
    if n < 2:
        return []

    # Complete graph on N nodes has N * (N - 1) / 2 edges.
    weight = 1.0 / (n * (n - 1) / 2)
    return [
        json.dumps({
            "work_id": wid,
            "year": year,
            "source": c1,  # Alphabetically first
            "target": c2,  # Alphabetically second
            "weight": weight,
            "interaction": get_interaction_type(country_map[c1], country_map[c2])
        }, ensure_ascii=False)
        for c1, c2 in itertools.combinations(countries, 2)
    ]


def process_all_data():
    """Build collaboration networks and country baselines from raw works.

    For every topic in ``TOPICS`` whose ``Raw_Works.jsonl`` exists, writes
    into ``<topic>/OD_Build``:

    * ``Works_Enriched.jsonl`` - trimmed metadata, one line per work;
    * ``OD_Network.jsonl`` - one edge per country pair of each work;
    * ``Country_Baseline.jsonl`` - per-country score and work count.

    The same three files are written to ``TOTAL_DIR`` for all topics
    combined, where a work id that was already seen in an earlier line or
    topic is skipped. Lines that are not valid JSON objects are ignored.

    Returns:
        None. Progress is printed.
    """
    # 1. Setup Global Writers
    os.makedirs(TOTAL_DIR, exist_ok=True)
    path_total_net = os.path.join(TOTAL_DIR, "OD_Network.jsonl")
    path_total_rich = os.path.join(TOTAL_DIR, "Works_Enriched.jsonl")
    path_total_base = os.path.join(TOTAL_DIR, "Country_Baseline.jsonl")

    global_stats = _new_country_stats()
    seen_work_ids = set()

    print(f"Root Path: {os.path.abspath(BASE_DIR)}")
    print(f"Total Analysis Path: {os.path.abspath(TOTAL_DIR)}\n")

    # Context managers guarantee the handles are closed even if a record
    # raises half-way through.
    with open(path_total_net, "w", encoding="utf-8") as f_total_net, \
            open(path_total_rich, "w", encoding="utf-8") as f_total_rich:

        # 2. Iterate Topics
        for tname in TOPICS.values():
            topic_dir = os.path.join(BASE_DIR, tname)
            raw_path = os.path.join(topic_dir, "Raw_Works.jsonl")

            if not os.path.exists(raw_path):
                print(f"Skipping: {tname}")
                continue

            output_dir = os.path.join(topic_dir, "OD_Build")
            os.makedirs(output_dir, exist_ok=True)
            path_topic_net = os.path.join(output_dir, "OD_Network.jsonl")
            path_topic_rich = os.path.join(output_dir, "Works_Enriched.jsonl")
            path_topic_base = os.path.join(output_dir, "Country_Baseline.jsonl")

            topic_stats = _new_country_stats()

            print(f"[Processing] {tname}")

            with open(raw_path, "r", encoding="utf-8") as f_in, \
                    open(path_topic_net, "w", encoding="utf-8") as f_topic_net, \
                    open(path_topic_rich, "w", encoding="utf-8") as f_topic_rich:
                for line in f_in:
                    try:
                        work = json.loads(line)
                    except json.JSONDecodeError:
                        continue  # blank or corrupt line
                    if not isinstance(work, dict):
                        continue  # valid JSON, but not a work record

                    wid = work.get("id")
                    year = work.get("publication_year")

                    rich_data = _build_rich_record(work)

                    # Sorted so a pair is always written in the same
                    # direction (e.g. always CN-US, never US-CN).
                    country_map = _extract_country_map(work)
                    unique_countries = sorted(country_map)
                    edges = _build_edges(wid, year, unique_countries, country_map)

                    # Topic level: every line of the topic file counts.
                    if unique_countries:
                        _add_country_scores(topic_stats, unique_countries, country_map)
                    f_topic_rich.write(rich_data + "\n")
                    for e in edges:
                        f_topic_net.write(e + "\n")

                    # Global level: only the first occurrence of a work id.
                    if wid not in seen_work_ids:
                        f_total_rich.write(rich_data + "\n")
                        for e in edges:
                            f_total_net.write(e + "\n")
                        if unique_countries:
                            _add_country_scores(global_stats, unique_countries, country_map)
                        seen_work_ids.add(wid)

            _write_baseline(path_topic_base, topic_stats)
            print(f"   Saved: .../{tname}/OD_Build")

    _write_baseline(path_total_base, global_stats)

    print(f"\n[Done] Processed {len(seen_work_ids)} unique works.")
    print(f"Merged output: {os.path.abspath(TOTAL_DIR)}")

# --- Run ---
# Executed at top level: builds the per-topic and merged outputs immediately.
process_all_data()

Root Path: f:\Desktop\科研项目\1.负责科研项目\Climate Policy\CAMPF_Supplementary_V2\data_origin\OpenAlex_paper
Total Analysis Path: f:\Desktop\科研项目\1.负责科研项目\Climate Policy\CAMPF_Supplementary_V2\data_origin\OpenAlex_paper\Total_Data_Analysis

[Processing] Climate_Change_Policy_and_Economics
   Saved: .../Climate_Change_Policy_and_Economics/OD_Build
[Processing] Sustainable_Development_and_Env_Policy
   Saved: .../Sustainable_Development_and_Env_Policy/OD_Build
[Processing] Climate_Adaptation_and_Migration
   Saved: .../Climate_Adaptation_and_Migration/OD_Build
[Processing] Climate_Communication_and_Perception
   Saved: .../Climate_Communication_and_Perception/OD_Build

[Done] Processed 468730 unique works.
Merged output: f:\Desktop\科研项目\1.负责科研项目\Climate Policy\CAMPF_Supplementary_V2\data_origin\OpenAlex_paper\Total_Data_Analysis


#### 归一化及矩阵

In [1]:
import pandas as pd
import numpy as np
import json
import os

# ================= Configuration =================
# Country list that supplies the ISO-2 -> ISO-3 mapping.
JSON_PATH = "../country_list.json"
# Root folder of the per-topic and merged OD networks.
BASE_DIR = "../../data_origin/OpenAlex_paper"
# Folder that receives the generated CSV matrices.
OUTPUT_DIR = "../../data"

# Topic id -> name of that topic's folder under BASE_DIR.
TOPICS = dict(
    T10471="Climate_Change_Policy_and_Economics",
    T12013="Sustainable_Development_and_Env_Policy",
    T12656="Climate_Adaptation_and_Migration",
    T11488="Climate_Communication_and_Perception",
)

# Inclusive range of publication years kept in the matrices.
YEAR_START = 2005
YEAR_END = 2023

# Make sure the CSV output folder exists; no error if it is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ================= Logic =================

def get_iso_mapping(json_path: str) -> dict:
    """
    Load the country list and build an ISO-2 -> ISO-3 lookup.

    The JSON file must hold a top-level 'countries' list whose entries
    carry an 'iso_2' key (lookup key) and an 'iso' key (lookup value).
    Example result: {'CN': 'CHN', 'US': 'USA'}
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    mapping = {}
    for entry in payload['countries']:
        mapping[entry['iso_2']] = entry['iso']
    return mapping

def generate_matrix(input_path: str, output_path: str, iso_map: dict,
                    year_start: "int | None" = None,
                    year_end: "int | None" = None) -> None:
    """
    Turn an OD network (JSONL, ISO-2 codes) into a normalized country matrix.

    Steps: keep edges inside the year window whose two ends are both keys
    of `iso_map`; sum weights per pair and year; mirror every pair so the
    matrix is symmetric; apply log1p; min-max scale within each year;
    average each pair over the years in which it occurs; pivot to a square
    matrix over all `iso_map` keys (missing pairs = 0, diagonal = 0);
    relabel to ISO-3, sort both axes and save as CSV.

    Args:
        input_path: OD_Network.jsonl file with 'year', 'source', 'target'
            and 'weight' fields on every line.
        output_path: Destination CSV path.
        iso_map: ISO-2 -> ISO-3 mapping; its keys define the matrix scope.
        year_start: First year kept (inclusive). Defaults to YEAR_START.
        year_end: Last year kept (inclusive). Defaults to YEAR_END.

    Returns:
        None. Prints a "Skipping" notice and writes nothing when the input
        is missing, empty, or has no edge left after filtering.
    """
    if not os.path.exists(input_path):
        print(f"Skipping (File not found): {input_path}")
        return

    # 1. Load Data
    with open(input_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    if not data:
        print(f"Skipping (Empty file): {input_path}")
        return

    df = pd.DataFrame(data)

    # Explicit arguments win; otherwise fall back to the module-level window.
    first_year = YEAR_START if year_start is None else year_start
    last_year = YEAR_END if year_end is None else year_end

    # Target keys for filtering are the ISO-2 codes
    target_iso2 = list(iso_map.keys())

    # 2. Filter Scope (using ISO-2)
    df = df[
        (df['year'] >= first_year) &
        (df['year'] <= last_year) &
        (df['source'].isin(target_iso2)) &
        (df['target'].isin(target_iso2))
    ]

    if df.empty:
        print(f"Skipping (No valid data): {input_path}")
        return

    # 3. Aggregate (Sum weights per pair per year)
    df_agg = df.groupby(['source', 'target', 'year'])['weight'].sum().reset_index()

    # 4. Symmetrize: stack every edge with its reversed copy
    df_sym = pd.concat([
        df_agg.rename(columns={'source': 'ccode1', 'target': 'ccode2'}),
        df_agg.rename(columns={'source': 'ccode2', 'target': 'ccode1'})
    ], ignore_index=True)

    # Consolidate sums
    df_sym = df_sym.groupby(['ccode1', 'ccode2', 'year'])['weight'].sum().reset_index()

    # 5. Log Transformation & Yearly Normalization
    # log1p is applied before the min-max scaling
    df_sym['weight'] = np.log1p(df_sym['weight'])

    grouped = df_sym.groupby('year')['weight']
    min_vals = grouped.transform('min')
    max_vals = grouped.transform('max')
    denominator = max_vals - min_vals

    # A year whose pairs all share one weight has no spread; score it 0.
    df_sym['norm_score'] = np.where(
        denominator == 0,
        0.0,
        (df_sym['weight'] - min_vals) / denominator
    )

    # 6. Average over years
    # NOTE(review): the mean only covers years in which the pair occurs;
    # years without any joint work are not counted as 0 - confirm intended.
    df_final = df_sym.groupby(['ccode1', 'ccode2'])['norm_score'].mean().reset_index()

    # 7. Pivot & Polish
    matrix = df_final.pivot(index='ccode1', columns='ccode2', values='norm_score')

    # [CRITICAL] Reindex using ISO-2 first to align with data
    matrix = matrix.reindex(index=target_iso2, columns=target_iso2).fillna(0)

    # [CRITICAL] Map Index and Columns from ISO-2 to ISO-3
    matrix.index = matrix.index.map(iso_map)
    matrix.columns = matrix.columns.map(iso_map)

    # Zero the diagonal on an explicit writable copy: `matrix.values` may be
    # read-only (pandas Copy-on-Write) or a detached copy, so writing through
    # it can raise or be silently lost.
    cells = matrix.to_numpy(dtype=float, copy=True)
    np.fill_diagonal(cells, 0.0)
    matrix = pd.DataFrame(cells, index=matrix.index, columns=matrix.columns)

    # Sort both axes by ISO-3
    matrix = matrix.sort_index(axis=0).sort_index(axis=1)

    # 8. Save
    matrix.to_csv(output_path)
    print(f"Saved: {os.path.basename(output_path)} (Shape: {matrix.shape})")

# ================= Execution =================

# ISO-2 -> ISO-3 lookup shared by every matrix, e.g. {'CN': 'CHN', ...}
iso_map = get_iso_mapping(JSON_PATH)

# 1. One matrix per topic, read from that topic's OD_Build folder
for tid in TOPICS:
    tname = TOPICS[tid]
    filename = f"1-7-{tid}-Paper_collab-{tname}.csv"
    in_path = os.path.join(BASE_DIR, tname, "OD_Build", "OD_Network.jsonl")
    out_path = os.path.join(OUTPUT_DIR, filename)

    generate_matrix(in_path, out_path, iso_map)

# 2. One matrix for the merged network in Total_Data_Analysis
total_out_path = os.path.join(OUTPUT_DIR, "1-7-All-Paper_collab-Total.csv")
total_in_path = os.path.join(BASE_DIR, "Total_Data_Analysis", "OD_Network.jsonl")

generate_matrix(total_in_path, total_out_path, iso_map)

Saved: 1-7-T10471-Paper_collab-Climate_Change_Policy_and_Economics.csv (Shape: (49, 49))
Saved: 1-7-T12013-Paper_collab-Sustainable_Development_and_Env_Policy.csv (Shape: (49, 49))
Saved: 1-7-T12656-Paper_collab-Climate_Adaptation_and_Migration.csv (Shape: (49, 49))
Saved: 1-7-T11488-Paper_collab-Climate_Communication_and_Perception.csv (Shape: (49, 49))
Saved: 1-7-All-Paper_collab-Total.csv (Shape: (49, 49))
