# Urban Ecology Research Trend Analysis

Type: NLP + Time Series + Web Data | Domain: Scientific + environmental | Format: Notebook
- Use PubMed or Semantic Scholar API to extract papers on 'urban ecology'.
- Track number of publications per year.
- Perform keyword frequency and topic modeling.
- Map institutions or authors by location.

In [None]:
import os
import time
import json
import requests
import pandas as pd

# Semantic Scholar Graph API bulk paper-search endpoint.
# HTTPS keeps the query encrypted in transit instead of sending it as plain HTTP.
BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search/bulk"
# Comma-separated list of paper fields requested from the API.
FIELDS = 'title,year,authors,abstract,url,openAccessPdf'
DELAY = 5  # seconds to wait between successive page requests to avoid rate limiting
RETRY_DELAY = 5  # seconds before retrying on failure
OUTPUT_CSV = "papers.csv"  # destination of the CSV export
OUTPUT_JSONL = "papers.jsonl"  # raw results, one JSON object per line
YEAR_RANGE = "2020-"  # sent as the `year` query parameter

# Search phrases to retrieve papers for. Each entry is used verbatim as the
# API `query` value and as part of the progress-tracker file names
# (`done_<keyword>.txt`, `token_<keyword>.txt`).
query_list = [
    'urban ecology',
    'urban biodiversity',
    'urban green spaces',
    'urban wildlife',
    'urban vegetation',
    'urban environmental change',
    'urban landscape ecology',
    'urban ecosystem services'
]

In [None]:
# Run this to clear the .txt progress trackers

for keyword in query_list:
    # A keyword can own two tracker files: the "done" marker and the saved
    # pagination token. Remove whichever ones exist, marker first.
    for tracker_path in (f'done_{keyword}.txt', f'token_{keyword}.txt'):
        if not os.path.exists(tracker_path):
            continue
        os.remove(tracker_path)
        print(f"✅ Progress trackers for '{keyword}' removed.")

In [None]:
# === Helper functions ===
# Track per-keyword progress (saved pagination tokens and "done" markers) so that retrieval can resume where it left off.
def save_token(keyword, token):
    """Write *token* to ``token_<keyword>.txt``, replacing any previous content."""
    token_path = f'token_{keyword}.txt'
    with open(token_path, 'w') as handle:
        handle.write(token)

def load_token(keyword):
    """Return the token stored in ``token_<keyword>.txt``, or ``None`` if the file is absent.

    Leading and trailing whitespace in the stored value is stripped.
    """
    token_path = f'token_{keyword}.txt'
    if not os.path.exists(token_path):
        return None
    with open(token_path, 'r') as handle:
        contents = handle.read()
    return contents.strip()

def mark_done(keyword):
    """Create (or overwrite) ``done_<keyword>.txt`` to flag the keyword as finished."""
    marker_path = f'done_{keyword}.txt'
    with open(marker_path, 'w') as handle:
        handle.write('completed')

def is_done(keyword):
    """Report whether the ``done_<keyword>.txt`` marker file exists."""
    marker_path = f'done_{keyword}.txt'
    return os.path.exists(marker_path)

def delete_token(keyword):
    """Remove ``token_<keyword>.txt`` if it exists; otherwise do nothing."""
    token_path = f'token_{keyword}.txt'
    if not os.path.exists(token_path):
        return
    os.remove(token_path)
        
def export_jsonl_to_csv(jsonl_path, csv_path):
    """Convert a JSON Lines file of papers into a CSV, de-duplicated by ``paperId``.

    Args:
        jsonl_path: Path of the input file holding one JSON object per line.
        csv_path: Path of the CSV file to write (overwritten if it exists).

    Blank lines are ignored, and lines that are not valid JSON (for example a
    final line cut short by an interrupted download) are skipped and counted
    in a warning, so one bad line does not abort the whole export. If
    ``jsonl_path`` does not exist, a warning is printed and nothing is written.
    """
    if not os.path.exists(jsonl_path):
        print("⚠️ No data found. Make sure the JSONL file exists.")
        return

    papers = []
    skipped = 0
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                papers.append(json.loads(line))
            except json.JSONDecodeError:
                # Most likely a record truncated by an interrupted write.
                skipped += 1

    if skipped:
        print(f"⚠️ Skipped {skipped} malformed line(s) in {jsonl_path}.")

    # Nested objects are flattened into dotted column names.
    df = pd.json_normalize(papers)

    if "paperId" in df.columns:
        df = df.drop_duplicates(subset="paperId")
        print(f"📌 Deduplicated. Final count: {len(df)} unique papers.")
    else:
        print("⚠️ Warning: No 'paperId' field found to deduplicate.")

    df.to_csv(csv_path, index=False)
    print(f"📁 Saved to {csv_path}")

In [None]:
# === Main loop over keywords ===

# For each keyword: page through the bulk-search results, append every page to
# OUTPUT_JSONL, and persist the continuation token after each page so an
# interrupted run can resume. Keywords already marked done are skipped.
for keyword in query_list:
    print(f"\n🔍 Processing keyword: '{keyword}'")

    if is_done(keyword):
        print(f"✅ Keyword '{keyword}' already completed. Skipping.")
        continue

    token = load_token(keyword)
    if token:
        print(f"🔄 Resuming from saved token for '{keyword}': {token}")
    else:
        print(f"⏳ Starting fresh for keyword '{keyword}'")

    retrieved = 0  # papers fetched for this keyword during this run

    while True:
        params = {
            'query': keyword,
            'fields': FIELDS,
            'limit': 1000,
            'year': YEAR_RANGE
        }
        if token:
            params['token'] = token

        # Try the request at most twice; `data` stays None if both attempts fail.
        data = None
        for attempt in range(2):
            try:
                response = requests.get(BASE_URL, params=params, timeout=15)
                response.raise_for_status()
                data = response.json()
                break
            # ValueError covers an undecodable JSON body on `requests` versions
            # whose JSON decode error is not a RequestException.
            except (requests.exceptions.RequestException, ValueError) as e:
                print(f"❌ Request error on attempt {attempt+1} for '{keyword}': {e}")
                if attempt == 0:
                    print(f"⏳ Retrying after {RETRY_DELAY} seconds...")
                    time.sleep(RETRY_DELAY)
                else:
                    print("⚠️ Skipping this batch due to repeated failure.")

        if data is None:
            # The saved token (if any) is kept so a later run retries this page.
            print(f"⚠️ No data retrieved for keyword '{keyword}', breaking loop.")
            break

        papers = data.get('data', [])
        if not papers:
            print(f"⚠️ No papers returned, assuming end of results for '{keyword}'")
            break

        retrieved += len(papers)
        print(f"📄 Retrieved {retrieved} papers so far for '{keyword}'")

        with open(OUTPUT_JSONL, 'a', encoding='utf-8') as f:
            for paper in papers:
                json.dump(paper, f)
                f.write('\n')

        # A missing, null or empty token means there is no further page.
        # Testing only for the key's presence would treat a null token as
        # "more pages" and hand None to save_token, whose write() rejects it.
        token = data.get('token')
        if token:
            save_token(keyword, token)
            time.sleep(DELAY)
        else:
            print(f"✅ Completed all pages for '{keyword}'")
            delete_token(keyword)
            mark_done(keyword)
            break

print("\n🎉 All keywords processed.")

export_jsonl_to_csv(OUTPUT_JSONL, OUTPUT_CSV)


🔍 Searching for: 'urban ecology'
📄 Retrieved 1000 papers so far for 'urban ecology'
📄 Retrieved 2000 papers so far for 'urban ecology'
📄 Retrieved 3000 papers so far for 'urban ecology'
📄 Retrieved 4000 papers so far for 'urban ecology'
📄 Retrieved 4772 papers so far for 'urban ecology'
📄 Retrieved 5772 papers so far for 'urban ecology'
📄 Retrieved 6772 papers so far for 'urban ecology'
📄 Retrieved 7772 papers so far for 'urban ecology'
📄 Retrieved 8772 papers so far for 'urban ecology'
📄 Retrieved 9544 papers so far for 'urban ecology'
📄 Retrieved 10544 papers so far for 'urban ecology'
📄 Retrieved 11544 papers so far for 'urban ecology'
📄 Retrieved 12544 papers so far for 'urban ecology'
📄 Retrieved 13544 papers so far for 'urban ecology'
📄 Retrieved 14316 papers so far for 'urban ecology'
📄 Retrieved 15316 papers so far for 'urban ecology'
📄 Retrieved 16316 papers so far for 'urban ecology'
📄 Retrieved 17316 papers so far for 'urban ecology'
📄 Retrieved 18316 papers so far for 'urb

KeyboardInterrupt: 

In [8]:
# Display the (rows, columns) dimensions of `df`; `df` is created in a cell not shown here.
df.shape

(1000, 5)

In [11]:
# List every row whose title occurs more than once, ordered by title so that
# the repeated entries sit next to each other.
papers = df.title
repeated_titles = papers[papers.duplicated()]
df[papers.isin(repeated_titles)].sort_values('title')

Unnamed: 0,title,year,url,authors,abstract
21,A large-scale survey of the postmortem human m...,2018.0,https://www.semanticscholar.org/paper/0044b9ad...,Jennifer L. Pechal; Carl J. Schmidt; H. Jordan...,The microbiome plays many roles in human healt...
640,A large-scale survey of the postmortem human m...,2018.0,https://www.semanticscholar.org/paper/092ad2db...,Jennifer L. Pechal; C. Schmidt; H. Jordan; M. ...,The microbiome plays many roles in human healt...
424,Rising incidence of urban floods: understandin...,2020.0,https://www.semanticscholar.org/paper/0648d90b...,Kabila Abass,
437,Rising incidence of urban floods: understandin...,2020.0,https://www.semanticscholar.org/paper/0679df87...,Kabila Abass,
655,Urban ecology,2020.0,https://www.semanticscholar.org/paper/096406e9...,D. Bartmanski; I. Woodward,
851,Urban ecology,2020.0,https://www.semanticscholar.org/paper/0c41a8ac...,Patrick M. Lydon,
916,Urban ecology,2001.0,https://www.semanticscholar.org/paper/0d06355b...,Jianguo Wu,
