# Urban Ecology Research Trend Analysis

Type: NLP + Time Series + Web Data | Domain: Scientific + Environmental | Format: Notebook
- Use the PubMed or Semantic Scholar API to extract papers on 'urban ecology'.
- Track number of publications per year.
- Perform keyword frequency and topic modeling.
- Map institutions or authors by location.

In [None]:
import os
import time
import json
import requests
import pandas as pd

# Semantic Scholar Graph API bulk paper-search endpoint. HTTPS is used so that
# queries are not sent in plaintext.
BASE_URL = 'https://api.semanticscholar.org/graph/v1/paper/search/bulk'
FIELDS = 'title,year,authors,url'  # comma-separated paper fields requested from the API
DELAY = 5  # seconds to sleep between successive page requests to avoid rate limiting
RETRY_DELAY = 5  # seconds before retrying on failure
OUTPUT_JSONL = 'papers.jsonl'  # retrieved papers are appended here, one JSON object per line
YEAR_RANGE = ''  # optional value for the request's 'year' parameter; omitted when empty

# Search keywords to process; comment entries in or out to choose what to run.
query_list = [
    # 'urban ecology',
    # 'urban biodiversity',
    'urban ecosystem',
    # 'urban green spaces',
    # 'urban wildlife',
    # 'urban vegetation',
]

In [None]:
# Run this to clear the .txt progress trackers and the shared output file,
# so that the next retrieval run starts from scratch.

for keyword in query_list:
    # Each keyword has two tracker files: a completion marker and a saved token.
    for tracker_file in (f'done_{keyword}.txt', f'token_{keyword}.txt'):
        if os.path.exists(tracker_file):
            os.remove(tracker_file)
            print(f'✅ Progress trackers for "{keyword}" removed.')

# The output file is shared by every keyword, so it is removed once, after the
# loop, and only when it exists. An unconditional os.remove() inside the loop
# raises FileNotFoundError when the file is absent (e.g. on the second keyword).
if os.path.exists(OUTPUT_JSONL):
    os.remove(OUTPUT_JSONL)
    print(f'✅ Output file "{OUTPUT_JSONL}" removed.')

✅ Progress trackers for "urban ecology" removed.
✅ Progress trackers for "urban biodiversity" removed.


In [29]:
# === Helper functions ===
# Persist each keyword's continuation token and completion flag on disk so that
# paper retrieval can resume where it left off after a request/response error.
def save_token(keyword, token):
    """Write the continuation token for ``keyword`` to ``token_<keyword>.txt``.

    Any token previously saved for the same keyword is overwritten.
    """
    token_path = f'token_{keyword}.txt'
    with open(token_path, 'w') as handle:
        handle.write(token)

def load_token(keyword):
    """Return the saved continuation token for ``keyword``, or ``None``.

    The token is read from ``token_<keyword>.txt`` with surrounding whitespace
    stripped. ``None`` is returned when that file does not exist.
    """
    token_path = f'token_{keyword}.txt'
    if not os.path.exists(token_path):
        return None
    with open(token_path, 'r') as handle:
        saved = handle.read()
    return saved.strip()

def mark_done(keyword):
    """Create the ``done_<keyword>.txt`` marker flagging ``keyword`` as finished."""
    marker_path = f'done_{keyword}.txt'
    with open(marker_path, 'w') as marker:
        marker.write('completed')

def is_done(keyword):
    """Return ``True`` when the ``done_<keyword>.txt`` marker exists."""
    marker_path = f'done_{keyword}.txt'
    return os.path.exists(marker_path)

def delete_token(keyword):
    """Remove ``token_<keyword>.txt`` if it exists; otherwise do nothing."""
    token_path = f'token_{keyword}.txt'
    if not os.path.exists(token_path):
        return
    os.remove(token_path)
        
def create_df_from_jsonl(jsonl_path):
    """Load papers from a JSON Lines file into a deduplicated DataFrame.

    Args:
        jsonl_path: Path to a UTF-8 text file with one JSON object per line.
            Blank (whitespace-only) lines are ignored.

    Returns:
        A DataFrame built with ``pd.json_normalize``. When a ``paperId``
        column is present, rows with a repeated ``paperId`` are dropped
        (the first occurrence is kept). Returns ``None`` when the file
        does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not os.path.exists(jsonl_path):
        print('⚠️ No data found. Make sure the JSONL file exists.')
        return None

    with open(jsonl_path, 'r', encoding='utf-8') as f:
        # Skip blank lines (e.g. a stray trailing newline): json.loads raises
        # on them, which would otherwise abort the whole load.
        papers = [json.loads(line) for line in f if line.strip()]

    df = pd.json_normalize(papers)

    if 'paperId' in df.columns:
        df = df.drop_duplicates(subset='paperId')
        print(f'📌 Deduplicated. Final count: {len(df)} unique papers.')
    else:
        print('⚠️ Warning: No paperId field found to deduplicate.')

    return df


In [None]:
# === Main loop over keywords ===
# For each keyword, page through the bulk search endpoint and append every
# returned paper to OUTPUT_JSONL. The continuation token is saved after each
# page so an interrupted run can resume; finished keywords are flagged with a
# marker file and skipped on later runs.

for keyword in query_list:
    print(f'\n🔍 Processing keyword: "{keyword}"')

    if is_done(keyword):
        print(f'✅ Keyword "{keyword}" already completed. Skipping.')
        continue

    token = load_token(keyword)
    if token:
        print(f'🔄 Resuming from saved token for "{keyword}": {token}')
    else:
        print(f'⏳ Starting fresh for keyword "{keyword}"')

    # Counts papers fetched during this run only; pages fetched by an earlier,
    # interrupted run are not included.
    retrieved = 0

    while True:
        params = {
            'query': keyword,
            'fields': FIELDS,
            'limit': 1000
        }
        if YEAR_RANGE:
            params['year'] = YEAR_RANGE
        if token:
            params['token'] = token

        # Retry logic: one retry after RETRY_DELAY seconds, then give up.
        # `data` stays None unless a request succeeds and its body parses.
        data = None
        for attempt in range(2):
            try:
                response = requests.get(BASE_URL, params=params, timeout=15)
                response.raise_for_status()
                data = response.json()
                break
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a non-JSON response body on requests
                # versions whose JSON decode error is not a RequestException.
                print(f'❌ Request error on attempt {attempt+1} for "{keyword}": {e}')
                if attempt == 0:
                    print(f'⏳ Retrying after {RETRY_DELAY} seconds...')
                    time.sleep(RETRY_DELAY)
                else:
                    print('⚠️ Skipping this batch due to repeated failure.')

        if data is None:
            # The keyword is not marked done and its saved token (if any) is
            # kept, so a later run retries from the same page.
            print(f'⚠️ No data retrieved for keyword "{keyword}", breaking loop.')
            break

        papers = data.get('data', [])
        if not papers:
            # The keyword is deliberately left unmarked here, so a later run
            # queries it again.
            print(f'⚠️ No papers returned, assuming end of results for "{keyword}"')
            break

        retrieved += len(papers)
        print(f'📄 Retrieved {retrieved} papers so far for "{keyword}"')

        with open(OUTPUT_JSONL, 'a', encoding='utf-8') as f:
            for paper in papers:
                # Record which query produced the paper for later per-keyword analysis.
                paper['search_keyword'] = keyword
                json.dump(paper, f)
                f.write('\n')

        # A token in the response means more pages remain; its absence ends the keyword.
        token = data.get('token')
        if token:
            save_token(keyword, token)
            time.sleep(DELAY)
        else:
            print(f'✅ Completed all pages for "{keyword}"')
            delete_token(keyword)
            mark_done(keyword)
            break

print('\n🎉 All keywords processed.')

df = create_df_from_jsonl(OUTPUT_JSONL)


🔍 Processing keyword: "urban ecology"
⏳ Starting fresh for keyword "urban ecology"
📄 Retrieved 1000 papers so far for "urban ecology"
{
  "paperId": "001dd10b968cbb3ef419682e973e6ad4f59ec484",
  "url": "https://www.semanticscholar.org/paper/001dd10b968cbb3ef419682e973e6ad4f59ec484",
  "title": "The value of social media wildlife sightings for elusive species monitoring: a population assessment of servals in a South African urban nature reserve",
  "year": 2025,
  "authors": [
    {
      "authorId": "2334898251",
      "name": "Kyle Smith"
    },
    {
      "authorId": "2281395083",
      "name": "M. J. Somers"
    }
  ],
  "search_keyword": "urban ecology"
}
{
  "paperId": "0026966aaf36e2d8dbc731484a994aac4f10cbdb",
  "url": "https://www.semanticscholar.org/paper/0026966aaf36e2d8dbc731484a994aac4f10cbdb",
  "title": "Evaluation of Urban Space Livability in the Urban Area of Hefei Based on Production-Living-Ecological Space",
  "year": 2024,
  "authors": [
    {
      "authorId": "22

In [23]:
# Count the rows of df per value of the 'search_keyword' column.
df['search_keyword'].value_counts()

Series([], Name: count, dtype: int64)