# Data collection 1: Chinese Stack Exchange

**Contents**
1. [Experiment and understand API](#1-experiment-and-understand-api)
2. [Collect Chinese Stack Exchange data](#2-collect-chinese-stack-exchange-data)

In [6]:

# Import libraries
import requests
import pandas as pd
import numpy as np
import time
import datetime
import private.config as config

## 1. Experiment and understand API

In [7]:
# Helper that fetches a Stack Exchange API URL and returns the decoded JSON
def get_stack_exchange_data(url, timeout=20):
    """
    Send a GET request to ``url`` and return the decoded JSON body.

    :param url: Full API URL, including the query string
    :param timeout: Seconds to wait for the server before giving up (default: 20)
    :return: The parsed JSON response
    :raises ValueError: If the response status code is not 200
    """
    # Without a timeout, requests waits indefinitely on a stalled connection
    # and the cell would never finish.
    response = requests.get(url, timeout=timeout)
    # check if the response is successful
    if response.status_code != 200:
        raise ValueError(f"Invalid response: {response.status_code}")
    return response.json()

# Exploratory request: questions endpoint for the Chinese site, sent without
# an API key and without explicit paging parameters.
stack_exchange_url = "https://api.stackexchange.com/2.3/questions?site=chinese"
data_stack_ex = get_stack_exchange_data(stack_exchange_url)

In [8]:
# Understand the structure of the data: list the top-level keys of the
# JSON object returned by the API.
print(data_stack_ex.keys())

dict_keys(['items', 'has_more', 'quota_max', 'quota_remaining'])


In [9]:
# Display the first question record to see what a single item looks like.
data_stack_ex['items'][0]

{'tags': ['number'],
 'owner': {'account_id': 40155193,
  'reputation': 11,
  'user_id': 43136,
  'user_type': 'unregistered',
  'profile_image': 'https://www.gravatar.com/avatar/dabda9566ce28a8e089c6e1773dbad65?s=256&d=identicon&r=PG&f=y&so-version=2',
  'display_name': 'xiaoli',
  'link': 'https://chinese.stackexchange.com/users/43136/xiaoli'},
 'is_answered': True,
 'view_count': 37,
 'answer_count': 3,
 'score': 0,
 'last_activity_date': 1739566729,
 'creation_date': 1739493399,
 'last_edit_date': 1739513897,
 'question_id': 59797,
 'content_license': 'CC BY-SA 4.0',
 'link': 'https://chinese.stackexchange.com/questions/59797/how-do-you-say-46-66',
 'title': 'How do you say 46.66%?'}

In [10]:
# List the field names available on a single question record.
data_stack_ex['items'][0].keys()

dict_keys(['tags', 'owner', 'is_answered', 'view_count', 'answer_count', 'score', 'last_activity_date', 'creation_date', 'last_edit_date', 'question_id', 'content_license', 'link', 'title'])

In [11]:
# Show the two fields the collection step keeps (tags and title), each
# preceded by a label naming where it lives in the response.
print('data_stack_ex[items][0][tags]', data_stack_ex['items'][0]['tags'], sep='\n')
print('data_stack_ex[items][0][title]', data_stack_ex['items'][0]['title'], sep='\n')

data_stack_ex[items][0][tags]
['number']
data_stack_ex[items][0][title]
How do you say 46.66%?


In [12]:
# Count how many question records came back in this single response.
print('data_stack_ex[items] number: ', len(data_stack_ex['items']), sep='\n')

data_stack_ex[items] number: 
30


In [16]:
# Test the API key: call the lightweight /info endpoint with the key attached
# and print the status code together with the decoded response.
API_KEY = config.STACK_EXCHANGE_API_KEY
TEST_URL = f"https://api.stackexchange.com/2.3/info?site=chinese&key={API_KEY}"

# A timeout keeps the cell from hanging forever if the connection stalls.
response = requests.get(TEST_URL, timeout=20)
print(response.status_code, response.json())

200 {'items': [{'new_active_users': 0, 'total_users': 32169, 'badges_per_minute': 0.01, 'total_badges': 41298, 'total_votes': 109854, 'total_comments': 58940, 'answers_per_minute': 0.0, 'questions_per_minute': 0.0, 'total_answers': 30128, 'total_accepted': 7062, 'total_unanswered': 191, 'total_questions': 12036, 'api_revision': '2025.2.12.45337'}], 'has_more': False, 'quota_max': 300, 'quota_remaining': 164}


## 2. Collect Chinese Stack Exchange data

In [None]:
# Base URL for version 2.3 of the Stack Exchange API. The URL itself is not
# site-specific: the target site is selected per request through the `site`
# query parameter.
BASE_URL = "https://api.stackexchange.com/2.3"
# API key read from the local `private.config` module so it is not hardcoded
# in the notebook.
API_KEY = config.STACK_EXCHANGE_API_KEY  

def fetch_questions_with_tags(site="chinese", page_size=100, retries=3):
    """
    Fetches all questions with their tags from the Stack Exchange API.

    Pages through the ``/questions`` endpoint (most recently active first)
    until the API reports that there are no more pages. Collection stops
    early, returning the questions gathered so far, when the API answers with
    a non-200 status or when a page still cannot be fetched after ``retries``
    attempts. A page is never skipped, so the result has no silent gaps.

    :param site: The Stack Exchange site (default: "chinese")
    :param page_size: Number of questions per page (max 100)
    :param retries: Number of attempts per page when the request itself fails
                    (connection error, timeout, ...)
    :return: List of dicts with the keys "question_id", "title" and "tags"
             (tags joined into one comma-separated string)
    """
    questions = []
    page = 1

    while True:
        # "!nKzQURF6Y5" is a Stack Exchange filter id; presumably it selects
        # which fields are returned -- TODO confirm against the API console.
        url = (f"{BASE_URL}/questions?order=desc&sort=activity&site={site}" +
                f"&pagesize={page_size}&page={page}&filter=!nKzQURF6Y5&key={API_KEY}")

        data = None
        for attempt in range(1, retries + 1):
            try:
                response = requests.get(url, timeout=20)
                if response.status_code != 200:
                    print(f"Error fetching page {page} (attempt {attempt}): {response.status_code}")
                    return questions  # Return what we have so far
                data = response.json()
                break  # Page fetched successfully
            except requests.exceptions.RequestException as e:
                print(f"Request failed for page {page} (attempt {attempt}): {e}")

            # Pause before the next attempt; no point sleeping after the last one
            if attempt < retries:
                time.sleep(np.random.randint(3, 10))

        if data is None:
            # Every attempt failed. Stop here instead of moving on to the next
            # page, which would silently drop this page's questions.
            print(f"Giving up on page {page} after {retries} failed attempts; "
                  f"returning the {len(questions)} questions collected so far")
            return questions

        for item in data.get("items", []):
            questions.append({
                "question_id": item["question_id"],
                "title": item["title"],
                "tags": ", ".join(item["tags"])
            })

        if not data.get("has_more", False):
            print(f"✅ Finished fetching all questions at page {page}")
            return questions

        page += 1  # Move to the next page
        # Random sleep to prevent rate limiting; if the API asked for a longer
        # pause through its "backoff" field, wait at least that long.
        time.sleep(max(data.get("backoff", 0), np.random.randint(3, 10)))


In [18]:
# Function to run the full data collection pipeline
def run_data_collection_pipeline(site="chinese", page_size=100,
                                 file_path="private/stack_exchange_all_questions.csv"):
    """
    Runs the data collection pipeline, fetching all questions and saving them to a CSV file.

    Prints how long the collection took. If no questions were collected the
    CSV is not written, so an earlier export is not replaced by an empty file.

    :param site: The Stack Exchange site (default: "chinese")
    :param page_size: Number of questions per page (max 100)
    :param file_path: Destination CSV path; an existing file is overwritten
    :return: None
    """
    # perf_counter is monotonic, so the measurement is unaffected by clock changes
    start_time = time.perf_counter()
    questions = fetch_questions_with_tags(site=site, page_size=page_size)
    elapsed_seconds = int(time.perf_counter() - start_time)

    # Format as HH:MM:SS by hand so runs longer than 24 hours do not wrap around
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f"Data collection completed in {hours:02d}:{minutes:02d}:{seconds:02d}")

    if not questions:
        print(f"No questions were collected; {file_path} was left untouched")
        return

    # Convert to DataFrame and save to CSV, overwriting any existing file
    questions_df = pd.DataFrame(questions)
    questions_df.to_csv(file_path, index=False)
    print(f"Data saved to {file_path}")

# Run the full data collection pipeline. This makes live API requests and
# writes the result to the CSV path used inside the function.
run_data_collection_pipeline(page_size=100)

Error fetching page 26 (attempt 1): 400
Data collection completed in 00:02:30
Data saved to private/stack_exchange_all_questions.csv
