In [29]:
import requests
import json
import pandas as pd
import os
import time
from tqdm import tqdm

In [24]:
# Scraping window: one request is made per calendar day from start_date
# through the moment this cell is run.
start_date = pd.Timestamp('2015-01-01')
end_date = pd.Timestamp.now()

# Per-station CSV files are written into this folder, resolved against the
# current working directory.
output_folder = 'river_report/'
output_path = os.path.join(os.getcwd(), output_folder)

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(output_path, exist_ok=True)

In [31]:
# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Retry policy applied to each day's request.
MAX_ATTEMPTS = 3
RETRY_DELAY_SECONDS = 5
REQUEST_TIMEOUT_SECONDS = 10

# Every response row carries two stations side by side: the first uses the
# bare field names (STNM, RVNM, Z, ...), the second the same names with a
# trailing "1" (STNM1, RVNM1, Z1, ...).
STATION_FIELD_SUFFIXES = ('', '1')

# Output CSV column -> base field name in a response row.
COLUMN_TO_FIELD = {
    '河名': 'RVNM',
    '水位': 'Z',
    '水势': 'WPTN',
    '比昨日+涨-落': 'YZ',
    '流量': 'Q',
    '设防水位': 'FRZ',
    '警戒水位': 'WRZ',
    '保证水位': 'GRZ',
}


def _fetch_rows(url):
    """Download ``url`` and return the list stored under its ``'rows'`` key.

    The request is attempted up to MAX_ATTEMPTS times, sleeping
    RETRY_DELAY_SECONDS between attempts. Network errors, HTTP error
    statuses, invalid JSON and a payload without ``'rows'`` all count as a
    failed attempt.

    Returns:
        The list of row dicts (empty if the service sent a null/empty
        ``'rows'``), or None when every attempt failed.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
            # Turn 4xx/5xx answers into RequestException so they are retried
            # instead of being parsed as if they were data.
            response.raise_for_status()
            payload = json.loads(response.text)
            return payload['rows'] or []
        except requests.exceptions.RequestException as e:
            print(f'Request failed for {url}: {e}')
        except (ValueError, KeyError, TypeError) as e:
            # ValueError covers json.JSONDecodeError; KeyError/TypeError cover
            # a payload that is not a dict with a 'rows' entry.
            print(f'Unexpected response for {url}: {e}')
        if attempt < MAX_ATTEMPTS:
            print(f'Retrying in {RETRY_DELAY_SECONDS} seconds...')
            time.sleep(RETRY_DELAY_SECONDS)
    return None


# station name -> {date -> {column -> value}}
station_data = {}

# loop over each date in the range
for date in tqdm(pd.date_range(start_date, end_date, freq='D')):
    day = date.date()
    url = f'http://113.57.190.228:8001/Web/Report/GetRiverData?date={day}+08%3A00'

    rows = _fetch_rows(url)
    if rows is None:
        # Skip this date entirely; falling through would either hit an
        # unbound response or re-use the previous day's response and store
        # it under the wrong date.
        print(f'Failed to get data for {day} after {MAX_ATTEMPTS} attempts, skipping...')
        continue

    for row in rows:
        for suffix in STATION_FIELD_SUFFIXES:
            station_name = row['STNM' + suffix]
            # NOTE(review): presumably a row can have an empty second slot;
            # nameless entries are skipped rather than written to "None.csv".
            # Confirm against real responses.
            if not station_name:
                continue
            # add the data for the station on this date
            station_data.setdefault(station_name, {})[day] = {
                column: row[field + suffix]
                for column, field in COLUMN_TO_FIELD.items()
            }

# convert each station's {date -> record} mapping to a date-indexed DataFrame
df_dict = {
    station: pd.DataFrame.from_dict(records, orient='index')
    for station, records in station_data.items()
}

# save each DataFrame to a separate CSV file named after the station
for station, df in df_dict.items():
    outpath = os.path.join(output_path, f'{station}.csv')
    df.to_csv(outpath)


100%|██████████████████████████████████████████████████████████████████████████████| 3055/3055 [32:49<00:00,  1.55it/s]
