# Province-Level Yield Statistics of China

## Data access
The data is provided by the National Bureau of Statistics of China and can be freely accessed at https://data.stats.gov.cn.

## Crops

- Rice (rice, rice_doublecropping_late, rice_early, rice_midseason_singlecropping_late)
- Wheat (wheat, spring_wheat, winter_wheat)
- Corn

In [151]:
# Crop identifiers to export; each one is lower-cased to build the CSV file names.
selected_crop = [
    # rice and its seasonal variants
    'RICE',
    'RICE_DOUBLECROPPING_LATE',
    'RICE_EARLY',
    'RICE_MIDSEASON_SINGLECROPPING_LATE',
    # wheat and its seasonal variants
    'WHEAT',
    'SPRING_WHEAT',
    'WINTER_WHEAT',
    # corn
    'CORN',
]

## Export yield statistics for selected crops

- The year given in the source data is assumed to refer to the harvest period, not the planting period.

In [152]:
import pandas as pd
import os

# Directory holding the yield_*.csv, sownarea_*.csv and production_*.csv files.
data_path = '/app/dev/AgML/china'

# Region names kept when filtering the yield tables; rows whose 'Region'
# value is not in this list are discarded.
china_provinces = [
    'Beijing', 'Tianjin', 'Hebei', 'Shanxi',
    'Inner Mongolia', 'Liaoning', 'Jilin', 'Heilongjiang',
    'Shanghai', 'Jiangsu', 'Zhejiang', 'Anhui',
    'Fujian', 'Jiangxi', 'Shandong', 'Henan',
    'Hubei', 'Hunan', 'Guangdong', 'Guangxi',
    'Hainan', 'Chongqing', 'Sichuan', 'Guizhou',
    'Yunnan', 'Tibet', 'Shaanxi', 'Gansu',
    'Qinghai', 'Ningxia', 'Xinjiang',
]

In [157]:

def restructure_df(data_path, china_provinces, selected_crop):
    """Build one long-format yield table covering all selected crops.

    For every crop, three CSV files are read from ``data_path``:
    ``yield_<crop>.csv``, ``sownarea_<crop>.csv`` and
    ``production_<crop>.csv`` (``<crop>`` is the lower-cased crop name).
    Each file is expected to have three leading lines to skip, a ``Region``
    column, and one column per year.

    Parameters
    ----------
    data_path : directory containing the CSV files.
    china_provinces : region names to keep; other yield rows are dropped.
    selected_crop : iterable of crop names.

    Returns
    -------
    pandas.DataFrame with the columns ``crop_name, country_code, adm_id,
    season_name, planting_year, planting_date, harvest_year, harvest_date,
    yield, production, planted_area, harvest_area, source``. Rows with a
    missing or non-positive yield are removed. ``harvest_area`` is not
    populated here; it only exists because of the final column reindex.
    """

    def _read_long(file_template, crop_name, value_label):
        # Read one wide "Region x year" table and unpivot it so that each
        # row is a (Region, year, value) record.
        csv_file = os.path.join(data_path, file_template.format(crop_name.lower()))
        wide_table = pd.read_csv(csv_file, encoding='unicode_escape',
                                 engine='python', skiprows=3)
        return pd.melt(wide_table, id_vars=['Region'],
                       var_name='year', value_name=value_label)

    join_keys = ['Region', 'year']
    per_crop_tables = []

    for crop in selected_crop:
        yield_long = _read_long('yield_{}.csv', crop, 'yield')
        area_long = _read_long('sownarea_{}.csv', crop, 'planted_area')
        production_long = _read_long('production_{}.csv', crop, 'production')

        # keep only the yield rows belonging to a known province, then
        # attach sown area and production for the same region and year
        is_province = yield_long['Region'].isin(china_provinces)
        crop_table = (
            yield_long[is_province]
            .merge(area_long, on=join_keys, how='left')
            .merge(production_long, on=join_keys, how='left')
        )

        # label the records and build the country-prefixed admin id
        crop_table['crop_name'] = crop
        crop_table['country_code'] = 'CHN'
        crop_table = crop_table.rename(columns={'Region': 'adm_id', 'year': 'harvest_year'})
        crop_table['adm_id'] = crop_table['country_code'] + '-' + crop_table['adm_id']

        per_crop_tables.append(crop_table)

    # stack the per-crop tables on top of each other
    master_df = pd.concat(per_crop_tables, axis=0)

    # discard records without a yield, then those whose yield is not positive
    master_df = master_df.dropna(subset=['yield'])
    has_positive_yield = master_df['yield'] > 0
    master_df = master_df.loc[has_positive_yield]

    # fields required by the agml standard that this source does not provide
    for placeholder in ('season_name', 'planting_year', 'planting_date', 'harvest_date'):
        master_df[placeholder] = 'N/A'

    # scale production and area according to multiply value provided with data
    for column, multiplier in (('planted_area', 1000), ('production', 10000)):
        master_df[column] = master_df[column] * multiplier

    # convert yield from kg/ha to mt/ha
    master_df['yield'] = master_df['yield'] / 1000

    # record where the data came from
    master_df['source'] = 'https://data.stats.gov.cn'

    # final column order of the agml standard
    agml_columns = [
        'crop_name', 'country_code', 'adm_id', 'season_name',
        'planting_year', 'planting_date', 'harvest_year', 'harvest_date',
        'yield', 'production', 'planted_area', 'harvest_area', 'source',
    ]
    return master_df.reindex(columns=agml_columns)

In [158]:
# Build the combined table for every selected crop and preview the first rows.
master_df = restructure_df(
    data_path=data_path,
    china_provinces=china_provinces,
    selected_crop=selected_crop,
)
master_df.head(10)

Unnamed: 0,crop_name,country_code,adm_id,season_name,planting_year,planting_date,harvest_year,harvest_date,yield,production,planted_area,harvest_area,source
31,RICE,CHN,CHN-Beijing,,,,2022,,5.06215,2100.0,420.0,,https://data.stats.gov.cn
32,RICE,CHN,CHN-Tianjin,,,,2022,,9.52293,526200.0,55260.0,,https://data.stats.gov.cn
33,RICE,CHN,CHN-Hebei,,,,2022,,6.38113,488800.0,76590.0,,https://data.stats.gov.cn
34,RICE,CHN,CHN-Shanxi,,,,2022,,6.75,14600.0,2160.0,,https://data.stats.gov.cn
35,RICE,CHN,CHN-Inner Mongolia,,,,2022,,7.69526,902100.0,117230.0,,https://data.stats.gov.cn
36,RICE,CHN,CHN-Liaoning,,,,2022,,8.24102,4255600.0,516390.0,,https://data.stats.gov.cn
37,RICE,CHN,CHN-Jilin,,,,2022,,8.17245,6809100.0,833180.0,,https://data.stats.gov.cn
38,RICE,CHN,CHN-Heilongjiang,,,,2022,,7.54714,27180000.0,3601370.0,,https://data.stats.gov.cn
39,RICE,CHN,CHN-Shanghai,,,,2022,,7.97778,827300.0,103700.0,,https://data.stats.gov.cn
40,RICE,CHN,CHN-Jiangsu,,,,2022,,8.9655,19916100.0,2221420.0,,https://data.stats.gov.cn


## Save the data

In [161]:
# master_df.to_csv('/app/dev/AgML/' + "YIELD_CHINA.csv")

## Summary by crop

In [160]:
# Per-crop coverage report for master_df: year range, number of distinct
# adm_id values, row count, and sanity counters for the yield column.
summary = {'CROP':[],
           'MIN_YEAR':[],
           'MAX_YEAR':[],
           'NUM_YEARS':[],
           'NUM_DISTRICTS':[],
           'DATA_SIZE':[],
           'NAN_IN_YIELD':[],
           'LESS_THAN_ZERO_YIELD':[]
          }

for crop in selected_crop:

    crop_yield_df = master_df[master_df['crop_name'] == crop]
    list_years = sorted(crop_yield_df["harvest_year"].unique())
    nan_counts = crop_yield_df['yield'].isnull().sum()
    district_counts = len(crop_yield_df['adm_id'].unique())
    # despite the column name, this also counts yields that are exactly zero
    less_than_zero = len(crop_yield_df[crop_yield_df['yield'] <= 0 ])

    summary['CROP'].append(crop)
    # A crop with no rows in master_df has no years: record None instead of
    # letting min()/max() raise ValueError on an empty sequence.
    # list_years is sorted, so its ends are the minimum and maximum.
    summary['MIN_YEAR'].append(list_years[0] if list_years else None)
    summary['MAX_YEAR'].append(list_years[-1] if list_years else None)
    summary['NUM_YEARS'].append(len(list_years))
    summary['NUM_DISTRICTS'].append(district_counts)
    summary['DATA_SIZE'].append(crop_yield_df.shape[0])
    summary['NAN_IN_YIELD'].append(nan_counts)
    summary['LESS_THAN_ZERO_YIELD'].append(less_than_zero)

# one row per crop, ordered alphabetically by crop name
district_summary = pd.DataFrame(summary)
district_summary = district_summary.sort_values(by='CROP')
district_summary.head(25)

Unnamed: 0,CROP,MIN_YEAR,MAX_YEAR,NUM_YEARS,NUM_DISTRICTS,DATA_SIZE,NAN_IN_YIELD,LESS_THAN_ZERO_YIELD
7,CORN,1990,2022,33,31,998,0,0
0,RICE,1990,2022,33,30,983,0,0
1,RICE_DOUBLECROPPING_LATE,1990,2022,33,15,405,0,0
2,RICE_EARLY,1990,2022,33,15,382,0,0
3,RICE_MIDSEASON_SINGLECROPPING_LATE,1990,2022,33,30,924,0,0
5,SPRING_WHEAT,1990,2018,28,21,407,0,0
4,WHEAT,1990,2022,33,30,981,0,0
6,WINTER_WHEAT,1990,2022,33,30,821,0,0
