# District-level crop statistics of India

## Data access
The data is provided by ICRISAT and can be freely accessed at http://data.icrisat.org/dld/src/crops.html 

## Data exploration

In [3]:
import pandas as pd
import os

# Folder and file name of the ICRISAT district-level CSV export.
data_path = '/app/dev/AgML'
filename = 'ICRISAT-District Level Data.csv'

# Load the full table, then preview the first rows as the cell output.
csv_file = os.path.join(data_path, filename)
crop_stats_df = pd.read_csv(csv_file)
crop_stats_df.head()

Unnamed: 0,Dist Code,Year,State Code,State Name,Dist Name,RICE AREA (1000 ha),RICE PRODUCTION (1000 tons),RICE YIELD (Kg per ha),WHEAT AREA (1000 ha),WHEAT PRODUCTION (1000 tons),...,SUGARCANE YIELD (Kg per ha),COTTON AREA (1000 ha),COTTON PRODUCTION (1000 tons),COTTON YIELD (Kg per ha),FRUITS AREA (1000 ha),VEGETABLES AREA (1000 ha),FRUITS AND VEGETABLES AREA (1000 ha),POTATOES AREA (1000 ha),ONION AREA (1000 ha),FODDER AREA (1000 ha)
0,1,1990,14,Chhattisgarh,Durg,397.9,481.4,1210.0,18.2,13.4,...,0.0,0.0,0.0,0.0,2.18,9.61,11.79,0.03,0.05,-1.0
1,1,1991,14,Chhattisgarh,Durg,393.2,508.6,1293.0,18.3,11.8,...,1250.0,0.0,0.0,0.0,2.41,9.93,12.34,0.04,0.15,-1.0
2,1,1992,14,Chhattisgarh,Durg,398.4,514.5,1291.0,17.1,10.7,...,1667.0,0.01,0.0,0.0,2.23,10.85,13.07,0.03,0.26,-1.0
3,1,1993,14,Chhattisgarh,Durg,410.2,569.1,1387.0,17.0,12.1,...,2500.0,0.03,0.0,0.0,1.99,11.17,13.16,0.04,0.27,-1.0
4,1,1994,14,Chhattisgarh,Durg,430.1,601.7,1399.0,17.5,14.2,...,1000.0,0.0,0.0,0.0,2.07,11.02,13.09,0.03,0.22,-1.0


## Summary by crop

Only the crops listed below have yield statistics.

In [4]:
# Crops for which the table has a "<CROP> YIELD (Kg per ha)" column.
selected_crop = [
    'KHARIF SORGHUM', 'RABI SORGHUM', 'SORGHUM', 'PEARL MILLET', 'MAIZE',
    'FINGER MILLET', 'BARLEY', 'CHICKPEA', 'PIGEONPEA', 'MINOR PULSES',
    'GROUNDNUT', 'SESAMUM', 'RAPESEED AND MUSTARD', 'SAFFLOWER', 'CASTOR',
    'LINSEED', 'SUNFLOWER', 'RICE', 'SOYABEAN', 'OILSEEDS', 'SUGARCANE',
    'COTTON', 'WHEAT',
]

# Rows are never filtered per crop (only the yield column differs), so the
# year range, district count and row count are the same for every crop.
# Compute them once instead of once per loop iteration.
list_years = sorted(crop_stats_df['Year'].unique())
district_counts = len(crop_stats_df['Dist Name'].unique())
data_size = crop_stats_df.shape[0]

summary = {'CROP': [],
           'MIN_YEAR': [],
           'MAX_YEAR': [],
           'NUM_YEARS': [],
           'NUM_DISTRICTS': [],
           'DATA_SIZE': [],
           'NAN_IN_YIELD': [],
           'LESS_THAN_ZERO_YIELD': []
          }

for crop in selected_crop:
    crop_yield = crop_stats_df[crop + ' YIELD (Kg per ha)']

    summary['CROP'].append(crop)
    summary['MIN_YEAR'].append(list_years[0])
    summary['MAX_YEAR'].append(list_years[-1])
    summary['NUM_YEARS'].append(len(list_years))
    summary['NUM_DISTRICTS'].append(district_counts)
    summary['DATA_SIZE'].append(data_size)
    summary['NAN_IN_YIELD'].append(crop_yield.isnull().sum())
    # NOTE: despite the column name, this counts yields <= 0, i.e. zeros
    # are included along with negative values.
    summary['LESS_THAN_ZERO_YIELD'].append(int((crop_yield <= 0).sum()))

# One row per crop, sorted alphabetically by crop name.
district_summary = pd.DataFrame(summary)
district_summary = district_summary.sort_values(by='CROP')
district_summary.head(25)

Unnamed: 0,CROP,MIN_YEAR,MAX_YEAR,NUM_YEARS,NUM_DISTRICTS,DATA_SIZE,NAN_IN_YIELD,LESS_THAN_ZERO_YIELD
6,BARLEY,1990,2017,28,580,13965,0,7856
14,CASTOR,1990,2017,28,580,13965,0,9706
7,CHICKPEA,1990,2017,28,580,13965,0,2755
21,COTTON,1990,2017,28,580,13965,0,8247
5,FINGER MILLET,1990,2017,28,580,13965,0,9516
10,GROUNDNUT,1990,2017,28,580,13965,0,4773
0,KHARIF SORGHUM,1990,2017,28,580,13965,0,6276
15,LINSEED,1990,2017,28,580,13965,0,7986
4,MAIZE,1990,2017,28,580,13965,0,1890
9,MINOR PULSES,1990,2017,28,580,13965,0,1533


## Export yield statistics for selected crops

In [25]:
def restructure_df(csv_path, selected_crops):

    """
    Build a long-format yield table (one row per crop, district and year)
    from the wide ICRISAT district-level crop statistics CSV.

    params:
    csv_path(str): india crop statistics file path (passed unchanged
                   to pd.read_csv)
    selected_crops(list): list of crops for which a
                          master dataframe will be created; each name must
                          match the prefix of the "<CROP> YIELD (Kg per ha)",
                          "<CROP> AREA (1000 ha)" and
                          "<CROP> PRODUCTION (1000 tons)" columns.
                          A single crop may also be given as a plain string.

    returns
    a dataframe containing yield information
    at district level, with the columns crop_name, country_code, adm_id,
    season_name, planting_year, planting_date, harvest_year, harvest_date,
    yield, production, planted_area, harvest_area.
    Area and production are multiplied by 1000 and yield is divided by 1000.
    Rows whose yield is not strictly positive are dropped.
    The row index of the CSV is kept, so it repeats once per selected crop.

    raises
    ValueError: if no crop is provided
    KeyError: if an expected column is missing from the CSV

    """

    # a bare string would otherwise be iterated character by character
    if isinstance(selected_crops, str):
        selected_crops = [selected_crops]

    if selected_crops is None or len(selected_crops) == 0:
        raise ValueError("no crop(s) provided")

    # read csv
    df = pd.read_csv(csv_path)

    # additional columns
    req_columns = ['Year', 'State Code', 'Dist Code']

    # create master df: one slice per crop, stacked on top of each other
    # (a single crop goes through the same path; concatenating one frame
    # returns that frame's rows unchanged)
    list_df = []

    for crop in selected_crops:
        crop_columns = {crop + ' YIELD (Kg per ha)': 'yield',
                        crop + ' AREA (1000 ha)': 'harvest_area',
                        crop + ' PRODUCTION (1000 tons)': 'production'}

        # select and rename fields
        crop_df = df[req_columns + list(crop_columns)]
        crop_df = crop_df.rename(columns={**crop_columns, 'Year': 'harvest_year'})

        # set crop name
        crop_df['crop_name'] = crop

        list_df.append(crop_df)

    master_df = pd.concat(list_df, axis=0)

    # add columns to conform to agml standard
    master_df['country_code'] = 'IN'
    for placeholder in ('season_name', 'planting_year', 'planting_date',
                        'planted_area', 'harvest_date'):
        master_df[placeholder] = 'N/A'

    # scale production and area according to multiply value in column description
    master_df['harvest_area'] = master_df['harvest_area'] * 1000
    master_df['production'] = master_df['production'] * 1000

    # convert yield in kg/ha to mt/ha
    master_df['yield'] = master_df['yield'] / 1000

    # create admin id format CC-SS-DDDD (country, state, district code)
    # state code is zero-padded to 2 characters, district code to 4
    state_code = master_df['State Code'].astype(str).str.zfill(2)
    dist_code = master_df['Dist Code'].astype(str).str.zfill(4)
    master_df['adm_id'] = master_df['country_code'] + '-' + state_code + '-' + dist_code

    # reorder to conform to agml standard; columns not listed here
    # ('State Code', 'Dist Code') are dropped by the reindex
    master_df = master_df.reindex(columns=['crop_name', 'country_code', 'adm_id', 'season_name',
                                           'planting_year', 'planting_date', 'harvest_year', 'harvest_date',
                                           'yield', 'production', 'planted_area', 'harvest_area'])

    # some records contain negative and 0 yield values
    master_df = master_df[master_df['yield'] > 0]

    return master_df

## Generate master dataframe

- The year and area values are assumed to refer to the harvest period, not to planting.
- The dataset documentation does not specify this.

In [26]:
# Crops to export; names must match the column prefixes used in the CSV.
selected_crops = ["MAIZE", "RICE", "WHEAT"]

# Reuse the data location configured in the data-exploration cell
# (data_path / filename) instead of repeating the hard-coded path.
yield_df = restructure_df(os.path.join(data_path, filename), selected_crops)

yield_df.head(10)

Unnamed: 0,crop_name,country_code,adm_id,season_name,planting_year,planting_date,harvest_year,harvest_date,yield,production,planted_area,harvest_area
0,MAIZE,IN,IN-14-0001,,,,1990,,0.5,100.0,,200.0
1,MAIZE,IN,IN-14-0001,,,,1991,,1.0,100.0,,100.0
2,MAIZE,IN,IN-14-0001,,,,1992,,2.0,200.0,,100.0
3,MAIZE,IN,IN-14-0001,,,,1993,,1.0,100.0,,100.0
4,MAIZE,IN,IN-14-0001,,,,1994,,1.0,200.0,,200.0
5,MAIZE,IN,IN-14-0001,,,,1995,,2.0,200.0,,100.0
6,MAIZE,IN,IN-14-0001,,,,1996,,1.0,100.0,,100.0
7,MAIZE,IN,IN-14-0001,,,,1997,,1.0,100.0,,100.0
8,MAIZE,IN,IN-14-0001,,,,1998,,1.0,100.0,,100.0
9,MAIZE,IN,IN-14-0001,,,,1999,,1.222,220.0,,180.0


## Save the data

In [11]:
# The export is currently disabled; uncomment the line below to write
# yield_df to YIELD_INDIA.csv (without the row index).
# yield_df.to_csv('/app/dev/AgML/' + "YIELD_INDIA.csv", index=False)