# District-level crop statistics of India

### Data access
The data is provided by ICRISAT and can be freely accessed at http://data.icrisat.org/dld/src/crops.html 

### Data exploration

In [124]:
import pandas as pd
import os

# Directory and file name of the ICRISAT district-level CSV export.
data_path = '/app/dev/AgML'
filename = 'ICRISAT-District Level Data.csv'

# Read the whole table into memory, then preview its first rows.
crop_stats_path = os.path.join(data_path, filename)
crop_stats_df = pd.read_csv(crop_stats_path)
crop_stats_df.head()

Unnamed: 0,Dist Code,Year,State Code,State Name,Dist Name,RICE AREA (1000 ha),RICE PRODUCTION (1000 tons),RICE YIELD (Kg per ha),WHEAT AREA (1000 ha),WHEAT PRODUCTION (1000 tons),...,SUGARCANE YIELD (Kg per ha),COTTON AREA (1000 ha),COTTON PRODUCTION (1000 tons),COTTON YIELD (Kg per ha),FRUITS AREA (1000 ha),VEGETABLES AREA (1000 ha),FRUITS AND VEGETABLES AREA (1000 ha),POTATOES AREA (1000 ha),ONION AREA (1000 ha),FODDER AREA (1000 ha)
0,1,1990,14,Chhattisgarh,Durg,397.9,481.4,1210.0,18.2,13.4,...,0.0,0.0,0.0,0.0,2.18,9.61,11.79,0.03,0.05,-1.0
1,1,1991,14,Chhattisgarh,Durg,393.2,508.6,1293.0,18.3,11.8,...,1250.0,0.0,0.0,0.0,2.41,9.93,12.34,0.04,0.15,-1.0
2,1,1992,14,Chhattisgarh,Durg,398.4,514.5,1291.0,17.1,10.7,...,1667.0,0.01,0.0,0.0,2.23,10.85,13.07,0.03,0.26,-1.0
3,1,1993,14,Chhattisgarh,Durg,410.2,569.1,1387.0,17.0,12.1,...,2500.0,0.03,0.0,0.0,1.99,11.17,13.16,0.04,0.27,-1.0
4,1,1994,14,Chhattisgarh,Durg,430.1,601.7,1399.0,17.5,14.2,...,1000.0,0.0,0.0,0.0,2.07,11.02,13.09,0.03,0.22,-1.0


## Summary by crop

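The summary below covers the crops listed in `selected_crop`, each of which has a `YIELD (Kg per ha)` column in the dataset. Note that rice and wheat also have yield columns (see the preview above) but are not part of this list.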

In [128]:
# Crops summarised below; each name must have a matching
# '<CROP> YIELD (Kg per ha)' column in crop_stats_df.
selected_crop = [
    'KHARIF SORGHUM', 'RABI SORGHUM', 'SORGHUM', 'PEARL MILLET', 'MAIZE',
    'FINGER MILLET', 'BARLEY', 'CHICKPEA', 'PIGEONPEA', 'MINOR PULSES',
    'GROUNDNUT', 'SESAMUM', 'RAPESEED AND MUSTARD', 'SAFFLOWER', 'CASTOR',
    'LINSEED', 'SUNFLOWER', 'SOYABEAN', 'OILSEEDS', 'SUGARCANE', 'COTTON',
]

# The year span, row count and district count are read from columns shared by
# every crop, so they are computed once instead of once per loop iteration.
list_years = sorted(crop_stats_df['Year'].unique())
min_year = list_years[0] if list_years else None   # None when the table is empty
max_year = list_years[-1] if list_years else None
data_size = crop_stats_df.shape[0]
# NOTE(review): districts are counted by name. If two states can share a
# district name this undercounts; 'Dist Code' may be the better key - confirm.
district_counts = len(crop_stats_df['Dist Name'].unique())

summary = {'CROP': [],
           'MIN_YEAR': [],
           'MAX_YEAR': [],
           'NUM_YEARS': [],
           'NUM_DISTRICTS': [],
           'DATA_SIZE': [],
           'NAN_IN_YIELD': []
          }

for crop in selected_crop:
    yield_column = crop + ' YIELD (Kg per ha)'

    summary['CROP'].append(crop)
    summary['MIN_YEAR'].append(min_year)
    summary['MAX_YEAR'].append(max_year)
    summary['NUM_YEARS'].append(len(list_years))
    summary['NUM_DISTRICTS'].append(district_counts)
    summary['DATA_SIZE'].append(data_size)
    # NOTE(review): only true NaN cells are counted here. The data preview
    # shows -1.0 in 'FODDER AREA (1000 ha)', which looks like a missing-data
    # sentinel; if yield columns use the same encoding, those rows are not
    # reflected in this count - confirm against the dataset documentation.
    summary['NAN_IN_YIELD'].append(crop_stats_df[yield_column].isnull().sum())

# One row per crop, ordered alphabetically by crop name.
district_summary = pd.DataFrame(summary).sort_values(by='CROP')
district_summary

Unnamed: 0,CROP,MIN_YEAR,MAX_YEAR,NUM_YEARS,NUM_DISTRICTS,DATA_SIZE,NAN_IN_YIELD
6,BARLEY,1990,2017,28,580,13965,0
14,CASTOR,1990,2017,28,580,13965,0
7,CHICKPEA,1990,2017,28,580,13965,0
20,COTTON,1990,2017,28,580,13965,0
5,FINGER MILLET,1990,2017,28,580,13965,0
10,GROUNDNUT,1990,2017,28,580,13965,0
0,KHARIF SORGHUM,1990,2017,28,580,13965,0
15,LINSEED,1990,2017,28,580,13965,0
4,MAIZE,1990,2017,28,580,13965,0
9,MINOR PULSES,1990,2017,28,580,13965,0


## Export yield statistics for selected crops

In [57]:
def filter_df(csv_path, selected_crops):
    """
    Build a long-format yield table for the requested crops.

    params:
    csv_path(str): india crop statistics file path
    selected_crops(list): crop names as they appear in the CSV header,
                          i.e. the part before ' YIELD (Kg per ha)'.
                          A single crop may also be passed as a plain string.

    returns
    a dataframe with the columns YEAR, DISTRICT_CODE, DISTRICT_NAME, YIELD
    and CROP_NAME, holding one block of rows per selected crop. Row labels
    from the CSV read are kept, so they repeat once per crop.

    raises
    ValueError: if selected_crops is None or empty
    KeyError: if a required column is missing from the CSV
    """

    # A bare string would otherwise be iterated character by character.
    if isinstance(selected_crops, str):
        selected_crops = [selected_crops]

    # Validate before reading the file so a bad call fails fast.
    if selected_crops is None or len(selected_crops) == 0:
        raise ValueError("no crop(s) provided")

    # read csv
    df = pd.read_csv(csv_path)

    # identifier columns carried into the output, and their output names
    rename_mapping = {
        'Year': 'YEAR',
        'Dist Code': 'DISTRICT_CODE',
        'Dist Name': 'DISTRICT_NAME'
    }
    req_columns = list(rename_mapping)

    # Report every missing column at once instead of failing on the first one.
    needed = req_columns + [crop + ' YIELD (Kg per ha)' for crop in selected_crops]
    missing = [column for column in needed if column not in df.columns]
    if missing:
        raise KeyError("column(s) not found in {}: {}".format(csv_path, missing))

    # One frame per crop; a single crop is simply a one-element list, so no
    # separate code path is needed.
    list_df = []
    for crop in selected_crops:
        yield_column = crop + ' YIELD (Kg per ha)'

        column_map = dict(rename_mapping)
        column_map[yield_column] = 'YIELD'

        crop_df = (
            df[req_columns + [yield_column]]
            .rename(columns=column_map)
            .assign(CROP_NAME=crop)
        )
        list_df.append(crop_df)

    return pd.concat(list_df, axis=0)

In [60]:
selected_crops = ["MAIZE", "RICE", "WHEAT"]
# Reuse the data location defined in the loading cell instead of repeating the
# literal path; this resolves to the same file.
yield_df = filter_df(os.path.join(data_path, filename), selected_crops)

print(yield_df.head())

   YEAR  DISTRICT_CODE DISTRICT_NAME   YIELD CROP_NAME
0  1990              1          Durg   500.0     MAIZE
1  1991              1          Durg  1000.0     MAIZE
2  1992              1          Durg  2000.0     MAIZE
3  1993              1          Durg  1000.0     MAIZE
4  1994              1          Durg  1000.0     MAIZE


## Save the data

In [61]:
# The export is currently disabled (commented out). Uncomment the line below
# to write yield_df to YIELD_INDIA.csv under /app/dev/AgML/.
# yield_df.to_csv('/app/dev/AgML/' + "YIELD_INDIA.csv")