### Feature engineering
- Objectives: 
    - This notebook performs feature engineering: it merges the district-level rice yield data with monthly climate and NDVI data into a single analysis-ready table prior to modeling

In [15]:
# import modules
from pathlib import Path

import pandas as pd

In [29]:
# Read yield, climate, and NDVI data
# Root folder holding the processed inputs; edit this single line if the project is stored elsewhere
PROCESSED_DATA_DIR = Path(r'C:\Users\djava\OneDrive\Documents\Oxford\Projects\india_rice_early_warning\4_data\PROCESSED_DATA')

df_yield = pd.read_excel(PROCESSED_DATA_DIR / 'YIELD' / 'india_processed_yield.xlsx')
df_ndvi = pd.read_excel(PROCESSED_DATA_DIR / 'NDVI' / 'ndvi_rice_only.xlsx')
df_weather = pd.read_excel(PROCESSED_DATA_DIR / 'WEATHER' / 'india_districts_weather_data.xlsx')

#### Process yield data

In [30]:
# Tidy the yield columns in a single pass: discard 'Year', let 'Year End' become
# the 'year' key, and replace the unit-bearing headers with short names
yield_column_names = {
    'Year End': 'year',
    'Area (Hectare)': 'Area_ha',
    'Production (Tonnes)': 'Production_t',
    'Yield (Tonnes/Hectare)': 'Yield_t_ha',
}
df_yield = df_yield.drop(columns=['Year']).rename(columns=yield_column_names)

In [31]:
# Preview the yield table after the column clean-up
df_yield

Unnamed: 0,GID_2,State,District,year,Season,Area_ha,Production_t,Yield_t_ha
0,IND.11.12_1,Gujarat,Gandhinagar,2001,Kharif,7600.0,17100.0,2.250000
1,IND.11.12_1,Gujarat,Gandhinagar,2002,Kharif,7000.0,17500.0,2.500000
2,IND.11.12_1,Gujarat,Gandhinagar,2003,Kharif,4600.0,9900.0,2.152174
3,IND.11.12_1,Gujarat,Gandhinagar,2004,Kharif,13100.0,35200.0,2.687023
4,IND.11.12_1,Gujarat,Gandhinagar,2005,Kharif,13200.0,29100.0,2.204545
...,...,...,...,...,...,...,...,...
5115,IND.7.9_1,Chhattisgarh,Dhamtari,2016,Kharif,173361.0,314275.0,1.812836
5116,IND.7.9_1,Chhattisgarh,Dhamtari,2017,Kharif,183504.0,603476.0,3.288626
5117,IND.7.9_1,Chhattisgarh,Dhamtari,2018,Kharif,144868.0,311175.0,2.147990
5118,IND.7.9_1,Chhattisgarh,Dhamtari,2019,Kharif,178763.0,433106.0,2.422794


#### Process climate data

In [32]:
# Build a wide table of monthly weather variables, one row per (GID_2, year)

# Derive calendar year and month from the 'time' column
weather_times = pd.DatetimeIndex(df_weather['time'])
df_weather = df_weather.assign(year=weather_times.year, month=weather_times.month)

# Keep only the years 2001-2020 and the months May-November (bounds inclusive)
in_study_years = df_weather['year'].between(2001, 2020)
in_study_months = df_weather['month'].between(5, 11)
df_weather = df_weather[in_study_years & in_study_months]

# Spread the months across columns: each weather variable gets one column per month
# (pivot_table aggregates with the mean by default if a key/month pair repeats)
weather_variables = ['t2m', 'tp', 'lai_lv', 'pev', 'sp', 'swvl1']
df_weather_pivot = df_weather.pivot_table(index=['GID_2', 'year'], columns='month', values=weather_variables)

# Flatten the (variable, month) column pairs into names such as t2m_5, ..., tp_11, swvl1_11
df_weather_pivot.columns = [f'{variable}_{month}' for variable, month in df_weather_pivot.columns]

# Turn the GID_2/year index back into ordinary columns so they can be used as merge keys
df_weather_pivot = df_weather_pivot.reset_index()

In [33]:
# Preview the wide weather table (one row per GID_2 and year)
df_weather_pivot

Unnamed: 0,GID_2,year,lai_lv_5,lai_lv_6,lai_lv_7,lai_lv_8,lai_lv_9,lai_lv_10,lai_lv_11,pev_5,...,t2m_9,t2m_10,t2m_11,tp_5,tp_6,tp_7,tp_8,tp_9,tp_10,tp_11
0,IND.1.2_1,2001,0.721647,0.698776,0.667640,0.671411,0.732512,0.801982,0.822689,-0.000822,...,26.963776,26.681686,26.782242,0.008971,0.008086,0.008806,0.011354,0.008078,0.009862,0.003568
1,IND.1.2_1,2002,0.721647,0.698776,0.667640,0.671411,0.732512,0.801982,0.822689,-0.000985,...,26.789848,27.038239,26.910240,0.006937,0.006844,0.004978,0.008965,0.006565,0.007092,0.007167
2,IND.1.2_1,2003,0.721647,0.698776,0.667640,0.671411,0.732512,0.801982,0.822689,-0.001059,...,26.774086,26.863770,27.651878,0.006019,0.007235,0.013133,0.010763,0.007493,0.008225,0.000710
3,IND.1.2_1,2004,0.721647,0.698776,0.667640,0.671411,0.732512,0.801982,0.822689,-0.000762,...,26.950729,27.227112,27.423866,0.011527,0.011107,0.006972,0.010652,0.007157,0.006041,0.002525
4,IND.1.2_1,2005,0.721647,0.698776,0.667640,0.671411,0.732512,0.801982,0.822689,-0.000931,...,27.032806,26.786316,26.760773,0.006475,0.014033,0.011885,0.006864,0.014474,0.008864,0.006348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12975,IND.8.1_1,2016,2.418103,2.435053,2.459157,2.510069,2.605701,2.585986,2.506102,-0.006919,...,27.089325,27.075195,25.562042,0.000326,0.005848,0.014561,0.013302,0.011364,0.002862,0.000000
12976,IND.8.1_1,2017,2.418103,2.435053,2.459157,2.510069,2.605701,2.585986,2.506102,-0.007053,...,28.153534,28.014404,25.397888,0.000252,0.009998,0.019982,0.014579,0.006303,0.002598,0.000010
12977,IND.8.1_1,2018,2.418103,2.435053,2.459157,2.510069,2.605701,2.585986,2.506102,-0.007211,...,27.255646,29.066650,27.166504,0.000031,0.009285,0.020340,0.009168,0.001955,0.000294,0.000010
12978,IND.8.1_1,2019,2.418103,2.435053,2.459157,2.510069,2.605701,2.585986,2.506102,-0.007149,...,27.054535,28.079620,26.826263,0.000032,0.007067,0.016162,0.019371,0.016436,0.003365,0.001031


#### Process NDVI data

In [34]:
# Keep only the GID_2, time and NDVI columns of df_ndvi.
# .copy() makes the subset an independent frame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning
df_ndvi = df_ndvi[['GID_2', 'time', '_1_km_monthly_NDVI']].copy()

In [35]:
# Build a wide table of monthly NDVI values, one row per (GID_2, year)

# Derive calendar year and month from the 'time' column; assign() returns a new
# frame, so no column is written onto a slice of the original data
ndvi_times = pd.DatetimeIndex(df_ndvi['time'])
df_ndvi = df_ndvi.assign(year=ndvi_times.year, month=ndvi_times.month)

# Keep only relevant years and months
df_ndvi = df_ndvi.query('year >= 2001 and year <= 2020')
df_ndvi = df_ndvi.query('month >= 5 and month <= 11')

# Spread the months across columns: one NDVI column per month
df_ndvi_pivot = df_ndvi.pivot_table(index=['GID_2', 'year'], columns='month', values='_1_km_monthly_NDVI')

# Label each column from its own month value (5 -> NDVI_5, ..., 11 -> NDVI_11)
# rather than by position, so a month missing from the data cannot shift or
# break the naming
df_ndvi_pivot.columns = [f'NDVI_{int(month)}' for month in df_ndvi_pivot.columns]

# Turn the GID_2/year index back into ordinary columns so they can be used as merge keys
df_ndvi_pivot = df_ndvi_pivot.reset_index()

In [36]:
# Attach the monthly weather columns to every yield record, matching on district
# id and year; the left join keeps yield rows that have no weather match
df_yield_weather = pd.merge(df_yield, df_weather_pivot, how='left', on=['GID_2', 'year'])


In [37]:
# Attach the monthly NDVI columns in the same way (left join on district id and year)
df_yield_weather_ndvi = pd.merge(df_yield_weather, df_ndvi_pivot, how='left', on=['GID_2', 'year'])

#### Rename columns

In [39]:
# Give every monthly feature column a readable "<Variable> <Month>" name,
# e.g. t2m_5 -> Temperature May, tp_11 -> Precipitation November.
# The mapping is generated from the two tables below instead of being typed out
# column by column, so adding a variable or a month is a one-line change.

# Raw column prefix -> label used in the final table
VARIABLE_LABELS = {
    't2m': 'Temperature',
    'tp': 'Precipitation',
    'NDVI': 'NDVI',
    'lai_lv': 'LAI',
    'pev': 'PEV',
    'sp': 'SP',
    'swvl1': 'SWVL1',
}

# Month number used as the column suffix -> month name
MONTH_NAMES = {
    5: 'May',
    6: 'June',
    7: 'July',
    8: 'August',
    9: 'September',
    10: 'October',
    11: 'November',
}

column_labels = {
    f'{prefix}_{month}': f'{label} {month_name}'
    for prefix, label in VARIABLE_LABELS.items()
    for month, month_name in MONTH_NAMES.items()
}

# rename() leaves columns that are not in the mapping untouched
df_yield_weather_ndvi = df_yield_weather_ndvi.rename(columns=column_labels)

In [40]:
# Preview the merged yield, weather and NDVI table with its renamed columns
df_yield_weather_ndvi

Unnamed: 0,GID_2,State,District,year,Season,Area_ha,Production_t,Yield_t_ha,LAI May,LAI June,...,Precipitation September,Precipitation October,Precipitation November,NDVI May,NDVI June,NDVI July,NDVI August,NDVI September,NDVI October,NDVI November
0,IND.11.12_1,Gujarat,Gandhinagar,2001,Kharif,7600.0,17100.0,2.250000,1.290048,1.271173,...,0.000360,0.001253,0.000000,0.353912,0.326378,0.405922,0.609368,0.561315,0.473015,0.401144
1,IND.11.12_1,Gujarat,Gandhinagar,2002,Kharif,7000.0,17500.0,2.500000,1.290048,1.271173,...,0.001236,0.000000,0.000038,0.327856,0.320886,0.395484,0.586203,0.603006,0.467924,0.370233
2,IND.11.12_1,Gujarat,Gandhinagar,2003,Kharif,4600.0,9900.0,2.152174,1.290048,1.271173,...,0.001012,0.000011,0.000000,0.315325,0.299044,0.563758,0.596060,0.579310,0.530602,0.450795
3,IND.11.12_1,Gujarat,Gandhinagar,2004,Kharif,13100.0,35200.0,2.687023,1.290048,1.271173,...,0.000862,0.001163,0.000003,0.323904,0.318026,0.408389,0.578152,0.538458,0.515882,0.476042
4,IND.11.12_1,Gujarat,Gandhinagar,2005,Kharif,13200.0,29100.0,2.204545,1.290048,1.271173,...,0.009802,0.000006,0.000000,0.329544,0.311948,0.375546,0.561531,0.588265,0.565312,0.472057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5115,IND.7.9_1,Chhattisgarh,Dhamtari,2016,Kharif,173361.0,314275.0,1.812836,2.387267,2.421264,...,0.012280,0.003595,0.000037,0.340298,0.370670,0.491839,0.730852,0.765638,0.749241,0.563611
5116,IND.7.9_1,Chhattisgarh,Dhamtari,2017,Kharif,183504.0,603476.0,3.288626,2.387267,2.421264,...,0.007111,0.004265,0.000113,0.382216,0.403482,0.492632,0.753171,0.775475,0.734914,0.562131
5117,IND.7.9_1,Chhattisgarh,Dhamtari,2018,Kharif,144868.0,311175.0,2.147990,2.387267,2.421264,...,0.006590,0.000237,0.000016,0.330148,0.414741,0.565517,0.711291,0.754015,0.685228,0.503421
5118,IND.7.9_1,Chhattisgarh,Dhamtari,2019,Kharif,178763.0,433106.0,2.422794,2.387267,2.421264,...,0.012303,0.005617,0.000032,0.398533,0.358957,0.578997,0.648554,0.813679,0.772194,0.600879


In [41]:
# Save output to excel (the row index is not written)
ANALYSIS_READY_PATH = Path(r'C:\Users\djava\OneDrive\Documents\Oxford\Projects\india_rice_early_warning\4_data\PROCESSED_DATA\ANALYSIS_READY\india_yield_weather_ndvi.xlsx')

# Create the output folder if it does not exist yet, so the export cannot fail on a missing directory
ANALYSIS_READY_PATH.parent.mkdir(parents=True, exist_ok=True)
df_yield_weather_ndvi.to_excel(ANALYSIS_READY_PATH, index=False)