In [2]:
import pandas as pd

In [7]:

# Load the three source tables that get combined later in the notebook.
# The unpacking order matches the order of the paths below.
crop_data, ndvi_data, gpm_data = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in (
        '../data/crop_data.csv',
        '../data/ndvi_data.csv',
        '../data/gpm_data.csv',
    )
)


In [8]:
# Quick look at the crop table: number of observations and available columns.
row_count = crop_data.shape[0]
print("Number of rows:", row_count)
print(crop_data.columns)

Number of rows: 6887
Index(['year', 'state_name', 'county_name', 'Value'], dtype='object')


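By the end of this, we should have 6887 rows of observations, where each row represents a single county in a single year: that county's corn production value (the response variable) alongside the monthly NDVI/GPM data for its state in that year. Note that the NDVI and GPM tables are joined on year and state, so every county within the same state and year shares the same NDVI/GPM values, and the row count only stays at 6887 if every crop row finds a match in both tables.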

In [34]:
# Rename the crop table's columns so its join key matches the NDVI/GPM tables
# ('state') and the response variable has a descriptive name ('production').
# Reassigning instead of using inplace=True follows the pandas recommendation
# and leaves `crop_data` bound to the renamed frame for the cells below.
crop_data = crop_data.rename(columns={'state_name': 'state', 'Value': 'production'})

# Lower-case the state names in all three tables so the join does not depend
# on how each source capitalises them.
crop_data['state'] = crop_data['state'].str.lower()
ndvi_data['state'] = ndvi_data['state'].str.lower()
gpm_data['state'] = gpm_data['state'].str.lower()

# Join the crop data with the NDVI data and the GPM data on year and state.
# validate='many_to_one' makes pandas raise a MergeError if a (year, state)
# key appears more than once in the right-hand table; without it, duplicate
# keys would silently multiply the crop rows.
join_keys = ['year', 'state']
df = pd.merge(crop_data, ndvi_data, how='inner', on=join_keys, validate='many_to_one')
df = pd.merge(df, gpm_data, how='inner', on=join_keys, validate='many_to_one')

# An inner join drops crop rows whose (year, state) has no match in either
# table. Report that explicitly instead of letting the dataset shrink unnoticed.
dropped_rows = len(crop_data) - len(df)
if dropped_rows:
    print(
        f"WARNING: {dropped_rows} of {len(crop_data)} crop rows had no matching "
        "NDVI/GPM data and were dropped by the join"
    )


In [37]:
# Preview the first five rows of the merged frame.
df.head(n=5)

Unnamed: 0,year,state,county_name,production,sample_val_April,sample_val_May,sample_val_June,sample_val_July,sample_val_August,sample_val_September,...,max_val_October,max_val_November,precip_april,precip_may,precip_june,precip_july,precip_august,precip_september,precip_october,precip_november
0,2019,illinois,OTHER (COMBINED) COUNTIES,406.422491,0.299199,0.342944,0.440607,0.756683,0.859391,0.709855,...,0.452,0.307,141.291809,195.180389,136.249619,84.797669,102.31855,131.399902,122.606499,73.484634
1,2019,illinois,BUREAU,957.53139,0.299199,0.342944,0.440607,0.756683,0.859391,0.709855,...,0.452,0.307,141.291809,195.180389,136.249619,84.797669,102.31855,131.399902,122.606499,73.484634
2,2019,illinois,CARROLL,672.75623,0.299199,0.342944,0.440607,0.756683,0.859391,0.709855,...,0.452,0.307,141.291809,195.180389,136.249619,84.797669,102.31855,131.399902,122.606499,73.484634
3,2019,illinois,HENRY,919.251471,0.299199,0.342944,0.440607,0.756683,0.859391,0.709855,...,0.452,0.307,141.291809,195.180389,136.249619,84.797669,102.31855,131.399902,122.606499,73.484634
4,2019,illinois,JO DAVIESS,459.130408,0.299199,0.342944,0.440607,0.756683,0.859391,0.709855,...,0.452,0.307,141.291809,195.180389,136.249619,84.797669,102.31855,131.399902,122.606499,73.484634


In [38]:
# Some quick validation: summary statistics (count, mean, std, min, quartiles,
# max) for the numeric columns of the crop table.
crop_data.describe()

Unnamed: 0,year,production
count,6887.0,6887.0
mean,2016.922898,387.619045
std,2.588095,446.986319
min,2013.0,0.254014
25%,2015.0,120.428064
50%,2017.0,299.457172
75%,2019.0,563.987411
max,2021.0,19878.454274


In [39]:
# Summary statistics for the numeric columns of the GPM table
# (the year plus the monthly precip_* columns).
gpm_data.describe()

Unnamed: 0,year,precip_april,precip_may,precip_june,precip_july,precip_august,precip_september,precip_october,precip_november
count,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0
mean,2017.0,90.024325,121.514387,122.240312,102.391847,100.28389,83.281575,80.764214,58.060511
std,2.596454,43.822501,40.707701,44.157856,33.627497,35.863768,36.698151,34.25968,36.99598
min,2013.0,23.207767,51.717552,44.304073,40.313267,27.387964,21.809034,11.636484,6.271477
25%,2015.0,56.931439,98.192106,94.201893,74.062828,75.914818,57.990151,53.640978,27.655852
50%,2017.0,85.180126,115.526626,118.468403,97.255829,97.767464,75.217903,81.919281,52.298391
75%,2019.0,121.953539,142.508893,149.150452,126.862953,126.136488,101.308161,105.301156,78.812794
max,2021.0,252.642654,262.954773,255.499573,210.203522,183.939621,199.307434,165.044006,224.119049


In [40]:
# Summary statistics for the numeric columns of the NDVI table
# (the year plus the monthly sample/min/max value and sample-count columns).
ndvi_data.describe()

Unnamed: 0,year,sample_val_April,sample_val_May,sample_val_June,sample_val_July,sample_val_August,sample_val_September,sample_val_October,sample_val_November,sample_count_April,...,min_val_October,min_val_November,max_val_April,max_val_May,max_val_June,max_val_July,max_val_August,max_val_September,max_val_October,max_val_November
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,2017.5,0.290115,0.339554,0.536783,0.809182,0.825528,0.57208,0.348015,0.287308,834586.4,...,0.26434,0.2451,0.36252,0.4622,0.7941,0.8683,0.8709,0.7988,0.4526,0.33044
std,2.886751,0.037133,0.034531,0.073154,0.053997,0.043852,0.06486,0.043739,0.03129,715206.0,...,0.026792,0.018315,0.038986,0.054844,0.041149,0.031427,0.028613,0.048615,0.048636,0.034992
min,2013.0,0.232338,0.275682,0.405137,0.64394,0.669246,0.428737,0.282752,0.241037,87451.0,...,0.236,0.218,0.306,0.403,0.732,0.786,0.797,0.68,0.386,0.276
25%,2015.0,0.261735,0.315949,0.484361,0.784724,0.812654,0.527642,0.317946,0.264302,289016.0,...,0.243,0.228,0.331,0.428,0.775,0.856,0.86,0.794,0.432,0.307
50%,2017.5,0.284441,0.334982,0.529642,0.824586,0.839373,0.568382,0.339653,0.280414,549750.5,...,0.253,0.2455,0.368,0.456,0.785,0.878,0.8775,0.823,0.4455,0.326
75%,2020.0,0.318594,0.361767,0.578193,0.850085,0.853031,0.614535,0.363983,0.301226,1432098.0,...,0.284,0.258,0.39,0.466,0.828,0.891,0.89,0.824,0.455,0.345
max,2022.0,0.371543,0.430166,0.747198,0.887348,0.882953,0.729046,0.494029,0.377816,2709084.0,...,0.325,0.283,0.492,0.601,0.863,0.897,0.903,0.843,0.578,0.418


Awesome. We now have our combined data in a readable and understandable format. All that's left to do is to start modeling it.

In [41]:
# Persist the merged frame as CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='../data/dataset.csv', index=False)