# <u>Drought Prediction</u>: Preprocessing - Resample [Mean, Max, Min], Merge

In [1]:
# Import pandas and numpy
import pandas as pd
import numpy as np

#### Load Training Dataset and Soil Dataset.  Convert Training Dataset date variable from object to datetime.

In [2]:
# Directory that holds the raw CSV files.
# local_data = 'D:\\Data_Science\\DroughtProject\\Data\\' # Location on Windows
local_data = '/home/chad/Data/Drought_Prediction/' # Location on Linux

# Meteorological training time series; the 'date' column is parsed to datetime while reading.
train_set = pd.read_csv(local_data + 'train_timeseries.csv', header=0, parse_dates=['date'])

# Soil variables, one row per county ('fips').
soil_set = pd.read_csv(local_data + 'soil_data.csv')

#### Confirm datasets are properly loaded and contain expected datatypes.

In [3]:
# Print the soil table's columns, non-null counts and dtypes.
soil_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3109 entries, 0 to 3108
Data columns (total 32 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fips           3109 non-null   int64  
 1   lat            3109 non-null   float64
 2   lon            3109 non-null   float64
 3   elevation      3109 non-null   int64  
 4   slope1         3109 non-null   float64
 5   slope2         3109 non-null   float64
 6   slope3         3109 non-null   float64
 7   slope4         3109 non-null   float64
 8   slope5         3109 non-null   float64
 9   slope6         3109 non-null   float64
 10  slope7         3109 non-null   float64
 11  slope8         3109 non-null   float64
 12  aspectN        3109 non-null   float64
 13  aspectE        3109 non-null   float64
 14  aspectS        3109 non-null   float64
 15  aspectW        3109 non-null   float64
 16  aspectUnknown  3109 non-null   float64
 17  WAT_LAND       3109 non-null   float64
 18  NVG_LAND

In [4]:
# Print the training table's columns, dtypes and memory usage; 'date' should be datetime64.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19300680 entries, 0 to 19300679
Data columns (total 21 columns):
 #   Column       Dtype         
---  ------       -----         
 0   fips         int64         
 1   date         datetime64[ns]
 2   PRECTOT      float64       
 3   PS           float64       
 4   QV2M         float64       
 5   T2M          float64       
 6   T2MDEW       float64       
 7   T2MWET       float64       
 8   T2M_MAX      float64       
 9   T2M_MIN      float64       
 10  T2M_RANGE    float64       
 11  TS           float64       
 12  WS10M        float64       
 13  WS10M_MAX    float64       
 14  WS10M_MIN    float64       
 15  WS10M_RANGE  float64       
 16  WS50M        float64       
 17  WS50M_MAX    float64       
 18  WS50M_MIN    float64       
 19  WS50M_RANGE  float64       
 20  score        float64       
dtypes: datetime64[ns](1), float64(19), int64(1)
memory usage: 3.0 GB


In [5]:
# Preview the first rows of the soil table.
soil_set.head()

Unnamed: 0,fips,lat,lon,elevation,slope1,slope2,slope3,slope4,slope5,slope6,...,CULTRF_LAND,CULTIR_LAND,CULT_LAND,SQ1,SQ2,SQ3,SQ4,SQ5,SQ6,SQ7
0,1001,32.536382,-86.64449,63,0.0419,0.2788,0.2984,0.2497,0.1142,0.017,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
1,1005,31.87067,-85.405456,146,0.0158,0.1868,0.5441,0.2424,0.0106,0.0003,...,72.578804,1.828159,74.40696,3,2,1,1,1,1,1
2,1003,30.659218,-87.746067,52,0.0746,0.437,0.4415,0.0469,0.0,0.0,...,59.843639,2.996914,62.840553,3,2,1,2,1,1,1
3,1007,33.015893,-87.127148,93,0.0144,0.1617,0.3714,0.3493,0.0898,0.0134,...,1.916593,0.00833,1.924924,3,2,1,1,1,1,1
4,1009,33.977448,-86.567246,198,0.005,0.0872,0.2799,0.3576,0.1477,0.1037,...,1.891909,0.027488,1.919397,3,2,1,1,1,1,1


In [6]:
# Preview the first rows of the training table.
train_set.head()

Unnamed: 0,fips,date,PRECTOT,PS,QV2M,T2M,T2MDEW,T2MWET,T2M_MAX,T2M_MIN,...,TS,WS10M,WS10M_MAX,WS10M_MIN,WS10M_RANGE,WS50M,WS50M_MAX,WS50M_MIN,WS50M_RANGE,score
0,1001,2000-01-01,0.22,100.51,9.65,14.74,13.51,13.51,20.96,11.46,...,14.65,2.2,2.94,1.49,1.46,4.85,6.04,3.23,2.81,
1,1001,2000-01-02,0.2,100.55,10.42,16.69,14.71,14.71,22.8,12.61,...,16.6,2.52,3.43,1.83,1.6,5.33,6.13,3.72,2.41,
2,1001,2000-01-03,3.65,100.15,11.76,18.49,16.52,16.52,22.73,15.32,...,18.41,4.03,5.33,2.66,2.67,7.53,9.52,5.87,3.66,
3,1001,2000-01-04,15.95,100.29,6.42,11.4,6.09,6.1,18.09,2.16,...,11.31,3.84,5.67,2.08,3.59,6.73,9.31,3.74,5.58,1.0
4,1001,2000-01-05,0.0,101.15,2.95,3.86,-3.29,-3.2,10.82,-2.66,...,2.65,1.6,2.5,0.52,1.98,2.94,4.85,0.65,4.19,


### Resample Meteorological Variables to weekly variables with non-null Score values.

In [7]:
# The score value is set on Tuesday, so each county's ('fips') rows are binned
# into weeks ending on Tuesday (previous Wednesday to Tuesday) and averaged.
weekly_by_county = train_set.groupby('fips').resample('W-TUE', on='date')
train_set_mean = weekly_by_county.mean()
train_set_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,fips,PRECTOT,PS,QV2M,T2M,T2MDEW,T2MWET,T2M_MAX,T2M_MIN,T2M_RANGE,TS,WS10M,WS10M_MAX,WS10M_MIN,WS10M_RANGE,WS50M,WS50M_MAX,WS50M_MIN,WS50M_RANGE,score
fips,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,2000-01-04,1001.0,5.005000,100.375000,9.562500,15.330000,12.707500,12.710000,21.145000,10.387500,10.752500,15.242500,3.147500,4.342500,2.015000,2.330000,6.110000,7.750000,4.140000,3.615000,1.0
1001,2000-01-11,1001.0,5.684286,100.665714,5.951429,9.192857,4.908571,4.934286,15.732857,3.071429,12.662857,8.558571,2.200000,3.305714,1.238571,2.065714,4.418571,6.588571,2.280000,4.307143,2.0
1001,2000-01-18,1001.0,0.832857,101.275714,6.692857,10.118571,6.711429,6.735714,16.931429,3.757143,13.177143,9.975714,2.352857,3.257143,1.490000,1.767143,4.870000,6.508571,2.812857,3.692857,2.0
1001,2000-01-25,1001.0,3.641429,100.187143,4.220000,4.458571,-0.224286,-0.132857,10.217143,-1.282857,11.498571,4.508571,2.932857,4.090000,1.814286,2.275714,5.270000,7.174286,3.575714,3.600000,2.0
1001,2000-02-01,1001.0,3.617143,100.992857,3.178571,0.764286,-2.791429,-2.714286,6.275714,-3.271429,9.547143,0.781429,2.362857,3.354286,1.318571,2.037143,4.372857,6.177143,2.580000,3.594286,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56043,2016-12-06,56043.0,0.471429,82.668571,1.991429,-5.320000,-11.062857,-10.812857,-0.422857,-9.621429,9.200000,-5.887143,3.517143,5.534286,1.837143,3.694286,5.210000,7.768571,2.905714,4.862857,0.0
56043,2016-12-13,56043.0,0.350000,82.865714,1.884286,-7.621429,-12.220000,-11.961429,-2.532857,-11.791429,9.260000,-8.364286,3.370000,5.521429,1.435714,4.085714,5.080000,7.635714,2.311429,5.322857,0.0
56043,2016-12-20,56043.0,0.812857,82.752857,1.757143,-9.688571,-13.777143,-13.368571,-3.805714,-15.451429,11.642857,-10.667143,3.735714,5.688571,1.648571,4.038571,5.287143,7.750000,2.740000,5.008571,0.0
56043,2016-12-27,56043.0,0.751429,82.667143,2.022857,-6.534286,-11.001429,-10.767143,0.274286,-11.805714,12.081429,-8.218571,4.237143,6.014286,2.238571,3.775714,6.342857,8.890000,3.630000,5.258571,0.0


In [8]:
# The Groupby returns a MultiIndex; display it to see its ('fips', 'date') levels.
train_set_mean.index

MultiIndex([( 1001, '2000-01-04'),
            ( 1001, '2000-01-11'),
            ( 1001, '2000-01-18'),
            ( 1001, '2000-01-25'),
            ( 1001, '2000-02-01'),
            ( 1001, '2000-02-08'),
            ( 1001, '2000-02-15'),
            ( 1001, '2000-02-22'),
            ( 1001, '2000-02-29'),
            ( 1001, '2000-03-07'),
            ...
            (56043, '2016-11-01'),
            (56043, '2016-11-08'),
            (56043, '2016-11-15'),
            (56043, '2016-11-22'),
            (56043, '2016-11-29'),
            (56043, '2016-12-06'),
            (56043, '2016-12-13'),
            (56043, '2016-12-20'),
            (56043, '2016-12-27'),
            (56043, '2017-01-03')],
           names=['fips', 'date'], length=2759904)

In [9]:
# 'fips' exists twice: as a level of the MultiIndex and as an aggregated column.
# The column is renamed first so that reset_index() can turn the index levels
# back into ordinary 'fips' and 'date' columns without a name clash.
train_set_mean = (
    train_set_mean
    .rename(columns={'fips': 'fips_copy'})
    .reset_index()
)
train_set_mean

Unnamed: 0,fips,date,fips_copy,PRECTOT,PS,QV2M,T2M,T2MDEW,T2MWET,T2M_MAX,...,TS,WS10M,WS10M_MAX,WS10M_MIN,WS10M_RANGE,WS50M,WS50M_MAX,WS50M_MIN,WS50M_RANGE,score
0,1001,2000-01-04,1001.0,5.005000,100.375000,9.562500,15.330000,12.707500,12.710000,21.145000,...,15.242500,3.147500,4.342500,2.015000,2.330000,6.110000,7.750000,4.140000,3.615000,1.0
1,1001,2000-01-11,1001.0,5.684286,100.665714,5.951429,9.192857,4.908571,4.934286,15.732857,...,8.558571,2.200000,3.305714,1.238571,2.065714,4.418571,6.588571,2.280000,4.307143,2.0
2,1001,2000-01-18,1001.0,0.832857,101.275714,6.692857,10.118571,6.711429,6.735714,16.931429,...,9.975714,2.352857,3.257143,1.490000,1.767143,4.870000,6.508571,2.812857,3.692857,2.0
3,1001,2000-01-25,1001.0,3.641429,100.187143,4.220000,4.458571,-0.224286,-0.132857,10.217143,...,4.508571,2.932857,4.090000,1.814286,2.275714,5.270000,7.174286,3.575714,3.600000,2.0
4,1001,2000-02-01,1001.0,3.617143,100.992857,3.178571,0.764286,-2.791429,-2.714286,6.275714,...,0.781429,2.362857,3.354286,1.318571,2.037143,4.372857,6.177143,2.580000,3.594286,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759899,56043,2016-12-06,56043.0,0.471429,82.668571,1.991429,-5.320000,-11.062857,-10.812857,-0.422857,...,-5.887143,3.517143,5.534286,1.837143,3.694286,5.210000,7.768571,2.905714,4.862857,0.0
2759900,56043,2016-12-13,56043.0,0.350000,82.865714,1.884286,-7.621429,-12.220000,-11.961429,-2.532857,...,-8.364286,3.370000,5.521429,1.435714,4.085714,5.080000,7.635714,2.311429,5.322857,0.0
2759901,56043,2016-12-20,56043.0,0.812857,82.752857,1.757143,-9.688571,-13.777143,-13.368571,-3.805714,...,-10.667143,3.735714,5.688571,1.648571,4.038571,5.287143,7.750000,2.740000,5.008571,0.0
2759902,56043,2016-12-27,56043.0,0.751429,82.667143,2.022857,-6.534286,-11.001429,-10.767143,0.274286,...,-8.218571,4.237143,6.014286,2.238571,3.775714,6.342857,8.890000,3.630000,5.258571,0.0


In [10]:
# Confirming expected column datatypes, overall size, memory usage, etc.
train_set_mean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2759904 entries, 0 to 2759903
Data columns (total 22 columns):
 #   Column       Dtype         
---  ------       -----         
 0   fips         int64         
 1   date         datetime64[ns]
 2   fips_copy    float64       
 3   PRECTOT      float64       
 4   PS           float64       
 5   QV2M         float64       
 6   T2M          float64       
 7   T2MDEW       float64       
 8   T2MWET       float64       
 9   T2M_MAX      float64       
 10  T2M_MIN      float64       
 11  T2M_RANGE    float64       
 12  TS           float64       
 13  WS10M        float64       
 14  WS10M_MAX    float64       
 15  WS10M_MIN    float64       
 16  WS10M_RANGE  float64       
 17  WS50M        float64       
 18  WS50M_MAX    float64       
 19  WS50M_MIN    float64       
 20  WS50M_RANGE  float64       
 21  score        float64       
dtypes: datetime64[ns](1), float64(20), int64(1)
memory usage: 463.2 MB


In [11]:
# The last Score value is NaN.  Filling that value with the last valid value.
# The forward fill is done per county ('fips') so a missing value can never be
# filled from the county listed just before it, and it uses .ffill() because
# fillna(method='ffill') is deprecated in recent pandas releases.
fill_cols = [col for col in train_set_mean.columns if col not in ('fips', 'date')]
train_set_mean[fill_cols] = train_set_mean.groupby('fips')[fill_cols].ffill()

In [12]:
# 'fips_copy' only duplicates 'fips' and is no longer needed, so remove it.
train_set_mean = train_set_mean.drop(columns='fips_copy')

In [13]:
# Confirming proper structure and expected output ('fips_copy' should be gone).
train_set_mean

Unnamed: 0,fips,date,PRECTOT,PS,QV2M,T2M,T2MDEW,T2MWET,T2M_MAX,T2M_MIN,...,TS,WS10M,WS10M_MAX,WS10M_MIN,WS10M_RANGE,WS50M,WS50M_MAX,WS50M_MIN,WS50M_RANGE,score
0,1001,2000-01-04,5.005000,100.375000,9.562500,15.330000,12.707500,12.710000,21.145000,10.387500,...,15.242500,3.147500,4.342500,2.015000,2.330000,6.110000,7.750000,4.140000,3.615000,1.0
1,1001,2000-01-11,5.684286,100.665714,5.951429,9.192857,4.908571,4.934286,15.732857,3.071429,...,8.558571,2.200000,3.305714,1.238571,2.065714,4.418571,6.588571,2.280000,4.307143,2.0
2,1001,2000-01-18,0.832857,101.275714,6.692857,10.118571,6.711429,6.735714,16.931429,3.757143,...,9.975714,2.352857,3.257143,1.490000,1.767143,4.870000,6.508571,2.812857,3.692857,2.0
3,1001,2000-01-25,3.641429,100.187143,4.220000,4.458571,-0.224286,-0.132857,10.217143,-1.282857,...,4.508571,2.932857,4.090000,1.814286,2.275714,5.270000,7.174286,3.575714,3.600000,2.0
4,1001,2000-02-01,3.617143,100.992857,3.178571,0.764286,-2.791429,-2.714286,6.275714,-3.271429,...,0.781429,2.362857,3.354286,1.318571,2.037143,4.372857,6.177143,2.580000,3.594286,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759899,56043,2016-12-06,0.471429,82.668571,1.991429,-5.320000,-11.062857,-10.812857,-0.422857,-9.621429,...,-5.887143,3.517143,5.534286,1.837143,3.694286,5.210000,7.768571,2.905714,4.862857,0.0
2759900,56043,2016-12-13,0.350000,82.865714,1.884286,-7.621429,-12.220000,-11.961429,-2.532857,-11.791429,...,-8.364286,3.370000,5.521429,1.435714,4.085714,5.080000,7.635714,2.311429,5.322857,0.0
2759901,56043,2016-12-20,0.812857,82.752857,1.757143,-9.688571,-13.777143,-13.368571,-3.805714,-15.451429,...,-10.667143,3.735714,5.688571,1.648571,4.038571,5.287143,7.750000,2.740000,5.008571,0.0
2759902,56043,2016-12-27,0.751429,82.667143,2.022857,-6.534286,-11.001429,-10.767143,0.274286,-11.805714,...,-8.218571,4.237143,6.014286,2.238571,3.775714,6.342857,8.890000,3.630000,5.258571,0.0


### Repeat process of resampling but use the max() value instead of mean()

In [14]:
# For each county ('fips'), since the score value is set on Tuesday,
# find the max for all variables from the week leading up to Tuesday: previous Wednesday to Tuesday.
# The aggregated 'fips' and 'date' columns duplicate the index levels, so they are
# renamed before reset_index() (to avoid a name clash) and then dropped.
train_set_max = (
    train_set.groupby('fips').resample('W-TUE', on='date').max()
    .rename(columns={'fips': 'fips_copy', 'date': 'date_copy'})
    .reset_index()
    .drop(columns=['fips_copy', 'date_copy'])
)

# Forward-fill missing values within each county only, so nothing is carried over
# from the county listed just before it.  .ffill() replaces the deprecated
# fillna(method='ffill').
fill_cols = [col for col in train_set_max.columns if col not in ('fips', 'date')]
train_set_max[fill_cols] = train_set_max.groupby('fips')[fill_cols].ffill()

train_set_max

Unnamed: 0,fips,date,PRECTOT,PS,QV2M,T2M,T2MDEW,T2MWET,T2M_MAX,T2M_MIN,...,TS,WS10M,WS10M_MAX,WS10M_MIN,WS10M_RANGE,WS50M,WS50M_MAX,WS50M_MIN,WS50M_RANGE,score
0,1001,2000-01-04,15.95,100.55,11.76,18.49,16.52,16.52,22.80,15.32,...,18.41,4.03,5.67,2.66,3.59,7.53,9.52,5.87,5.58,1.0
1,1001,2000-01-11,21.23,101.37,11.18,15.73,15.45,15.45,19.60,10.30,...,15.65,3.10,5.60,2.25,3.88,6.33,10.03,4.95,5.88,2.0
2,1001,2000-01-18,2.40,102.30,9.53,14.28,13.26,13.26,20.74,10.27,...,14.19,3.23,4.57,2.02,2.56,6.41,7.40,4.77,5.17,2.0
3,1001,2000-01-25,13.50,100.81,7.33,9.62,9.00,9.00,15.27,4.88,...,9.54,4.23,5.26,2.41,3.05,7.43,8.69,4.86,5.56,2.0
4,1001,2000-02-01,15.58,101.22,4.56,3.07,2.54,2.55,11.02,0.55,...,3.01,3.26,4.05,1.69,2.79,5.40,6.97,4.07,5.12,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759899,56043,2016-12-06,1.83,83.28,2.73,-1.52,-6.98,-6.91,3.31,-5.66,...,-2.53,5.65,9.52,3.23,6.88,8.51,11.63,5.74,7.14,0.0
2759900,56043,2016-12-13,0.98,83.56,3.90,0.06,-2.40,-2.39,4.19,-3.14,...,-1.18,4.99,9.22,3.33,7.26,7.64,11.76,5.76,8.69,0.0
2759901,56043,2016-12-20,4.14,83.36,3.63,-1.77,-3.57,-3.56,3.58,-7.77,...,-3.13,6.16,8.43,3.98,5.34,8.99,11.55,6.75,6.76,0.0
2759902,56043,2016-12-27,4.88,83.65,3.46,-1.34,-4.02,-4.00,5.22,-6.53,...,-2.70,5.90,9.25,3.61,7.79,8.58,10.90,5.92,8.40,0.0


### Repeat process of resampling but use the min() value instead of mean()

In [15]:
# TODO: create a function for this process (it is repeated for mean, max and min).

# For each county ('fips'), since the score value is set on Tuesday,
# find the min for all variables from the week leading up to Tuesday: previous Wednesday to Tuesday.
# The aggregated 'fips' and 'date' columns duplicate the index levels, so they are
# renamed before reset_index() (to avoid a name clash) and then dropped.
train_set_min = (
    train_set.groupby('fips').resample('W-TUE', on='date').min()
    .rename(columns={'fips': 'fips_copy', 'date': 'date_copy'})
    .reset_index()
    .drop(columns=['fips_copy', 'date_copy'])
)

# Forward-fill missing values within each county only, so nothing is carried over
# from the county listed just before it.  .ffill() replaces the deprecated
# fillna(method='ffill').
fill_cols = [col for col in train_set_min.columns if col not in ('fips', 'date')]
train_set_min[fill_cols] = train_set_min.groupby('fips')[fill_cols].ffill()

train_set_min

Unnamed: 0,fips,date,PRECTOT,PS,QV2M,T2M,T2MDEW,T2MWET,T2M_MAX,T2M_MIN,...,TS,WS10M,WS10M_MAX,WS10M_MIN,WS10M_RANGE,WS50M,WS50M_MAX,WS50M_MIN,WS50M_RANGE,score
0,1001,2000-01-04,0.20,100.15,6.42,11.40,6.09,6.10,18.09,2.16,...,11.31,2.20,2.94,1.49,1.46,4.85,6.04,3.23,2.41,1.0
1,1001,2000-01-11,0.00,99.78,2.95,3.86,-3.29,-3.20,10.82,-2.96,...,2.65,1.55,2.39,0.04,1.40,2.94,4.85,0.05,2.49,2.0
2,1001,2000-01-18,0.00,100.39,2.82,3.16,-3.74,-3.64,10.57,-3.19,...,3.13,1.39,2.18,0.55,1.24,2.78,5.34,0.93,2.55,2.0
3,1001,2000-01-25,0.00,99.87,2.05,-0.78,-7.93,-7.72,5.65,-5.46,...,-0.61,2.20,3.08,1.21,1.53,4.17,5.00,2.39,1.81,2.0
4,1001,2000-02-01,0.00,100.75,1.73,-2.30,-9.76,-9.46,2.51,-5.32,...,-2.10,1.90,2.74,0.88,1.35,3.75,5.04,1.27,2.14,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759899,56043,2016-12-06,0.00,82.25,0.91,-11.96,-18.65,-17.90,-6.85,-15.24,...,-11.98,1.65,2.36,0.73,1.63,2.24,3.30,0.83,2.34,0.0
2759900,56043,2016-12-13,0.02,82.13,0.82,-14.59,-19.45,-18.82,-11.73,-19.04,...,-14.34,1.93,3.30,0.15,1.87,2.25,3.89,0.21,3.05,0.0
2759901,56043,2016-12-20,0.07,81.69,0.51,-19.20,-24.45,-23.10,-13.77,-24.10,...,-20.13,2.02,3.20,0.10,2.38,2.71,4.85,0.09,3.47,0.0
2759902,56043,2016-12-27,0.00,81.72,1.45,-8.75,-14.14,-13.79,-1.71,-13.60,...,-10.68,2.94,4.16,1.01,1.44,4.70,6.76,1.38,2.37,0.0


### Merge Mean, Min, & Max Resample

In [16]:
# Tag every column with the statistic it holds ('_mean', '_max' or '_min') so that
# the column suffixes have a standard meaning once the tables are joined.
train_set_mean, train_set_max, train_set_min = (
    frame.add_suffix(tag)
    for frame, tag in (
        (train_set_mean, '_mean'),
        (train_set_max, '_max'),
        (train_set_min, '_min'),
    )
)

In [17]:
# Combine the weekly mean and max tables by matching county and week explicitly.
# A plain DataFrame.join() pairs rows by positional index, which is only correct
# while both tables happen to be in the same row order.  validate='one_to_one'
# raises if a (county, week) pair is duplicated on either side.
# Both sets of key columns are kept so the later cleanup of '*_max' keys still applies.
train_stats_temp = pd.merge(
    train_set_mean,
    train_set_max,
    left_on=['fips_mean', 'date_mean'],
    right_on=['fips_max', 'date_max'],
    how='inner',
    validate='one_to_one',
)
train_stats_temp

Unnamed: 0,fips_mean,date_mean,PRECTOT_mean,PS_mean,QV2M_mean,T2M_mean,T2MDEW_mean,T2MWET_mean,T2M_MAX_mean,T2M_MIN_mean,...,TS_max,WS10M_max,WS10M_MAX_max,WS10M_MIN_max,WS10M_RANGE_max,WS50M_max,WS50M_MAX_max,WS50M_MIN_max,WS50M_RANGE_max,score_max
0,1001,2000-01-04,5.005000,100.375000,9.562500,15.330000,12.707500,12.710000,21.145000,10.387500,...,18.41,4.03,5.67,2.66,3.59,7.53,9.52,5.87,5.58,1.0
1,1001,2000-01-11,5.684286,100.665714,5.951429,9.192857,4.908571,4.934286,15.732857,3.071429,...,15.65,3.10,5.60,2.25,3.88,6.33,10.03,4.95,5.88,2.0
2,1001,2000-01-18,0.832857,101.275714,6.692857,10.118571,6.711429,6.735714,16.931429,3.757143,...,14.19,3.23,4.57,2.02,2.56,6.41,7.40,4.77,5.17,2.0
3,1001,2000-01-25,3.641429,100.187143,4.220000,4.458571,-0.224286,-0.132857,10.217143,-1.282857,...,9.54,4.23,5.26,2.41,3.05,7.43,8.69,4.86,5.56,2.0
4,1001,2000-02-01,3.617143,100.992857,3.178571,0.764286,-2.791429,-2.714286,6.275714,-3.271429,...,3.01,3.26,4.05,1.69,2.79,5.40,6.97,4.07,5.12,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759899,56043,2016-12-06,0.471429,82.668571,1.991429,-5.320000,-11.062857,-10.812857,-0.422857,-9.621429,...,-2.53,5.65,9.52,3.23,6.88,8.51,11.63,5.74,7.14,0.0
2759900,56043,2016-12-13,0.350000,82.865714,1.884286,-7.621429,-12.220000,-11.961429,-2.532857,-11.791429,...,-1.18,4.99,9.22,3.33,7.26,7.64,11.76,5.76,8.69,0.0
2759901,56043,2016-12-20,0.812857,82.752857,1.757143,-9.688571,-13.777143,-13.368571,-3.805714,-15.451429,...,-3.13,6.16,8.43,3.98,5.34,8.99,11.55,6.75,6.76,0.0
2759902,56043,2016-12-27,0.751429,82.667143,2.022857,-6.534286,-11.001429,-10.767143,0.274286,-11.805714,...,-2.70,5.90,9.25,3.61,7.79,8.58,10.90,5.92,8.40,0.0


In [18]:
# Add the weekly min table, again matching county and week explicitly instead of
# relying on positional index alignment as DataFrame.join() does.
# validate='one_to_one' raises if a (county, week) pair is duplicated on either side.
# Both sets of key columns are kept so the later cleanup of '*_min' keys still applies.
train_stats = pd.merge(
    train_stats_temp,
    train_set_min,
    left_on=['fips_mean', 'date_mean'],
    right_on=['fips_min', 'date_min'],
    how='inner',
    validate='one_to_one',
)
train_stats

Unnamed: 0,fips_mean,date_mean,PRECTOT_mean,PS_mean,QV2M_mean,T2M_mean,T2MDEW_mean,T2MWET_mean,T2M_MAX_mean,T2M_MIN_mean,...,TS_min,WS10M_min,WS10M_MAX_min,WS10M_MIN_min,WS10M_RANGE_min,WS50M_min,WS50M_MAX_min,WS50M_MIN_min,WS50M_RANGE_min,score_min
0,1001,2000-01-04,5.005000,100.375000,9.562500,15.330000,12.707500,12.710000,21.145000,10.387500,...,11.31,2.20,2.94,1.49,1.46,4.85,6.04,3.23,2.41,1.0
1,1001,2000-01-11,5.684286,100.665714,5.951429,9.192857,4.908571,4.934286,15.732857,3.071429,...,2.65,1.55,2.39,0.04,1.40,2.94,4.85,0.05,2.49,2.0
2,1001,2000-01-18,0.832857,101.275714,6.692857,10.118571,6.711429,6.735714,16.931429,3.757143,...,3.13,1.39,2.18,0.55,1.24,2.78,5.34,0.93,2.55,2.0
3,1001,2000-01-25,3.641429,100.187143,4.220000,4.458571,-0.224286,-0.132857,10.217143,-1.282857,...,-0.61,2.20,3.08,1.21,1.53,4.17,5.00,2.39,1.81,2.0
4,1001,2000-02-01,3.617143,100.992857,3.178571,0.764286,-2.791429,-2.714286,6.275714,-3.271429,...,-2.10,1.90,2.74,0.88,1.35,3.75,5.04,1.27,2.14,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759899,56043,2016-12-06,0.471429,82.668571,1.991429,-5.320000,-11.062857,-10.812857,-0.422857,-9.621429,...,-11.98,1.65,2.36,0.73,1.63,2.24,3.30,0.83,2.34,0.0
2759900,56043,2016-12-13,0.350000,82.865714,1.884286,-7.621429,-12.220000,-11.961429,-2.532857,-11.791429,...,-14.34,1.93,3.30,0.15,1.87,2.25,3.89,0.21,3.05,0.0
2759901,56043,2016-12-20,0.812857,82.752857,1.757143,-9.688571,-13.777143,-13.368571,-3.805714,-15.451429,...,-20.13,2.02,3.20,0.10,2.38,2.71,4.85,0.09,3.47,0.0
2759902,56043,2016-12-27,0.751429,82.667143,2.022857,-6.534286,-11.001429,-10.767143,0.274286,-11.805714,...,-10.68,2.94,4.16,1.01,1.44,4.70,6.76,1.38,2.37,0.0


In [19]:
# FIPS, Date and Score carry the same values in the max and min tables as in the
# mean table, so their '_max' / '_min' versions are duplicates and are removed.
duplicate_cols = [
    f'{key}_{stat}'
    for stat in ('max', 'min')
    for key in ('fips', 'date', 'score')
]
train_stats = train_stats.drop(columns=duplicate_cols)

In [20]:
# Display the table after the duplicate key and score columns were dropped.
train_stats

Unnamed: 0,fips_mean,date_mean,PRECTOT_mean,PS_mean,QV2M_mean,T2M_mean,T2MDEW_mean,T2MWET_mean,T2M_MAX_mean,T2M_MIN_mean,...,T2M_RANGE_min,TS_min,WS10M_min,WS10M_MAX_min,WS10M_MIN_min,WS10M_RANGE_min,WS50M_min,WS50M_MAX_min,WS50M_MIN_min,WS50M_RANGE_min
0,1001,2000-01-04,5.005000,100.375000,9.562500,15.330000,12.707500,12.710000,21.145000,10.387500,...,7.41,11.31,2.20,2.94,1.49,1.46,4.85,6.04,3.23,2.41
1,1001,2000-01-11,5.684286,100.665714,5.951429,9.192857,4.908571,4.934286,15.732857,3.071429,...,9.31,2.65,1.55,2.39,0.04,1.40,2.94,4.85,0.05,2.49
2,1001,2000-01-18,0.832857,101.275714,6.692857,10.118571,6.711429,6.735714,16.931429,3.757143,...,9.06,3.13,1.39,2.18,0.55,1.24,2.78,5.34,0.93,2.55
3,1001,2000-01-25,3.641429,100.187143,4.220000,4.458571,-0.224286,-0.132857,10.217143,-1.282857,...,8.36,-0.61,2.20,3.08,1.21,1.53,4.17,5.00,2.39,1.81
4,1001,2000-02-01,3.617143,100.992857,3.178571,0.764286,-2.791429,-2.714286,6.275714,-3.271429,...,3.25,-2.10,1.90,2.74,0.88,1.35,3.75,5.04,1.27,2.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759899,56043,2016-12-06,0.471429,82.668571,1.991429,-5.320000,-11.062857,-10.812857,-0.422857,-9.621429,...,5.17,-11.98,1.65,2.36,0.73,1.63,2.24,3.30,0.83,2.34
2759900,56043,2016-12-13,0.350000,82.865714,1.884286,-7.621429,-12.220000,-11.961429,-2.532857,-11.791429,...,5.35,-14.34,1.93,3.30,0.15,1.87,2.25,3.89,0.21,3.05
2759901,56043,2016-12-20,0.812857,82.752857,1.757143,-9.688571,-13.777143,-13.368571,-3.805714,-15.451429,...,6.78,-20.13,2.02,3.20,0.10,2.38,2.71,4.85,0.09,3.47
2759902,56043,2016-12-27,0.751429,82.667143,2.022857,-6.534286,-11.001429,-10.767143,0.274286,-11.805714,...,9.98,-10.68,2.94,4.16,1.01,1.44,4.70,6.76,1.38,2.37


In [21]:
# The identifier and target columns are not statistics, so strip their '_mean' suffix.
plain_names = {f'{key}_mean': key for key in ('fips', 'date', 'score')}
train_stats = train_stats.rename(columns=plain_names)

In [22]:
# Check column names and dtypes after the rename.
train_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2759904 entries, 0 to 2759903
Data columns (total 57 columns):
 #   Column            Dtype         
---  ------            -----         
 0   fips              int64         
 1   date              datetime64[ns]
 2   PRECTOT_mean      float64       
 3   PS_mean           float64       
 4   QV2M_mean         float64       
 5   T2M_mean          float64       
 6   T2MDEW_mean       float64       
 7   T2MWET_mean       float64       
 8   T2M_MAX_mean      float64       
 9   T2M_MIN_mean      float64       
 10  T2M_RANGE_mean    float64       
 11  TS_mean           float64       
 12  WS10M_mean        float64       
 13  WS10M_MAX_mean    float64       
 14  WS10M_MIN_mean    float64       
 15  WS10M_RANGE_mean  float64       
 16  WS50M_mean        float64       
 17  WS50M_MAX_mean    float64       
 18  WS50M_MIN_mean    float64       
 19  WS50M_RANGE_mean  float64       
 20  score             float64       
 21  PRECTOT_

In [23]:
# Rearranging the columns so that fips, date, and score are the first three columns.
# The leading columns are picked by name rather than by position, so the ordering
# stays correct if the number of meteorological columns changes.
leading = ['fips', 'date', 'score']
cols = leading + [col for col in train_stats.columns if col not in leading]
cols

['fips',
 'date',
 'score',
 'PRECTOT_mean',
 'PS_mean',
 'QV2M_mean',
 'T2M_mean',
 'T2MDEW_mean',
 'T2MWET_mean',
 'T2M_MAX_mean',
 'T2M_MIN_mean',
 'T2M_RANGE_mean',
 'TS_mean',
 'WS10M_mean',
 'WS10M_MAX_mean',
 'WS10M_MIN_mean',
 'WS10M_RANGE_mean',
 'WS50M_mean',
 'WS50M_MAX_mean',
 'WS50M_MIN_mean',
 'WS50M_RANGE_mean',
 'PRECTOT_max',
 'PS_max',
 'QV2M_max',
 'T2M_max',
 'T2MDEW_max',
 'T2MWET_max',
 'T2M_MAX_max',
 'T2M_MIN_max',
 'T2M_RANGE_max',
 'TS_max',
 'WS10M_max',
 'WS10M_MAX_max',
 'WS10M_MIN_max',
 'WS10M_RANGE_max',
 'WS50M_max',
 'WS50M_MAX_max',
 'WS50M_MIN_max',
 'WS50M_RANGE_max',
 'PRECTOT_min',
 'PS_min',
 'QV2M_min',
 'T2M_min',
 'T2MDEW_min',
 'T2MWET_min',
 'T2M_MAX_min',
 'T2M_MIN_min',
 'T2M_RANGE_min',
 'TS_min',
 'WS10M_min',
 'WS10M_MAX_min',
 'WS10M_MIN_min',
 'WS10M_RANGE_min',
 'WS50M_min',
 'WS50M_MAX_min',
 'WS50M_MIN_min',
 'WS50M_RANGE_min']

In [24]:
# Apply the new column order.
train_stats = train_stats.loc[:, cols]
train_stats

Unnamed: 0,fips,date,score,PRECTOT_mean,PS_mean,QV2M_mean,T2M_mean,T2MDEW_mean,T2MWET_mean,T2M_MAX_mean,...,T2M_RANGE_min,TS_min,WS10M_min,WS10M_MAX_min,WS10M_MIN_min,WS10M_RANGE_min,WS50M_min,WS50M_MAX_min,WS50M_MIN_min,WS50M_RANGE_min
0,1001,2000-01-04,1.0,5.005000,100.375000,9.562500,15.330000,12.707500,12.710000,21.145000,...,7.41,11.31,2.20,2.94,1.49,1.46,4.85,6.04,3.23,2.41
1,1001,2000-01-11,2.0,5.684286,100.665714,5.951429,9.192857,4.908571,4.934286,15.732857,...,9.31,2.65,1.55,2.39,0.04,1.40,2.94,4.85,0.05,2.49
2,1001,2000-01-18,2.0,0.832857,101.275714,6.692857,10.118571,6.711429,6.735714,16.931429,...,9.06,3.13,1.39,2.18,0.55,1.24,2.78,5.34,0.93,2.55
3,1001,2000-01-25,2.0,3.641429,100.187143,4.220000,4.458571,-0.224286,-0.132857,10.217143,...,8.36,-0.61,2.20,3.08,1.21,1.53,4.17,5.00,2.39,1.81
4,1001,2000-02-01,1.0,3.617143,100.992857,3.178571,0.764286,-2.791429,-2.714286,6.275714,...,3.25,-2.10,1.90,2.74,0.88,1.35,3.75,5.04,1.27,2.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759899,56043,2016-12-06,0.0,0.471429,82.668571,1.991429,-5.320000,-11.062857,-10.812857,-0.422857,...,5.17,-11.98,1.65,2.36,0.73,1.63,2.24,3.30,0.83,2.34
2759900,56043,2016-12-13,0.0,0.350000,82.865714,1.884286,-7.621429,-12.220000,-11.961429,-2.532857,...,5.35,-14.34,1.93,3.30,0.15,1.87,2.25,3.89,0.21,3.05
2759901,56043,2016-12-20,0.0,0.812857,82.752857,1.757143,-9.688571,-13.777143,-13.368571,-3.805714,...,6.78,-20.13,2.02,3.20,0.10,2.38,2.71,4.85,0.09,3.47
2759902,56043,2016-12-27,0.0,0.751429,82.667143,2.022857,-6.534286,-11.001429,-10.767143,0.274286,...,9.98,-10.68,2.94,4.16,1.01,1.44,4.70,6.76,1.38,2.37


In [25]:
# Summary statistics for the numeric columns.
train_stats.describe()

Unnamed: 0,fips,score,PRECTOT_mean,PS_mean,QV2M_mean,T2M_mean,T2MDEW_mean,T2MWET_mean,T2M_MAX_mean,T2M_MIN_mean,...,T2M_RANGE_min,TS_min,WS10M_min,WS10M_MAX_min,WS10M_MIN_min,WS10M_RANGE_min,WS50M_min,WS50M_MAX_min,WS50M_MIN_min,WS50M_RANGE_min
count,2759904.0,2759904.0,2759904.0,2759904.0,2759904.0,2759904.0,2759904.0,2759904.0,2759904.0,2759904.0,...,2759904.0,2759904.0,2759904.0,2759904.0,2759904.0,2759904.0,2759904.0,2759904.0,2759904.0,2759904.0
mean,30670.38,0.8247404,2.644114,96.65571,7.81315,12.79292,6.944456,6.980326,18.67214,7.40381,...,7.41751,9.267097,2.177891,3.323689,0.6618041,1.829856,3.471457,5.594512,0.9430591,2.699401
std,14979.11,1.223709,3.04634,5.428852,4.380519,10.53505,9.490316,9.444574,11.02417,10.14268,...,2.983333,11.72647,0.921192,1.393489,0.5522409,0.9059153,1.220201,1.703761,0.8109002,1.012858
min,1001.0,0.0,0.0,67.35286,0.3485714,-28.36286,-28.70143,-28.60429,-23.09143,-34.78429,...,0.03,-38.23,0.25,0.54,0.0,0.16,0.4,0.7,0.0,0.32
25%,19044.5,0.0,0.4642857,95.89714,4.081429,4.812857,-0.44,-0.3914286,10.66714,-0.3557143,...,5.25,0.45,1.47,2.22,0.26,1.12,2.56,4.33,0.37,1.97
50%,29212.0,0.0,1.687143,98.31429,6.96,13.74571,7.367143,7.378571,20.07429,7.837143,...,7.21,9.72,2.0,3.04,0.5,1.64,3.3,5.37,0.7,2.52
75%,46007.5,1.1902,3.777143,99.93714,11.10714,21.63714,14.92,14.92143,27.62429,15.82,...,9.37,19.03,2.73,4.19,0.9,2.37,4.21,6.71,1.28,3.25
max,56043.0,5.0,53.35429,103.0429,21.15143,39.02429,26.18,26.18,47.09714,31.86286,...,22.4,40.32,9.59,13.5,8.53,9.32,12.04,15.81,8.86,11.43


### Directly using Merge to correctly join on specified column

In [26]:
# Attach the time-invariant soil variables to every weekly row of the same county.
# validate='many_to_one' raises if 'fips' is duplicated in soil_set, which would
# otherwise silently duplicate weekly rows.
train_soil_stats = pd.merge(
    train_stats,
    soil_set,
    on='fips',
    how='inner',
    validate='many_to_one',
)

# An inner merge drops weekly rows whose county has no soil record; fail loudly if that happens.
assert len(train_soil_stats) == len(train_stats), 'some counties have no matching soil record'

In [27]:
# Display the merged weather + soil table.
train_soil_stats

Unnamed: 0,fips,date,score,PRECTOT_mean,PS_mean,QV2M_mean,T2M_mean,T2MDEW_mean,T2MWET_mean,T2M_MAX_mean,...,CULTRF_LAND,CULTIR_LAND,CULT_LAND,SQ1,SQ2,SQ3,SQ4,SQ5,SQ6,SQ7
0,1001,2000-01-04,1.0,5.005000,100.375000,9.562500,15.330000,12.707500,12.710000,21.145000,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
1,1001,2000-01-11,2.0,5.684286,100.665714,5.951429,9.192857,4.908571,4.934286,15.732857,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
2,1001,2000-01-18,2.0,0.832857,101.275714,6.692857,10.118571,6.711429,6.735714,16.931429,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
3,1001,2000-01-25,2.0,3.641429,100.187143,4.220000,4.458571,-0.224286,-0.132857,10.217143,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
4,1001,2000-02-01,1.0,3.617143,100.992857,3.178571,0.764286,-2.791429,-2.714286,6.275714,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759899,56043,2016-12-06,0.0,0.471429,82.668571,1.991429,-5.320000,-11.062857,-10.812857,-0.422857,...,0.000000,0.000000,0.000000,1,1,1,1,1,1,1
2759900,56043,2016-12-13,0.0,0.350000,82.865714,1.884286,-7.621429,-12.220000,-11.961429,-2.532857,...,0.000000,0.000000,0.000000,1,1,1,1,1,1,1
2759901,56043,2016-12-20,0.0,0.812857,82.752857,1.757143,-9.688571,-13.777143,-13.368571,-3.805714,...,0.000000,0.000000,0.000000,1,1,1,1,1,1,1
2759902,56043,2016-12-27,0.0,0.751429,82.667143,2.022857,-6.534286,-11.001429,-10.767143,0.274286,...,0.000000,0.000000,0.000000,1,1,1,1,1,1,1


#### There are the same number of rows in the training set and the merged dataset.

In [28]:
# Check the row count and column dtypes of the merged table.
train_soil_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2759904 entries, 0 to 2759903
Data columns (total 88 columns):
 #   Column            Dtype         
---  ------            -----         
 0   fips              int64         
 1   date              datetime64[ns]
 2   score             float64       
 3   PRECTOT_mean      float64       
 4   PS_mean           float64       
 5   QV2M_mean         float64       
 6   T2M_mean          float64       
 7   T2MDEW_mean       float64       
 8   T2MWET_mean       float64       
 9   T2M_MAX_mean      float64       
 10  T2M_MIN_mean      float64       
 11  T2M_RANGE_mean    float64       
 12  TS_mean           float64       
 13  WS10M_mean        float64       
 14  WS10M_MAX_mean    float64       
 15  WS10M_MIN_mean    float64       
 16  WS10M_RANGE_mean  float64       
 17  WS50M_mean        float64       
 18  WS50M_MAX_mean    float64       
 19  WS50M_MIN_mean    float64       
 20  WS50M_RANGE_mean  float64       
 21  PRECTOT_

### Exporting the Merged Training and Soil Dataset 

In [30]:
# Export the merged table: training (meteorological) data resampled to weekly
# mean, max and min values, plus the soil data that does not vary with time.
# The index is written as a column labelled 'index'.
export_path = local_data + 'train_soil_stats.csv'
train_soil_stats.to_csv(export_path, index_label='index')