# <u>Drought Prediction</u>: Test Set - Scale

In [1]:
#Import os, pandas, numpy, and StandardScaler
import os

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

#### Load the merged Training + Soil and Test/Validation + Soil datasets, parsing the `date` column of each as datetime.

In [2]:
# Directory that holds the project's CSV files.
# Set the DROUGHT_DATA_DIR environment variable to point the notebook at another copy of the
# data (for example a Windows path); when it is unset the original Linux location is used.
# os.path.join(..., '') guarantees a trailing path separator, because file names are appended
# to local_data by plain string concatenation here and when the scaled file is written.
local_data = os.path.join(
    os.environ.get('DROUGHT_DATA_DIR', '/home/chad/Data/Drought_Prediction/'),
    '')

# Load the dataset that contains training (meteorological variables) resampled weekly with mean, max, min
# and the soil variables that have been merged on the county 'fips' value.
# 'date' is parsed to datetime on load and the stored 'index' column becomes the row index.
tsm = pd.read_csv(local_data + 'train_soil_stats.csv',
                  parse_dates=['date'],
                  index_col=['index'],
                  header=0)

# Load the test/validation counterpart with the same parsing options.
testval = pd.read_csv(local_data + 'testval_soil_stats.csv',
                      parse_dates=['date'],
                      index_col=['index'],
                      header=0)

#### Confirm datasets are properly loaded and contain expected datatypes.

In [3]:
# Summarize the training frame: number of rows, column names, and dtypes.
tsm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2759904 entries, 0 to 2759903
Data columns (total 88 columns):
 #   Column            Dtype         
---  ------            -----         
 0   fips              int64         
 1   date              datetime64[ns]
 2   score             float64       
 3   PRECTOT_mean      float64       
 4   PS_mean           float64       
 5   QV2M_mean         float64       
 6   T2M_mean          float64       
 7   T2MDEW_mean       float64       
 8   T2MWET_mean       float64       
 9   T2M_MAX_mean      float64       
 10  T2M_MIN_mean      float64       
 11  T2M_RANGE_mean    float64       
 12  TS_mean           float64       
 13  WS10M_mean        float64       
 14  WS10M_MAX_mean    float64       
 15  WS10M_MIN_mean    float64       
 16  WS10M_RANGE_mean  float64       
 17  WS50M_mean        float64       
 18  WS50M_MAX_mean    float64       
 19  WS50M_MIN_mean    float64       
 20  WS50M_RANGE_mean  float64       
 21  PRECTOT_

In [4]:
# Summarize the test/validation frame: row count, column names, non-null counts, and dtypes.
testval.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 329448 entries, 0 to 329447
Data columns (total 88 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   fips              329448 non-null  int64         
 1   date              329448 non-null  datetime64[ns]
 2   score             329448 non-null  float64       
 3   PRECTOT_mean      329448 non-null  float64       
 4   PS_mean           329448 non-null  float64       
 5   QV2M_mean         329448 non-null  float64       
 6   T2M_mean          329448 non-null  float64       
 7   T2MDEW_mean       329448 non-null  float64       
 8   T2MWET_mean       329448 non-null  float64       
 9   T2M_MAX_mean      329448 non-null  float64       
 10  T2M_MIN_mean      329448 non-null  float64       
 11  T2M_RANGE_mean    329448 non-null  float64       
 12  TS_mean           329448 non-null  float64       
 13  WS10M_mean        329448 non-null  float64       
 14  WS10

In [5]:
# Preview the training frame (pandas renders a truncated view of the first and last rows).
tsm

Unnamed: 0_level_0,fips,date,score,PRECTOT_mean,PS_mean,QV2M_mean,T2M_mean,T2MDEW_mean,T2MWET_mean,T2M_MAX_mean,...,CULTRF_LAND,CULTIR_LAND,CULT_LAND,SQ1,SQ2,SQ3,SQ4,SQ5,SQ6,SQ7
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1001,2000-01-04,1.0,5.005000,100.375000,9.562500,15.330000,12.707500,12.710000,21.145000,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
1,1001,2000-01-11,2.0,5.684286,100.665714,5.951429,9.192857,4.908571,4.934286,15.732857,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
2,1001,2000-01-18,2.0,0.832857,101.275714,6.692857,10.118571,6.711429,6.735714,16.931429,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
3,1001,2000-01-25,2.0,3.641429,100.187143,4.220000,4.458571,-0.224286,-0.132857,10.217143,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
4,1001,2000-02-01,1.0,3.617143,100.992857,3.178571,0.764286,-2.791429,-2.714286,6.275714,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759899,56043,2016-12-06,0.0,0.471429,82.668571,1.991429,-5.320000,-11.062857,-10.812857,-0.422857,...,0.000000,0.000000,0.000000,1,1,1,1,1,1,1
2759900,56043,2016-12-13,0.0,0.350000,82.865714,1.884286,-7.621429,-12.220000,-11.961429,-2.532857,...,0.000000,0.000000,0.000000,1,1,1,1,1,1,1
2759901,56043,2016-12-20,0.0,0.812857,82.752857,1.757143,-9.688571,-13.777143,-13.368571,-3.805714,...,0.000000,0.000000,0.000000,1,1,1,1,1,1,1
2759902,56043,2016-12-27,0.0,0.751429,82.667143,2.022857,-6.534286,-11.001429,-10.767143,0.274286,...,0.000000,0.000000,0.000000,1,1,1,1,1,1,1


In [6]:
# Preview the test/validation frame (pandas renders a truncated view of the first and last rows).
testval

Unnamed: 0_level_0,fips,date,score,PRECTOT_mean,PS_mean,QV2M_mean,T2M_mean,T2MDEW_mean,T2MWET_mean,T2M_MAX_mean,...,CULTRF_LAND,CULTIR_LAND,CULT_LAND,SQ1,SQ2,SQ3,SQ4,SQ5,SQ6,SQ7
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1001,2019-01-01,0.0000,2.250000,100.510000,9.690000,14.710000,13.550000,13.520000,17.380000,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
1,1001,2019-01-08,0.0000,5.988571,100.325714,7.432857,11.331429,9.334286,9.168571,17.271429,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
2,1001,2019-01-15,0.0000,1.595714,100.844286,4.768571,5.620000,2.870000,2.715714,11.077143,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
3,1001,2019-01-22,0.0000,4.118571,100.597143,5.472857,6.492857,3.995714,3.752857,12.291429,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
4,1001,2019-01-29,0.0000,8.361429,100.404286,4.734286,5.395714,2.294286,2.107143,11.401429,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329443,56043,2020-12-08,3.5109,0.000000,83.890000,1.895714,-1.410000,-11.617143,-6.515714,5.211429,...,0.000000,0.000000,0.000000,1,1,1,1,1,1,1
329444,56043,2020-12-15,3.5182,1.014286,82.878571,2.211429,-4.488571,-9.947143,-7.217143,-0.511429,...,0.000000,0.000000,0.000000,1,1,1,1,1,1,1
329445,56043,2020-12-22,3.5182,0.800000,82.717143,2.850000,-1.897143,-6.667143,-4.281429,4.004286,...,0.000000,0.000000,0.000000,1,1,1,1,1,1,1
329446,56043,2020-12-29,3.5182,0.755714,83.038571,2.067143,-5.272857,-10.781429,-8.025714,0.561429,...,0.000000,0.000000,0.000000,1,1,1,1,1,1,1


In [7]:
# Column layout of the training frame: 'fips' (categorical), 'date', 'score' (target),
# followed by the independent numerical variables, which are the only columns to be scaled.
cols = list(tsm.columns)
features = cols[3:]

# Feature matrix and target column pulled out as NumPy arrays
Xtrain = tsm[features].values
ytrain = tsm[['score']].values

# Fit the standardizer on the training features only; the fitted scaler is reused on the test set
scaler = StandardScaler()
scaler.fit(Xtrain)

StandardScaler()

## Standardizing the features in the Test Set with the Scaler fitted to the Training Data

In [8]:
# Breaking out independent numerical variables from target variable, categorical variable ('fips'), and date.
cols = testval.columns.tolist()
features = cols[3:]

# The scaler was fitted on a bare NumPy array built from the training columns, so it matches
# columns by position only. If the test file had different or reordered feature columns, every
# value would be scaled with another variable's mean/variance without any error being raised.
# Fail loudly instead.
train_features = tsm.columns.tolist()[3:]
if features != train_features:
    raise ValueError(
        "Test feature columns do not match the training feature columns the scaler "
        "was fitted on (names and order must be identical).")

# Separating out the features
Xtest = testval.loc[:, features].values

# Separating out the target
ytest = testval.loc[:, ['score']].values

# Standardizing the features in the Test Set with the Scaler fitted to the Training Data
Xtest = scaler.transform(Xtest)

In [9]:
# Re-wrap every piece of the test set as its own DataFrame on testval's row index, so the
# pieces can be stitched back together column-wise after standardization.

# County number ('fips')
fips = testval['fips']
fips_df = fips.to_frame(name='fips')

# Observation date
date = testval['date']
date_df = date.to_frame(name='date')

# Target variable ('score'), taken from the array separated out before scaling
y_df = pd.DataFrame(data=ytest, columns=['score'], index=testval.index)

# Standardized features, labelled with their original column names
X_df = pd.DataFrame(data=Xtest, columns=features, index=testval.index)

In [10]:
# Reassemble the scaled test set in the original column order: fips, date, score, features.
# All four frames share testval's index, so a single column-wise concat lines the rows up
# and avoids copying the growing frame once per merge step.
testval_scaled = pd.concat([fips_df, date_df, y_df, X_df], axis=1)

In [11]:
# Preview the reassembled frame: fips, date, and score are unchanged, features are standardized.
testval_scaled

Unnamed: 0_level_0,fips,date,score,PRECTOT_mean,PS_mean,QV2M_mean,T2M_mean,T2MDEW_mean,T2MWET_mean,T2M_MAX_mean,...,CULTRF_LAND,CULTIR_LAND,CULT_LAND,SQ1,SQ2,SQ3,SQ4,SQ5,SQ6,SQ7
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1001,2019-01-01,0.0000,-0.129373,0.709964,0.428454,0.181972,0.696030,0.692427,-0.117209,...,0.872910,-0.242063,0.764426,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,0.703030
1,1001,2019-01-08,0.0000,1.097861,0.676018,-0.086815,-0.138726,0.251818,0.231693,-0.127058,...,0.872910,-0.242063,0.764426,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,0.703030
2,1001,2019-01-15,0.0000,-0.344150,0.771540,-0.695027,-0.680862,-0.429328,-0.451541,-0.688940,...,0.872910,-0.242063,0.764426,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,0.703030
3,1001,2019-01-22,0.0000,0.484010,0.726016,-0.534250,-0.598010,-0.310711,-0.341727,-0.578793,...,0.872910,-0.242063,0.764426,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,0.703030
4,1001,2019-01-29,0.0000,1.876782,0.690491,-0.702854,-0.702152,-0.489991,-0.515977,-0.659524,...,0.872910,-0.242063,0.764426,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,0.703030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329443,56043,2020-12-08,3.5109,-0.867964,-2.351457,-1.350853,-1.348159,-1.955847,-1.428973,-1.221018,...,-0.895355,-0.340677,-0.967218,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,-0.279884
329444,56043,2020-12-15,3.5182,-0.535012,-2.537763,-1.278781,-1.640381,-1.779878,-1.503241,-1.740137,...,-0.895355,-0.340677,-0.967218,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,-0.279884
329445,56043,2020-12-22,3.5182,-0.605354,-2.567499,-1.133005,-1.394399,-1.434262,-1.192405,-1.330518,...,-0.895355,-0.340677,-0.967218,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,-0.279884
329446,56043,2020-12-29,3.5182,-0.619891,-2.508291,-1.311719,-1.714827,-1.867787,-1.588853,-1.642819,...,-0.895355,-0.340677,-0.967218,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,-0.279884


In [12]:
# Confirm the reassembled frame has the same rows, columns, and dtypes as the unscaled test set.
testval_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 329448 entries, 0 to 329447
Data columns (total 88 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   fips              329448 non-null  int64         
 1   date              329448 non-null  datetime64[ns]
 2   score             329448 non-null  float64       
 3   PRECTOT_mean      329448 non-null  float64       
 4   PS_mean           329448 non-null  float64       
 5   QV2M_mean         329448 non-null  float64       
 6   T2M_mean          329448 non-null  float64       
 7   T2MDEW_mean       329448 non-null  float64       
 8   T2MWET_mean       329448 non-null  float64       
 9   T2M_MAX_mean      329448 non-null  float64       
 10  T2M_MIN_mean      329448 non-null  float64       
 11  T2M_RANGE_mean    329448 non-null  float64       
 12  TS_mean           329448 non-null  float64       
 13  WS10M_mean        329448 non-null  float64       
 14  WS10

In [13]:
# Write the scaled test/validation set into the data directory. The row index is saved under
# the 'index' header, the same column name the input files are read back with.
scaled_csv_path = local_data + 'testval_soil_stats_scaled.csv'
testval_scaled.to_csv(scaled_csv_path, index_label='index')