# <u>Drought Prediction</u>: Preprocessing - Scale

In [1]:
#Import pandas, numpy, and StandardScaler
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

#### Load the combined Training and Soil Dataset.  Parse the date variable as datetime (rather than object) while loading.

In [2]:
# Directory that holds the project data (machine-specific; enable the line for your OS)
# local_data = 'D:\\Data_Science\\DroughtProject\\Data\\' # Location on Windows
local_data = '/home/chad/Data/Drought_Prediction/' # Location on Linux

# The input file holds the training (meteorological) variables resampled weekly
# with mean, max, min, already merged with the soil variables on the county 'fips' value.
train_soil_path = local_data + 'train_soil_stats.csv'

# Read it with the first row as the header, the 'index' column as the row index,
# and 'date' parsed to datetime while loading.
tsm = pd.read_csv(
    train_soil_path,
    header=0,
    index_col=['index'],
    parse_dates=['date'],
)

#### Confirm the dataset is properly loaded and contains the expected datatypes.

In [3]:
# Print a summary of the loaded frame: index range, column names, and dtypes.
tsm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2759904 entries, 0 to 2759903
Data columns (total 88 columns):
 #   Column            Dtype         
---  ------            -----         
 0   fips              int64         
 1   date              datetime64[ns]
 2   score             float64       
 3   PRECTOT_mean      float64       
 4   PS_mean           float64       
 5   QV2M_mean         float64       
 6   T2M_mean          float64       
 7   T2MDEW_mean       float64       
 8   T2MWET_mean       float64       
 9   T2M_MAX_mean      float64       
 10  T2M_MIN_mean      float64       
 11  T2M_RANGE_mean    float64       
 12  TS_mean           float64       
 13  WS10M_mean        float64       
 14  WS10M_MAX_mean    float64       
 15  WS10M_MIN_mean    float64       
 16  WS10M_RANGE_mean  float64       
 17  WS50M_mean        float64       
 18  WS50M_MAX_mean    float64       
 19  WS50M_MIN_mean    float64       
 20  WS50M_RANGE_mean  float64       
 21  PRECTOT_

In [4]:
# Display the loaded frame (rendered as a truncated preview of the first and last rows).
tsm

Unnamed: 0_level_0,fips,date,score,PRECTOT_mean,PS_mean,QV2M_mean,T2M_mean,T2MDEW_mean,T2MWET_mean,T2M_MAX_mean,...,CULTRF_LAND,CULTIR_LAND,CULT_LAND,SQ1,SQ2,SQ3,SQ4,SQ5,SQ6,SQ7
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1001,2000-01-04,1.0,5.005000,100.375000,9.562500,15.330000,12.707500,12.710000,21.145000,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
1,1001,2000-01-11,2.0,5.684286,100.665714,5.951429,9.192857,4.908571,4.934286,15.732857,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
2,1001,2000-01-18,2.0,0.832857,101.275714,6.692857,10.118571,6.711429,6.735714,16.931429,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
3,1001,2000-01-25,2.0,3.641429,100.187143,4.220000,4.458571,-0.224286,-0.132857,10.217143,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
4,1001,2000-02-01,1.0,3.617143,100.992857,3.178571,0.764286,-2.791429,-2.714286,6.275714,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759899,56043,2016-12-06,0.0,0.471429,82.668571,1.991429,-5.320000,-11.062857,-10.812857,-0.422857,...,0.000000,0.000000,0.000000,1,1,1,1,1,1,1
2759900,56043,2016-12-13,0.0,0.350000,82.865714,1.884286,-7.621429,-12.220000,-11.961429,-2.532857,...,0.000000,0.000000,0.000000,1,1,1,1,1,1,1
2759901,56043,2016-12-20,0.0,0.812857,82.752857,1.757143,-9.688571,-13.777143,-13.368571,-3.805714,...,0.000000,0.000000,0.000000,1,1,1,1,1,1,1
2759902,56043,2016-12-27,0.0,0.751429,82.667143,2.022857,-6.534286,-11.001429,-10.767143,0.274286,...,0.000000,0.000000,0.000000,1,1,1,1,1,1,1


In [5]:
# Breaking out independent numerical variables from target variable, categorical variable ('fips'), and date.
# The non-feature columns are excluded by name rather than by position, so the split
# stays correct even if the column order of the input file changes.
non_feature_cols = ['fips', 'date', 'score']
cols = tsm.columns.tolist()
features = [col for col in cols if col not in non_feature_cols]

# Separating out the features (2-D array, one column per feature)
X = tsm.loc[:, features].values

# Separating out the target (kept 2-D so it can be rebuilt as a one-column frame)
y = tsm.loc[:, ['score']].values

# Standardizing the features.
# The fitted scaler is kept in `scaler` so the exact same mean/scale learned here can be
# re-applied (scaler.transform) to validation/test data instead of being thrown away.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [6]:
# Rebuild labelled pieces of the original frame so they can be stitched back together
# after standardization. Every piece carries tsm's index.

# County number (fips) as a one-column frame
fips = tsm['fips']
fips_df = fips.to_frame(name='fips')

# Observation date as a one-column frame
date = tsm['date']
date_df = date.to_frame(name='date')

# Target array wrapped back into a one-column 'score' frame
y_df = pd.DataFrame(data=y, columns=['score'], index=tsm.index)

# Standardized feature matrix wrapped back into a frame with the feature names as columns
X_df = pd.DataFrame(data=X, columns=features, index=tsm.index)

In [7]:
# Reassemble fips, date, score, and the standardized features side by side in one step.
# A single concat avoids building two throw-away intermediate copies of the full frame.
tsm_scaled = pd.concat([fips_df, date_df, y_df, X_df], axis=1)

# Sanity check: the rebuilt frame must have the same number of rows and columns as the input.
assert tsm_scaled.shape == tsm.shape, "scaled frame shape differs from the source frame"

In [8]:
# Display the reassembled frame (rendered as a truncated preview) to eyeball the standardized values.
tsm_scaled

Unnamed: 0_level_0,fips,date,score,PRECTOT_mean,PS_mean,QV2M_mean,T2M_mean,T2MDEW_mean,T2MWET_mean,T2M_MAX_mean,...,CULTRF_LAND,CULTIR_LAND,CULT_LAND,SQ1,SQ2,SQ3,SQ4,SQ5,SQ6,SQ7
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1001,2000-01-04,1.0,0.774991,0.685097,0.399348,0.240823,0.607255,0.606663,0.224313,...,0.872910,-0.242063,0.764426,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,0.703030
1,1001,2000-01-11,2.0,0.997975,0.738647,-0.425000,-0.341722,-0.214522,-0.216637,-0.266621,...,0.872910,-0.242063,0.764426,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,0.703030
2,1001,2000-01-18,2.0,-0.594568,0.851009,-0.255744,-0.253852,-0.024554,-0.025900,-0.157899,...,0.872910,-0.242063,0.764426,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,0.703030
3,1001,2000-01-25,2.0,0.327381,0.650493,-0.820257,-0.791107,-0.755375,-0.753150,-0.766951,...,0.872910,-0.242063,0.764426,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,0.703030
4,1001,2000-02-01,1.0,0.319409,0.798907,-1.057998,-1.141773,-1.025876,-1.026474,-1.124477,...,0.872910,-0.242063,0.764426,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,0.703030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759899,56043,2016-12-06,0.0,-0.713212,-2.576446,-1.329003,-1.719301,-1.897441,-1.883959,-1.732103,...,-0.895355,-0.340677,-0.967218,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,-0.279884
2759900,56043,2016-12-13,0.0,-0.753072,-2.540132,-1.353462,-1.937756,-2.019370,-2.005570,-1.923501,...,-0.895355,-0.340677,-0.967218,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,-0.279884
2759901,56043,2016-12-20,0.0,-0.601133,-2.560920,-1.382487,-2.133972,-2.183447,-2.154560,-2.038961,...,-0.895355,-0.340677,-0.967218,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,-0.279884
2759902,56043,2016-12-27,0.0,-0.621298,-2.576709,-1.321828,-1.834563,-1.890968,-1.879118,-1.668865,...,-0.895355,-0.340677,-0.967218,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,-0.279884


In [9]:
# Print a summary of the reassembled frame: index range, column names, and dtypes.
tsm_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2759904 entries, 0 to 2759903
Data columns (total 88 columns):
 #   Column            Dtype         
---  ------            -----         
 0   fips              int64         
 1   date              datetime64[ns]
 2   score             float64       
 3   PRECTOT_mean      float64       
 4   PS_mean           float64       
 5   QV2M_mean         float64       
 6   T2M_mean          float64       
 7   T2MDEW_mean       float64       
 8   T2MWET_mean       float64       
 9   T2M_MAX_mean      float64       
 10  T2M_MIN_mean      float64       
 11  T2M_RANGE_mean    float64       
 12  TS_mean           float64       
 13  WS10M_mean        float64       
 14  WS10M_MAX_mean    float64       
 15  WS10M_MIN_mean    float64       
 16  WS10M_RANGE_mean  float64       
 17  WS50M_mean        float64       
 18  WS50M_MAX_mean    float64       
 19  WS50M_MIN_mean    float64       
 20  WS50M_RANGE_mean  float64       
 21  PRECTOT_

In [10]:
# Write the standardized frame to the data directory; the row index is saved
# under the 'index' header.
scaled_csv_path = local_data + 'train_soil_stats_scaled.csv'
tsm_scaled.to_csv(scaled_csv_path, index_label='index')