# Drought Prediction: Modeling

#### Load Libraries and dataset (non-standardized).

In [1]:
#Import pandas, numpy, and StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression

In [2]:
# Local location of the data
# local_data = 'D:\\Data_Science\\DroughtProject\\Data\\' # Location on Windows
local_data = '/home/chad/Data/Drought_Prediction/' # Location on Linux

# Load the dataset that contains training (meteorological variables) resampled weekly with mean, max, min
# and the soil variables that have been merged on the county 'fips' value
tsm = pd.read_csv(local_data + 'train_soil_stats_scaled.csv',
                        parse_dates=['date'],
                        index_col=['index'],
                        header=0)

In [3]:
# Confirm dataset loaded properly.
tsm

Unnamed: 0_level_0,fips,date,score,PRECTOT_mean,PS_mean,QV2M_mean,T2M_mean,T2MDEW_mean,T2MWET_mean,T2M_MAX_mean,...,CULTRF_LAND,CULTIR_LAND,CULT_LAND,SQ1,SQ2,SQ3,SQ4,SQ5,SQ6,SQ7
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1001,2000-01-04,1.0,0.774991,0.685097,0.399348,0.240823,0.607255,0.606663,0.224313,...,0.872910,-0.242063,0.764426,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,0.703030
1,1001,2000-01-11,2.0,0.997975,0.738647,-0.425000,-0.341722,-0.214522,-0.216637,-0.266621,...,0.872910,-0.242063,0.764426,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,0.703030
2,1001,2000-01-18,2.0,-0.594568,0.851009,-0.255744,-0.253852,-0.024554,-0.025900,-0.157899,...,0.872910,-0.242063,0.764426,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,0.703030
3,1001,2000-01-25,2.0,0.327381,0.650493,-0.820257,-0.791107,-0.755375,-0.753150,-0.766951,...,0.872910,-0.242063,0.764426,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,0.703030
4,1001,2000-02-01,1.0,0.319409,0.798907,-1.057998,-1.141773,-1.025876,-1.026474,-1.124477,...,0.872910,-0.242063,0.764426,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,0.703030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759899,56043,2016-12-06,0.0,-0.713212,-2.576446,-1.329003,-1.719301,-1.897441,-1.883959,-1.732103,...,-0.895355,-0.340677,-0.967218,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,-0.279884
2759900,56043,2016-12-13,0.0,-0.753072,-2.540132,-1.353462,-1.937756,-2.019370,-2.005570,-1.923501,...,-0.895355,-0.340677,-0.967218,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,-0.279884
2759901,56043,2016-12-20,0.0,-0.601133,-2.560920,-1.382487,-2.133972,-2.183447,-2.154560,-2.038961,...,-0.895355,-0.340677,-0.967218,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,-0.279884
2759902,56043,2016-12-27,0.0,-0.621298,-2.576709,-1.321828,-1.834563,-1.890968,-1.879118,-1.668865,...,-0.895355,-0.340677,-0.967218,-0.696872,-0.542166,-0.281171,-0.347023,-0.159427,-0.14777,-0.279884


In [4]:
# Confirming expected datatypes.
tsm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2759904 entries, 0 to 2759903
Data columns (total 88 columns):
 #   Column            Dtype         
---  ------            -----         
 0   fips              int64         
 1   date              datetime64[ns]
 2   score             float64       
 3   PRECTOT_mean      float64       
 4   PS_mean           float64       
 5   QV2M_mean         float64       
 6   T2M_mean          float64       
 7   T2MDEW_mean       float64       
 8   T2MWET_mean       float64       
 9   T2M_MAX_mean      float64       
 10  T2M_MIN_mean      float64       
 11  T2M_RANGE_mean    float64       
 12  TS_mean           float64       
 13  WS10M_mean        float64       
 14  WS10M_MAX_mean    float64       
 15  WS10M_MIN_mean    float64       
 16  WS10M_RANGE_mean  float64       
 17  WS50M_mean        float64       
 18  WS50M_MAX_mean    float64       
 19  WS50M_MIN_mean    float64       
 20  WS50M_RANGE_mean  float64       
 21  PRECTOT_

### Create Pipeline

In [4]:
# Breaking out independent numerical variables from target variable, categorical variable ('fips'), and date.
cols = tsm.columns.tolist()
features = cols[3:]

# Separating out the features
X = tsm.loc[:, features].values

# Separating out the target
y = tsm.loc[:,['score']].values

# Split dataset into preliminary training set and initial test set for model selection.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    random_state=42)

In [5]:
type(X_train)

numpy.ndarray

In [6]:
# Create pipeline for Linear Regression
# lr = LinearRegression()
pipe = make_pipeline(
    SimpleImputer(strategy='median'), 
    StandardScaler(),
    SelectKBest(f_regression),
    LinearRegression()
)

In [7]:
pipe.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('standardscaler', StandardScaler()),
                ('selectkbest',
                 SelectKBest(score_func=<function f_regression at 0x7fe60ddc2670>)),
                ('linearregression', LinearRegression())])

In [8]:
y_tr_pred = pipe.predict(X_train)
y_te_pred = pipe.predict(X_test)

In [9]:
r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)

(0.11659248987548099, 0.11685947402155683)

In [10]:
mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)

(0.8865398306786988, 0.8857586142381453)

### Create Cross-validation

In [11]:
# Calculate the mean of `y_train`
# lr_default_cv_results = cross_validate(lr, X, y, cv=5, n_jobs=-1)
cv_results = cross_validate(pipe, X_train, y_train, cv=5)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [12]:
# Fit the dummy regressor on the training data
# This is an absolute baseline for what is the guess for drought score without any model or variable input.
cv_scores = cv_results['test_score']
cv_scores

array([0.11591491, 0.11763351, 0.1153496 , 0.11655005, 0.11745928])

In [13]:
np.mean(cv_scores), np.std(cv_scores)

(0.11658146771662894, 0.0008763711784679433)

In [14]:
pipe25 = make_pipeline(
    SimpleImputer(strategy='median'), 
    StandardScaler(),
    SelectKBest(f_regression, k=25),
    LinearRegression()
)

In [15]:
pipe.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('standardscaler', StandardScaler()),
                ('selectkbest',
                 SelectKBest(score_func=<function f_regression at 0x7fe60ddc2670>)),
                ('linearregression', LinearRegression())])

###  Assess Model with Metrics