# <u>Drought Prediction</u>: Preprocessing - Resample [Mean, Max, Min], Merge

In [1]:
# Import pandas and numpy
import pandas as pd
import numpy as np

#### Load the Test, Validation, and Soil Datasets.  Parse the date variable of the Test and Validation Datasets as datetime instead of object.

In [2]:
# Directory that holds the raw CSV files (Linux location). File names are
# appended by plain string concatenation, so the trailing slash matters.
# Windows location: 'D:\\Data_Science\\DroughtProject\\Data\\'
local_data = '/home/chad/Data/Drought_Prediction/'

# Soil variables table.
soil_set = pd.read_csv(local_data + 'soil_data.csv')

# The test and validation time series are read with identical options:
# the first row is the header and 'date' is parsed to datetime on load.
timeseries_kwargs = dict(parse_dates=['date'], header=0)
test_set = pd.read_csv(local_data + 'test_timeseries.csv', **timeseries_kwargs)
val_set = pd.read_csv(local_data + 'validation_timeseries.csv', **timeseries_kwargs)

#### Confirm datasets are properly loaded and contain expected datatypes.

In [None]:
# Print the column names, dtypes, non-null counts and memory usage of the soil table.
soil_set.info()

In [None]:
# Print the column summary of the test time series; 'date' should show a datetime dtype.
test_set.info()

In [None]:
# Print the column summary of the validation time series; 'date' should show a datetime dtype.
val_set.info()

In [None]:
# Preview the first rows of the soil table.
soil_set.head()

In [None]:
# Preview the first rows of the test time series.
test_set.head()

In [None]:
# Preview the first rows of the validation time series.
val_set.head()

## Combine Test & Validation Datasets

In [None]:
# Stack the validation rows on top of the test rows (row-wise concat) and
# renumber the row index from 0 so it stays unique in the combined frame.
frames_to_stack = [val_set, test_set]
testval = pd.concat(frames_to_stack, ignore_index=True)

In [None]:
# Print the column summary of the combined validation + test frame.
testval.info()

In [None]:
# Display the combined validation + test frame.
testval

### Resample Meteorological Variables to weekly variables with non-null Score values.

In [None]:
# For each county ('fips'), since the score value is set on Tuesday,
# all variables are averaged over the week leading up to Tuesday:
# previous Wednesday to Tuesday ('W-TUE' bins).
# The combined validation + test frame (testval) is resampled here; using
# test_set alone would leave the validation rows out of the weekly statistics.
testval_mean = testval.groupby('fips').resample('W-TUE', on='date').mean()
testval_mean

In [None]:
# The groupby/resample result carries a MultiIndex; display it to inspect
# its levels before flattening it back into columns.
testval_mean.index

In [None]:
# After the grouped resample 'fips' exists twice: as a MultiIndex level and
# as an ordinary column. reset_index() cannot insert a level whose name
# collides with an existing column, so the column copy is renamed first.
testval_mean.rename(columns={'fips': 'fips_copy'}, inplace=True)

# Move the MultiIndex levels back into ordinary columns.
testval_mean.reset_index(inplace=True)
testval_mean

In [None]:
# Confirm the expected column datatypes, overall size, memory usage, etc.
# of the flattened weekly-mean frame.
testval_mean.info()

In [None]:
# The last Score value is NaN. Fill it with the last valid value by carrying
# observations forward (this applies to every column that contains NaN).
# DataFrame.ffill() is used because the `method` argument of fillna() is
# deprecated in recent pandas releases.
testval_mean.ffill(inplace=True)

In [None]:
# 'fips_copy' only duplicates the 'fips' column restored by reset_index(),
# so it is removed in place.
testval_mean.drop(columns=['fips_copy'], inplace=True)

In [None]:
# Display the cleaned weekly-mean frame to confirm its structure and contents.
testval_mean

### Repeat process of resampling but use the max() value instead of mean()

In [None]:
# For each county ('fips'), since the score value is set on Tuesday,
# find the max of all variables over the week leading up to Tuesday:
# previous Wednesday to Tuesday ('W-TUE' bins).
# The source frame is the combined validation + test data (testval); no
# 'train_set' frame is loaded in this notebook. The result keeps the name
# 'train_set_max' because the merge cells further down refer to it.
train_set_max = testval.groupby('fips').resample('W-TUE', on='date').max()

# 'fips' and 'date' are both index levels and columns after max(); rename the
# column copies so reset_index() can restore the index levels as columns.
train_set_max.rename({'fips': 'fips_copy', 'date': 'date_copy'}, axis=1, inplace=True)
train_set_max.reset_index(inplace=True)
# Carry the last valid value forward over NaNs. ffill() replaces the
# deprecated fillna(method='ffill') spelling.
train_set_max.ffill(inplace=True)
train_set_max.drop(['fips_copy', 'date_copy'], axis=1, inplace=True)

train_set_max

### Repeat process of resampling but use the min() value instead of mean()

In [None]:
# TODO: create a shared function for this resample/clean-up process
# (it is repeated for mean, max and min).

# For each county ('fips'), since the score value is set on Tuesday,
# find the min of all variables over the week leading up to Tuesday:
# previous Wednesday to Tuesday ('W-TUE' bins).
# The source frame is the combined validation + test data (testval); no
# 'train_set' frame is loaded in this notebook. The result keeps the name
# 'train_set_min' because the merge cells further down refer to it.
train_set_min = testval.groupby('fips').resample('W-TUE', on='date').min()

# 'fips' and 'date' are both index levels and columns after min(); rename the
# column copies so reset_index() can restore the index levels as columns.
train_set_min.rename({'fips': 'fips_copy', 'date': 'date_copy'}, axis=1, inplace=True)
train_set_min.reset_index(inplace=True)
# Carry the last valid value forward over NaNs. ffill() replaces the
# deprecated fillna(method='ffill') spelling.
train_set_min.ffill(inplace=True)
train_set_min.drop(['fips_copy', 'date_copy'], axis=1, inplace=True)

train_set_min

### Merge Mean, Min, & Max Resample

In [None]:
# Tag every column with the statistic it holds ('_mean', '_max', '_min') so
# that the variable suffixes have a standard meaning once the tables are joined.
# The weekly means were computed into 'testval_mean'; 'train_set_mean' is
# created here because the following cells use the 'train_set_*' names.
train_set_mean = testval_mean.add_suffix('_mean')
train_set_max = train_set_max.add_suffix('_max')
train_set_min = train_set_min.add_suffix('_min')

In [None]:
# Inner join of the mean and max tables on their row index. Any column name
# present in both tables gets '_max' appended on the right-hand side.
train_stats_temp = train_set_mean.merge(
    train_set_max,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=('', '_max'),
)
train_stats_temp

In [None]:
# Inner join of the mean+max table with the min table on their row index.
# Any column name present in both tables gets '_min' appended on the right-hand side.
train_stats = train_stats_temp.merge(
    train_set_min,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=('', '_min'),
)
train_stats

In [None]:
# The date, FIPS and score values do not differ between the mean, max and min
# tables, so their '_max' and '_min' copies are duplicates and are removed.
redundant_cols = ['fips_max', 'date_max', 'score_max', 'fips_min', 'date_min', 'score_min']
train_stats.drop(columns=redundant_cols, inplace=True)

In [None]:
# Display the joined statistics table after the duplicate key columns were dropped.
train_stats

In [None]:
# Strip the '_mean' suffix from the three key columns, which are not statistics.
key_column_names = {'fips_mean': 'fips', 'date_mean': 'date', 'score_mean': 'score'}
train_stats.rename(columns=key_column_names, inplace=True)

In [None]:
# Print the column summary of the statistics table after renaming the key columns.
train_stats.info()

In [None]:
# Rearranging the columns so that fips, date, and score are the first three columns.
# The leading columns are picked by name instead of by position, so the order
# stays correct if the number of meteorological variables changes. All
# remaining columns keep their current relative order.
leading = ['fips', 'date', 'score']
cols = leading + [col for col in train_stats.columns if col not in leading]
cols

In [None]:
# Apply the column order stored in 'cols' (all rows, columns in that order).
train_stats = train_stats.loc[:, cols]
train_stats

In [None]:
# Summary statistics of the reordered statistics table.
train_stats.describe()

### Directly using Merge to correctly join on specified column

In [None]:
# Attach the soil variables to every weekly row by matching on the county
# code 'fips'; the inner join keeps only counties present in both tables.
train_soil_stats = train_stats.merge(soil_set, on='fips', how='inner')

In [None]:
# Display the merged weekly-statistics + soil table.
train_soil_stats

#### There are the same number of rows in the resampled test/validation set and the merged dataset.

In [None]:
# Print the column summary of the merged table; its row count can be compared
# with the one reported earlier by train_stats.info().
train_soil_stats.info()

### Exporting the Merged Test/Validation and Soil Dataset 

In [None]:
# Write the merged table to CSV: the resampled meteorological statistics
# (mean, max, min) plus the soil data that does not vary with time.
# The row index is written as a column labelled 'index'.
# NOTE(review): the file name still says 'train' although this notebook
# processes the test + validation data - confirm it does not overwrite the
# output of the training notebook.
output_path = local_data + 'train_soil_stats.csv'
train_soil_stats.to_csv(output_path, index_label='index')