# 2 Data Preparation
## 2.1 Formatting Raw Data
### Set up environment

In [1]:
# Import packages
import numpy as np
import pandas as pd


In [2]:
# Read the raw precipitation workbook into a DataFrame.
raw_data_path = r'C:\Users\evamb\OneDrive\Documents\Github\MAPrecipData\Precipitation Database 10.19.xls'
raw_data = pd.read_excel(raw_data_path)
# raw_data.head()  # uncomment to preview the first rows


### Subdivide data into tables

By making each table separate, I can add data from other sources in a modular fashion without cluttering up every other table.

In [3]:
# Basin table: one row per distinct (Region, Basin Name) pair.
basin_columns = ['Region', 'Basin Name']
basin_data = raw_data[basin_columns]
# Drop repeated pairs, then sort by Region and, within a region, by Basin Name.
unique_basins = basin_data.drop_duplicates().sort_values(by=basin_columns)
# unique_basins  # uncomment to display the table


In [4]:
# Station table: one row per distinct (Region, Basin Name, CITY, STATION) combination.
station_columns = ['Region', 'Basin Name', 'CITY', 'STATION']
station_data = raw_data[station_columns]
# Drop repeated combinations, then sort by the STATION column.
unique_stations = station_data.drop_duplicates().sort_values(by=['STATION'])
# unique_stations  # uncomment to display the table


In [5]:
# Precipitation table: every column of the raw data except the last four.
n_trailing_columns_dropped = 4
precipitation_data = raw_data.iloc[:, :-n_trailing_columns_dropped]
# precipitation_data.head()  # uncomment to preview the first rows


### Flatten precipitation table

Tabular data can be 'flat', meaning the predictor variables are included as columns and related to each outcome value. Pulling these predictor variables out into a nested format produces an indexed table. A multi-indexed format allows the user to select and sub-select specific datapoints by their characteristics (the indices) instead of their position, but a flat format is required to build predictive models.


In [6]:
# Flatten the precipitation values (every column from position 2 onward) into
# a single list, one source row at a time: row 0's values in column order,
# then row 1's values, and so on.
#
# Row-major order is required here. The Station, Year and Month lists built in
# the following cells repeat each source row twelve times consecutively and
# cycle the month numbers 1..12 within each row. DataFrame.unstack() on a
# single-level index yields column-major order instead (all rows of the first
# column, then all rows of the second, ...), which would pair every value with
# the wrong station, year and month.
monthly_values = precipitation_data.iloc[:, 2:]
precipitation_flat = monthly_values.values.ravel(order='C').tolist()


In [7]:
# Numeric month labels (1 = Jan, 2 = Feb, ..., 12 = Dec).
# The 1..12 sequence is repeated once per row of the precipitation table and
# becomes a new column alongside the flattened precipitation values.
month_numbers = list(range(1, 13))
months_flat = len(precipitation_data) * month_numbers


In [8]:
# Repeat each row's station (column 0) and year (column 1) twelve times, once
# per month, so they line up with the flattened precipitation values.
# These lists become new columns in the flat table.
#
# Series.repeat emits each element consecutively (a, a, ..., b, b, ...), which
# matches the previous row-by-row loop but avoids rebuilding the whole list on
# every iteration (list + list inside a loop is quadratic in the row count).
table_length = len(precipitation_data)
months_per_row = 12

stations_flat = precipitation_data.iloc[:, 0].repeat(months_per_row).tolist()
years_flat = precipitation_data.iloc[:, 1].repeat(months_per_row).tolist()


In [9]:
# Pair each output column name with the flat list that fills it.
flat_column_names = ['Station', 'Year', 'Month', 'Precipitation']
flat_column_values = [stations_flat, years_flat, months_flat, precipitation_flat]
data = dict(zip(flat_column_names, flat_column_values))

# Build the fully flattened dataframe: one row per station/year/month.
# Note: this rebinds precipitation_flat from a list of values to a DataFrame.
precipitation_flat = pd.DataFrame(data)
# precipitation_flat.head()  # uncomment to preview the first rows


### Export data tables

In [11]:
# Write each table to its CSV file. The DataFrame index is not written.
export_targets = [
    (precipitation_flat, r'C:\Users\evamb\OneDrive\Documents\Github\MAPrecipData\Data_Products\precipitation_flat.csv'),
    (unique_stations, r'C:\Users\evamb\OneDrive\Documents\Github\MAPrecipData\Data_Products\unique_stations.csv'),
    (unique_basins, r'C:\Users\evamb\OneDrive\Documents\Github\MAPrecipData\Data_Products\unique_basins.csv'),
]
for table, csv_path in export_targets:
    table.to_csv(csv_path, index=False)
