# 2. Data Preparation
## 2.2 Supplementing station data
### Set up environment

In [1]:
# Import packages.
import numpy as np
import pandas as pd


In [2]:
# Load station data.
unique_stations = pd.read_csv(r'C:\Users\evamb\OneDrive\Documents\Github\MAPrecipData\Data_Products\unique_stations.csv')
unique_stations.head()


Unnamed: 0,Region,Basin Name,CITY,STATION
0,Connecticut River,CONNECTICUT,Amherst,AMH307
1,Connecticut River,CONNECTICUT,Amherst,AMHNWS
2,Connecticut River,DEERFIELD,Ashfield,ASFNWS
3,Central,NASHUA,Ashburnham,ASHNWS
4,Central,MILLERS,Athol,ATH404


In [3]:
# Load precipitation data.
precipitation_flat = pd.read_csv(r'C:\Users\evamb\OneDrive\Documents\Github\MAPrecipData\Data_Products\precipitation_flat.csv')
precipitation_flat.head()


Unnamed: 0,Station,Year,Month,Precipitation
0,AMH307,1997,1,
1,AMH307,1997,2,
2,AMH307,1997,3,3.04
3,AMH307,1997,4,
4,AMH307,1997,5,5.93


### Multi-index precipitation data

Utilizing an indexed dataframe will allow me to select datapoints by thier characteristics (e.g. a specific station at a specific time) while a flat dataframe will allow me to access data postionally (e.g., the 462nd row). 

<font size="1.2"> Technical note: I will be creating a dataframe with a multi-level index (class: pandas.core.frame.DataFrame), not a multi-index object (class: pandas.core.indexes.multi.MultiIndex). I chose this because converting to a MultiIndex object changed all NaN values to -1 and dataframes display much more nicely and easily than MultiIndex objects. To display the precip_multi table as a MultiIndex object, use the precip_multi.index command. </font>


In [4]:
# Multi-index the dataframe.
precipitation_multi = precipitation_flat.set_index(['Station', 'Year', 'Month'])
#precipitation_multi.head()


In [5]:
# Test if multi-level indexing works.

# precipitation_multi.loc[('AMH307', 'Precipitation')] # Select all records from station AMH307.
# precipitation_multi.loc[('AMH307', 2000), 'Precipitation'] # Select all records from station AMH307 in 1997.
# precipitation_multi.loc[('AMH307', 1997, 3), 'Precipitation'] # Select January record from station AMH307 in 1997.
# test = len(precipitation_multi.loc[('AMH307',)])/12 # Count years of data for station AMH307.
# print(f'Station AMH307 has {int(test)} years of data.') 


### Calculate station record completeness

I extract the first and last year of data collection from each stations record, as well as how many months were sampled. After this data is added to the station's record, I calculate the percent of months where no data was reported (the record's coverage). 

In [6]:
station_list = unique_stations['STATION']
n_stations = len(station_list)

first_year_reporting = []
last_year_reporting = []
n_months_total = []

for station in range(0, n_stations):
    current_station = station_list.iloc[station] # String
    
    n_months = len(precipitation_multi.loc[(current_station, ) ]) # Integer
    n_months_total.append(n_months)
    
    stations_earliest_year = precipitation_flat[precipitation_flat['Station'] == current_station].min()
    stations_earliest_year = stations_earliest_year.iloc[1]
    first_year_reporting.append(stations_earliest_year)
    
    stations_latest_year = precipitation_flat[precipitation_flat['Station'] == current_station].max()
    stations_latest_year = stations_latest_year.iloc[1]
    last_year_reporting.append(stations_latest_year)

unique_stations['First_year_collected'] = first_year_reporting
unique_stations['Last_year_collected'] = last_year_reporting
unique_stations['Months_sampled'] = n_months_total

unique_stations.head()

Unnamed: 0,Region,Basin Name,CITY,STATION,First_year_collected,Last_year_collected,Months_sampled
0,Connecticut River,CONNECTICUT,Amherst,AMH307,1997,2019,264
1,Connecticut River,CONNECTICUT,Amherst,AMHNWS,1838,2015,2136
2,Connecticut River,DEERFIELD,Ashfield,ASFNWS,1992,2015,288
3,Central,NASHUA,Ashburnham,ASHNWS,1942,2015,888
4,Central,MILLERS,Athol,ATH404,1912,2019,1296


In [7]:
# Count how many months reported were no-data-collected months.

n_months_NaN = []

for station in unique_stations['STATION']:
    n_nans = 0
    station_data = precipitation_flat[precipitation_flat['Station'] == station]
    for value in station_data['Precipitation']:
        if np.isnan(value):
            n_nans = n_nans + 1
    n_months_NaN.append(n_nans)

unique_stations['Months_not_reported'] = n_months_NaN


In [8]:
# Calculate percent of months reporting non-NaN data and add as column. 
percent_coverage = [1 - (n_months_NaN / n_months_total) for n_months_NaN, n_months_total in zip(n_months_NaN, n_months_total)]
unique_stations['Percent_months_reported'] = percent_coverage


In [9]:
unique_stations.head()

Unnamed: 0,Region,Basin Name,CITY,STATION,First_year_collected,Last_year_collected,Months_sampled,Months_not_reported,Percent_months_reported
0,Connecticut River,CONNECTICUT,Amherst,AMH307,1997,2019,264,24,0.909091
1,Connecticut River,CONNECTICUT,Amherst,AMHNWS,1838,2015,2136,200,0.906367
2,Connecticut River,DEERFIELD,Ashfield,ASFNWS,1992,2015,288,5,0.982639
3,Central,NASHUA,Ashburnham,ASHNWS,1942,2015,888,59,0.933559
4,Central,MILLERS,Athol,ATH404,1912,2019,1296,120,0.907407


### Concatenate station coordinates

I'll use coordinate data from a related station characteristics dataset to calculate and append the decimal degree latitude and longitude of the weather stations.

In [11]:
# Load supplemental station dataset.
supplemental_data = pd.read_excel(r'C:\Users\evamb\OneDrive\Documents\Github\MAPrecipData\Precip Station_lat long_110422.xlsx')
supplemental_data.head()


Unnamed: 0,TABLE_NAME,GAGE_NAME,GAGE_TYPE,EXPOSURE,LAT_DEG,LAT_MIN,LAT_SEC,LON_DEG,LON_MIN,LON_SEC,Region,Basin Name,HCDSYR
0,AMH307,Amherst,,Very good,42,18.0,23.5,72,31.0,55.6,Connecticut River,CONNECTICUT,2000.0
1,AMHNWS,Amherst,"Standard 8""",excellent,42,23.0,7.0,72,32.0,16.3,Connecticut River,CONNECTICUT,2002.0
2,ASHNWS,Ashburnham,Standard,excellent,42,37.0,4.6,71,54.0,58.8,Central,NASHUA,1995.0
3,ATH404,Athol,4 inch plastic,excellent,42,35.0,9.9,72,14.0,35.2,Central,MILLERS,1966.0
4,ATT801,Attleboro,"4"" plastic",excellent,41,55.0,41.6,71,20.0,12.9,Southeast,TAUNTON,1966.0


In [12]:
# Set station name as index.
supplemental_data = supplemental_data.set_index(['TABLE_NAME'])


In [13]:
# Convert to decimal degrees.
latitude_dd = supplemental_data['LAT_DEG'] + (supplemental_data['LAT_MIN'] / 60) + (supplemental_data['LAT_SEC'] / 3600)
longitude_dd = supplemental_data['LON_DEG'] + (supplemental_data['LON_MIN'] / 60) + (supplemental_data['LON_SEC'] / 3600)

# Add the converted data to the station characteristics dataframe.
supplemental_data['latitude_dd'] = latitude_dd
supplemental_data['longitude_dd'] = longitude_dd
supplemental_data.head(10)


Unnamed: 0_level_0,GAGE_NAME,GAGE_TYPE,EXPOSURE,LAT_DEG,LAT_MIN,LAT_SEC,LON_DEG,LON_MIN,LON_SEC,Region,Basin Name,HCDSYR,latitude_dd,longitude_dd
TABLE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AMH307,Amherst,,Very good,42,18.0,23.5,72,31.0,55.6,Connecticut River,CONNECTICUT,2000.0,42.306528,72.532111
AMHNWS,Amherst,"Standard 8""",excellent,42,23.0,7.0,72,32.0,16.3,Connecticut River,CONNECTICUT,2002.0,42.385278,72.537861
ASHNWS,Ashburnham,Standard,excellent,42,37.0,4.6,71,54.0,58.8,Central,NASHUA,1995.0,42.617944,71.916333
ATH404,Athol,4 inch plastic,excellent,42,35.0,9.9,72,14.0,35.2,Central,MILLERS,1966.0,42.586083,72.243111
ATT801,Attleboro,"4"" plastic",excellent,41,55.0,41.6,71,20.0,12.9,Southeast,TAUNTON,1966.0,41.928222,71.336917
b,Westville Lake,Sutran,unknown,42,4.0,58.5,72,3.0,34.3,Central,QUINEBAUG,,42.082917,72.059528
BAR916,Barnstable,Standard,excellent,41,38.0,8.5,70,23.0,26.9,Cape Cod and Islands,CAPE COD,1993.0,41.635694,70.390806
BARCOE,Barre Falls Dam,"Standard 8""",excellent,42,26.0,24.8,72,1.0,31.9,Central,CHICOPEE,1993.0,42.440222,72.025528
BEL314,Belchertown,"Standard 8""",excellent,42,16.0,49.5,72,20.0,57.3,Connecticut River,CHICOPEE,1966.0,42.280417,72.34925
BEL736,Bellingham,Standard,fair,42,4.0,45.0,71,27.0,53.3,Southeast,BLACKSTONE,1966.0,42.079167,71.464806


In [14]:
# Create two columns to hold the latitude and longitude data.
# Not every stations will have a location so the coordinates are set to NaN as a baseline.

unique_stations['Latitude'] = np.nan
unique_stations['Longitude'] = np.nan
unique_stations.head()

Unnamed: 0,Region,Basin Name,CITY,STATION,First_year_collected,Last_year_collected,Months_sampled,Months_not_reported,Percent_months_reported,Latitude,Longitude
0,Connecticut River,CONNECTICUT,Amherst,AMH307,1997,2019,264,24,0.909091,,
1,Connecticut River,CONNECTICUT,Amherst,AMHNWS,1838,2015,2136,200,0.906367,,
2,Connecticut River,DEERFIELD,Ashfield,ASFNWS,1992,2015,288,5,0.982639,,
3,Central,NASHUA,Ashburnham,ASHNWS,1942,2015,888,59,0.933559,,
4,Central,MILLERS,Athol,ATH404,1912,2019,1296,120,0.907407,,


In [15]:
# Add station coordinates to the station dataset if available in the supplementa data.
for row in range(0,len(unique_stations['STATION'])):
    current_station = unique_stations['STATION'].iloc[row] # name
    if (current_station in supplemental_data.index) == True:
        lat = supplemental_data['latitude_dd'].loc[current_station]
        long = supplemental_data['longitude_dd'].loc[current_station]
    unique_stations.iloc[(row, 9)] = lat # 
    unique_stations.iloc[(row, 10)] = long
unique_stations.head()


Unnamed: 0,Region,Basin Name,CITY,STATION,First_year_collected,Last_year_collected,Months_sampled,Months_not_reported,Percent_months_reported,Latitude,Longitude
0,Connecticut River,CONNECTICUT,Amherst,AMH307,1997,2019,264,24,0.909091,42.306528,72.532111
1,Connecticut River,CONNECTICUT,Amherst,AMHNWS,1838,2015,2136,200,0.906367,42.385278,72.537861
2,Connecticut River,DEERFIELD,Ashfield,ASFNWS,1992,2015,288,5,0.982639,42.385278,72.537861
3,Central,NASHUA,Ashburnham,ASHNWS,1942,2015,888,59,0.933559,42.617944,71.916333
4,Central,MILLERS,Athol,ATH404,1912,2019,1296,120,0.907407,42.586083,72.243111


In [16]:
# Export data. Indices not preserved.
unique_stations.to_csv(r'C:\Users\evamb\OneDrive\Documents\Github\MAPrecipData\Data_Products\unique_stations.csv', index=False)
