#Group3_Data_Operations

In [None]:
# %pip install geoalchemy2

In [None]:
#@title Import relevant modules
import os
import sys
import pandas as pd
import geopandas as gpd
import sqlalchemy as sq
from matplotlib import pyplot as plt
from dotenv import load_dotenv

In [None]:
# Notebook-wide pandas display settings: show up to 100 rows, render floats
# with one decimal place, and never truncate the list of columns.
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', "{:.1f}".format)
pd.set_option('display.max_columns', None)


In [None]:

# Load variables from a .env file into the environment, then read the Postgres
# credentials from it. Each value is None when its variable is not set.
load_dotenv()
PGUSER, PGPW, PGDB = (
    os.environ.get(envName)
    for envName in ('POSTGRES_USER', 'POSTGRES_PW', 'POSTGRES_DB')
)

In [None]:
# Path to the agricultural-regions dataset (the .dbf file under
# data/2006CensusAgRegions).
regions = "data/2006CensusAgRegions/gcar000b07a_e.dbf"
# Load it into a GeoDataFrame, reading attribute text as UTF-8.
gdfRegions = gpd.read_file(regions, encoding='utf-8')

In [None]:
# Climate station list; later cells read its Longitude, Latitude, Province and
# 'HLY Last Year' columns.
stations = "data/climate_station_list.csv"
dfStations = pd.read_csv(stations)

In [None]:

# Build one point per station from its longitude/latitude, then keep only the
# stations whose 'HLY Last Year' is later than 1995.
hlyPoints = gpd.points_from_xy(dfStations.Longitude, dfStations.Latitude)
hlyRecentMask = dfStations['HLY Last Year'] > 1995
gdfStationsHly = gpd.GeoDataFrame(dfStations, geometry=hlyPoints).loc[hlyRecentMask]

In [None]:
# Build one point per station from its longitude/latitude, then keep only the
# stations with recent *daily* records.
gdfStationsDly = gpd.GeoDataFrame(dfStations, geometry=gpd.points_from_xy(dfStations.Longitude, dfStations.Latitude))
# Fix: this frame was previously filtered on 'HLY Last Year', which made it an
# exact duplicate of gdfStationsHly. The daily frame is filtered on the daily
# column instead.
# NOTE(review): assumes the CSV names this column 'DLY Last Year', by analogy
# with 'HLY Last Year' — confirm against data/climate_station_list.csv.
gdfStationsDly = gdfStationsDly.loc[dfStations['DLY Last Year'] > 1995]

In [None]:
# Put all three layers in EPSG:3347. The station points were built from raw
# longitude/latitude values, so they are first declared as EPSG:4326 and then
# reprojected.
gdfRegions = gdfRegions.to_crs("EPSG:3347")
gdfStationsHly = gdfStationsHly.set_crs("EPSG:4326", allow_override=True).to_crs("EPSG:3347")
gdfStationsDly = gdfStationsDly.set_crs("EPSG:4326", allow_override=True).to_crs("EPSG:3347")

In [None]:
# verification


In [None]:
# Overlay the gdfStationsHly points on every region polygon.
regionBounds = gdfRegions.total_bounds
minx, miny, maxx, maxy = regionBounds
fig, ax = plt.subplots(figsize=(20, 20))
# The upper y-limit is maxy / 1.1 rather than the full extent of the regions.
ax.set_ylim(bottom=miny, top=maxy / 1.1)
gdfRegions.plot(ax=ax, cmap='Pastel1', edgecolor='black')
gdfStationsHly.plot(ax=ax, color='red', markersize=0.4)
plt.show()

In [None]:
# Overlay the gdfStationsDly points on every region polygon.
regionBounds = gdfRegions.total_bounds
minx, miny, maxx, maxy = regionBounds
fig, ax = plt.subplots(figsize=(20, 20))
# The upper y-limit is maxy / 1.1 rather than the full extent of the regions.
ax.set_ylim(bottom=miny, top=maxy / 1.1)
gdfRegions.plot(ax=ax, cmap='Pastel1', edgecolor='black')
gdfStationsDly.plot(ax=ax, color='red', markersize=0.4)
plt.show()

In [None]:
# Preview the first rows of the regions frame.
gdfRegions.head()

In [None]:
# Work on an independent copy of the regions so gdfRegions stays untouched.
gdfPruned = gdfRegions.copy(deep=True)
# Cast the province id to int, then keep only the AB / MB / SK regions
# (PRuid 46, 47 and 48).
gdfPruned['PRuid'] = gdfPruned['PRuid'].astype(int)
isPrairieRegion = gdfPruned['PRuid'].isin([46, 47, 48])
gdfPruned = gdfPruned.loc[isPrairieRegion]


In [None]:
# gdfPruned is a copy of gdfRegions, which an earlier cell already reprojected
# to EPSG:3347. This cell used to call
# gdfPruned.set_crs("EPSG:4326", allow_override=True) and discard the result;
# set_crs is not in-place by default, so the call had no effect. It is removed
# because applying it would relabel EPSG:3347 coordinates as EPSG:4326.
gdfProjected = gdfPruned.to_crs("EPSG:3347")

In [None]:
# Draw the gdfStationsHly points over the pruned regions.
fig, ax = plt.subplots(figsize=(20, 20))
# Axis limits are scaled from the gdfRegions bounds computed in an earlier cell
# (presumably hand-tuned to frame the kept regions).
ax.set_ylim(bottom=miny * 2, top=maxy / 1.2)
ax.set_xlim(left=minx * 1.15, right=maxx / 1.4)
gdfProjected.plot(ax=ax, cmap='Pastel1', edgecolor='black')
gdfStationsHly.plot(ax=ax, color='red', markersize=1)

In [None]:
# Draw the gdfStationsDly points over the pruned regions.
fig, ax = plt.subplots(figsize=(20, 20))
# Axis limits are scaled from the gdfRegions bounds computed in an earlier cell
# (presumably hand-tuned to frame the kept regions).
ax.set_ylim(bottom=miny * 2, top=maxy / 1.2)
ax.set_xlim(left=minx * 1.15, right=maxx / 1.4)
gdfProjected.plot(ax=ax, cmap='Pastel1', edgecolor='black')
gdfStationsDly.plot(ax=ax, color='red', markersize=1)

In [None]:
# Number of non-null values in each column of gdfStationsDly.
gdfStationsDly.count()

In [None]:
# Renumber the rows 0..n-1. The previous labels are kept in a new 'index'
# column, which a later cell drops again.
gdfProjected = gdfProjected.reset_index()

In [None]:
# Display the renumbered frame; the row labels shown here are the ones the
# following cells use to assign colors and CR numbers.
gdfProjected

In [None]:
# Default fill for every region: mid-gray, given as a hex color string.
gdfProjected['color'] = "#808080"

In [None]:
# Every region starts with CRnum 0; the cells below overwrite it row by row.
gdfProjected['CRnum'] = 0

In [None]:
# Palette of 11 hex color strings; the cells below pick one entry per CR.
colors = [
    "#d3d3d3",
    "#556b2f",
    "#008080",
    "#6495ed",
    "#00ff7f",
    "#db7093",
    "#f0e68c",
    "#ffa07a",
    "#ee82ee",
    "#1e90ff",
    "#eee8aa",
]

In [None]:
# Rows 0, 1, 2 and 7 form CR 3 and are drawn with colors[0].
for regionRow in (0, 1, 2, 7):
    gdfProjected.at[regionRow, 'color'] = colors[0]
    gdfProjected.at[regionRow, 'CRnum'] = 3



In [None]:

# Rows 3, 4 and 5 form CR 2 and are drawn with colors[1].
for regionRow in (3, 4, 5):
    gdfProjected.at[regionRow, 'color'] = colors[1]
    gdfProjected.at[regionRow, 'CRnum'] = 2

In [None]:

# Rows 6, 8, 9 and 10 form CR 1 and are drawn with colors[2].
for regionRow in (6, 8, 9, 10):
    gdfProjected.at[regionRow, 'color'] = colors[2]
    gdfProjected.at[regionRow, 'CRnum'] = 1

In [None]:

# Row 11 is drawn white. No cell in this notebook gives it a CRnum, so it keeps
# the initial value of 0.
gdfProjected.at[11, 'color'] = "#FFFFFF"

In [None]:

# Rows 12, 13 and 14 form CR 8 and are drawn with colors[3].
for regionRow in (12, 13, 14):
    gdfProjected.at[regionRow, 'color'] = colors[3]
    gdfProjected.at[regionRow, 'CRnum'] = 8

In [None]:

# Rows 15, 16 and 17 form CR 9 and are drawn with colors[4].
for regionRow in (15, 16, 17):
    gdfProjected.at[regionRow, 'color'] = colors[4]
    gdfProjected.at[regionRow, 'CRnum'] = 9

In [None]:

# Rows 18 and 19 form CR 10 and are drawn with colors[10].
for regionRow in (18, 19):
    gdfProjected.at[regionRow, 'color'] = colors[10]
    gdfProjected.at[regionRow, 'CRnum'] = 10

In [None]:

# Rows 20-23 and 25 form CR 4 and are drawn with colors[5].
for regionRow in (20, 21, 22, 23, 25):
    gdfProjected.at[regionRow, 'color'] = colors[5]
    gdfProjected.at[regionRow, 'CRnum'] = 4

In [None]:

# Rows 24 and 26-29 form CR 5 and are drawn with colors[7].
for regionRow in (24, 26, 27, 28, 29):
    gdfProjected.at[regionRow, 'color'] = colors[7]
    gdfProjected.at[regionRow, 'CRnum'] = 5

In [None]:

# Rows 30-32, 36 and 37 form CR 6 and are drawn with colors[6].
for regionRow in (30, 31, 32, 36, 37):
    gdfProjected.at[regionRow, 'color'] = colors[6]
    gdfProjected.at[regionRow, 'CRnum'] = 6

In [None]:

# Rows 33-35, 38 and 39 form CR 7 and are drawn with colors[8].
for regionRow in (33, 34, 35, 38, 39):
    gdfProjected.at[regionRow, 'color'] = colors[8]
    gdfProjected.at[regionRow, 'CRnum'] = 7

In [None]:
# Draw each region filled with the value stored in its 'color' column.
fig, ax = plt.subplots(figsize=(20, 20))
ax.set_ylim(bottom=miny * 2, top=maxy / 1.2)
ax.set_xlim(left=minx * 1.15, right=maxx / 1.4)
regionFill = gdfProjected['color']
gdfProjected.plot(ax=ax, color=regionFill, edgecolor='black')

In [None]:
# now the regions and stations are in the same projection
# the regions dataframe has a column called 'color' that has the color for each region and the number of the CR
# we will now add a column to the stations list for the CR number

In [None]:
# Keep only the stations in ALBERTA, MANITOBA and SASKATCHEWAN. The .copy()
# makes the result an independent frame, so the column assignment at the end of
# this cell does not write into a slice of the previous GeoDataFrame.
gdfStationsDly = gdfStationsDly[gdfStationsDly['Province'].isin(['ALBERTA', 'MANITOBA', 'SASKATCHEWAN'])].copy()

# Find the region polygon containing each station with a single spatial join.
# This replaces the nested iterrows() loops, which tested every station against
# every region one pair at a time.
# (`predicate=` requires geopandas >= 0.10; older releases call it `op=`.)
dlyRegionHits = gpd.sjoin(
    gdfStationsDly[['geometry']],
    gdfProjected[['CRnum', 'geometry']],
    how='inner',
    predicate='within',
)

# If a station lies within more than one polygon, keep the match from the
# highest region row label. That is the value the previous loop ended up
# storing, because it visited the regions in row order and the last hit won.
dlyRegionHits = dlyRegionHits.sort_values('index_right')
dlyRegionHits = dlyRegionHits[~dlyRegionHits.index.duplicated(keep='last')]

# Stations outside every region get CRnum 0. The column is appended as the last
# column, which the positional renaming further down relies on.
gdfStationsDly['CRnum'] = (
    dlyRegionHits['CRnum'].reindex(gdfStationsDly.index, fill_value=0).astype('int64')
)

In [None]:
# Keep only the stations in ALBERTA, MANITOBA and SASKATCHEWAN. The .copy()
# makes the result an independent frame, so the column assignment at the end of
# this cell does not write into a slice of the previous GeoDataFrame.
gdfStationsHly = gdfStationsHly[gdfStationsHly['Province'].isin(['ALBERTA', 'MANITOBA', 'SASKATCHEWAN'])].copy()

# Find the region polygon containing each station with a single spatial join.
# This replaces the nested iterrows() loops, which tested every station against
# every region one pair at a time.
# (`predicate=` requires geopandas >= 0.10; older releases call it `op=`.)
hlyRegionHits = gpd.sjoin(
    gdfStationsHly[['geometry']],
    gdfProjected[['CRnum', 'geometry']],
    how='inner',
    predicate='within',
)

# If a station lies within more than one polygon, keep the match from the
# highest region row label. That is the value the previous loop ended up
# storing, because it visited the regions in row order and the last hit won.
hlyRegionHits = hlyRegionHits.sort_values('index_right')
hlyRegionHits = hlyRegionHits[~hlyRegionHits.index.duplicated(keep='last')]

# Stations outside every region get CRnum 0. The column is appended as the last
# column, which the positional renaming further down relies on.
gdfStationsHly['CRnum'] = (
    hlyRegionHits['CRnum'].reindex(gdfStationsHly.index, fill_value=0).astype('int64')
)

In [None]:
# Preview the first station rows, including the newly added CRnum column.
gdfStationsDly.head()

In [None]:
# Show the distinct CRnum values assigned to the regions.
gdfProjected['CRnum'].unique()

In [None]:
# Show the regions still at the initial CRnum of 0, i.e. rows that were not
# given a CR number above.
gdfProjected[gdfProjected['CRnum'] == 0]

In [None]:
# Drop the 'index' column left behind by the earlier reset_index call, and give
# all three frames a fresh 0..n-1 row index.
gdfProjected = gdfProjected.drop(columns='index').reset_index(drop=True)
gdfStationsDly, gdfStationsHly = (
    gdfStationsDly.reset_index(drop=True),
    gdfStationsHly.reset_index(drop=True),
)

In [None]:
def pushGdfToPostGresDB(tableName: str, geodf: gpd.GeoDataFrame, db: str, user: str, pw: str) -> None:
    """Write a GeoDataFrame to a PostGIS table, replacing any existing table.

    Args:
        tableName: Name of the destination table.
        geodf: Frame to write; its index is not written.
        db: Database name, passed to ``DataService``.
        user: Database user, passed to ``DataService``.
        pw: Database password, passed to ``DataService``.

    Raises:
        Whatever ``DataService`` or ``GeoDataFrame.to_postgis`` raises. The
        service's ``cleanup()`` still runs if the write fails.
    """
    # DataService is imported from the parent directory. Add that directory to
    # the import path only once, so repeated calls do not keep growing sys.path.
    if '../' not in sys.path:
        sys.path.append('../')
    from DataService import DataService

    db_service = DataService(db, user, pw)
    db_con = db_service.connect()
    try:
        geodf.to_postgis(tableName, db_con, index=False, if_exists='replace')
    finally:
        # Release the connection even when the write raises.
        db_service.cleanup()

In [None]:

# Give the first four columns of gdfProjected their database names, matched by
# position. One rename call with a complete mapping replaces four sequential
# in-place renames, so a newly assigned name can never be picked up and renamed
# again by a later step.
regionColumnNames = ["car_uid", "car_name", "pr_uid", "ag_uid"]
gdfProjected.rename(
    columns=dict(zip(gdfProjected.columns, regionColumnNames)),
    inplace=True,
)


# Database column names for gdfStationsDly, matched to the existing columns by
# position. One rename call with a complete mapping replaces 18 sequential
# in-place renames, so a newly assigned name can never be picked up and renamed
# again by a later step.
# NOTE(review): assumes the station CSV has 16 columns in this order, so that
# the geometry column sits at position 16 and CRnum at 17 — verify against the
# file.
dlyColumnNames = [
    "station_name",
    "province",
    "latitude",
    "longitude",
    "elevation",
    "station_id",
    "wmo_identifier",
    "tc_identifier",
    "first_year",
    "last_year",
    "hly_first_year",
    "hly_last_year",
    "dly_first_year",
    "dly_last_year",
    "mly_first_year",
    "mly_last_year",
    "geometry",
    "cr_num",
]
gdfStationsDly.rename(
    columns=dict(zip(gdfStationsDly.columns, dlyColumnNames)),
    inplace=True,
)

# Store provinces as two-letter codes; any other value is left unchanged.
gdfStationsDly['province'] = gdfStationsDly['province'].replace(
    {'ALBERTA': 'AB', 'MANITOBA': 'MB', 'SASKATCHEWAN': 'SK'}
)

# Database column names for gdfStationsHly, matched to the existing columns by
# position. One rename call with a complete mapping replaces 18 sequential
# in-place renames, so a newly assigned name can never be picked up and renamed
# again by a later step.
# NOTE(review): assumes the station CSV has 16 columns in this order, so that
# the geometry column sits at position 16 and CRnum at 17 — verify against the
# file.
hlyColumnNames = [
    "station_name",
    "province",
    "latitude",
    "longitude",
    "elevation",
    "station_id",
    "wmo_identifier",
    "tc_identifier",
    "first_year",
    "last_year",
    "hly_first_year",
    "hly_last_year",
    "dly_first_year",
    "dly_last_year",
    "mly_first_year",
    "mly_last_year",
    "geometry",
    "cr_num",
]
gdfStationsHly.rename(
    columns=dict(zip(gdfStationsHly.columns, hlyColumnNames)),
    inplace=True,
)

# Store provinces as two-letter codes; any other value is left unchanged.
gdfStationsHly['province'] = gdfStationsHly['province'].replace(
    {'ALBERTA': 'AB', 'MANITOBA': 'MB', 'SASKATCHEWAN': 'SK'}
)

# Push the three frames to Postgres, one table each. Every call replaces the
# table of that name (see pushGdfToPostGresDB).
tablesToPush = (
    ('census_ag_regions', gdfProjected),
    ('stations_dly', gdfStationsDly),
    ('stations_hly', gdfStationsHly),
)
for tblName, tableFrame in tablesToPush:
    pushGdfToPostGresDB(tblName, tableFrame, PGDB, PGUSER, PGPW)
