In [None]:
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import dates as mdates
import os

In [None]:
# Data folders, expressed relative to the current working directory:
# one level up, then data/processed and data/unprocessed.
processed_folder_path = os.path.join(os.pardir, "data", "processed")
unprocessed_folder_path = os.path.join(os.pardir, "data", "unprocessed")

In [None]:
def pandas_eda(df):
    """Print a quick exploratory summary of a dataframe.

    Prints three sections, each followed by a line of asterisks: the first
    five rows, the ``DataFrame.info()`` report, and the number of missing
    values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to summarise.

    Returns
    -------
    None
    """
    separator = "*" * 100

    print("First 5 rows")
    print(df.head())
    print(separator)

    print("Dataframe information")
    # info() writes its report itself and returns None; wrapping it in
    # print() would add a stray "None" line to the output.
    df.info()
    print(separator)

    print("Missing values")
    print(df.isnull().sum())
    print(separator)

In [None]:
# Convert the raw tab-separated NDSI/NDVI files into formatted CSVs
data_folder = "Data_NDSI_NDVI"
data_files = ["NDSI.txt", "NDVI.txt"]

# Column names applied to both raw files when reading them.
# NOTE(review): the NDVI file is read with the same NDSI-named columns
# (e.g. "SurfNDSImax") -- confirm that this is intended.
ndsi_ndvi_columns = ["Watershed", "Subsubwatershed", "Product", "Date",
                     "Areaini", "Areareproj", "SurfNDSImax", "SurfNDSImin",
                     "SurfNDSIavg", "NDSImax", "NDSImin", "NDSIavg",
                     "Surfcloudmax", "Surfcloudmin", "Surfcloudavg",
                     "Surfbadpixmax", "Surfbadpixmin", "Surfbadpixavg"]

# Make sure the output folder exists so that to_csv does not fail on a
# fresh checkout where ../data/processed has not been created yet.
os.makedirs(processed_folder_path, exist_ok=True)

for data_file in data_files:
    df = pd.read_csv(os.path.join(unprocessed_folder_path, data_folder, data_file),
                     delimiter="\t", index_col=False,
                     names=ndsi_ndvi_columns)
    df["Date"] = pd.to_datetime(df["Date"])
    pandas_eda(df)

    # Derive the output name from the file stem instead of a fixed-width
    # slice, so it stays correct for stems that are not exactly 4 characters.
    output_name = os.path.splitext(data_file)[0] + ".csv"
    df.to_csv(os.path.join(processed_folder_path, output_name))



In [None]:
# Convert the raw tab-separated river-flow file into a formatted CSV
flow_data_folder = "Data_RiverFlow"
flow_data_file = "DGA.txt"

# Columns that together encode the timestamp of each record
date_columns = ["day", "month", "year", "hour"]

df = pd.read_csv(os.path.join(unprocessed_folder_path, flow_data_folder, flow_data_file),
                 delimiter="\t", index_col=False,
                 names=["station_number", "day", "month", "year", "hour",
                        "river_height", "river_flow", "information", "origin"])

# Assemble one datetime per row from the day/month/year/hour columns
date = pd.to_datetime(df[date_columns])

# Replace the four component columns with a single 'date' column placed
# right after station_number
df = df.drop(columns=date_columns)
df.insert(1, 'date', date)

# Make sure the output folder exists, and derive the output name from the
# file stem rather than a fixed-width slice of the file name.
os.makedirs(processed_folder_path, exist_ok=True)
output_name = os.path.splitext(flow_data_file)[0] + ".csv"
df.to_csv(os.path.join(processed_folder_path, output_name))

## Plot data density

### NDSI and NDVI data density over time

In [None]:
# Load the processed NDSI and NDVI tables; the first CSV column is the saved index.
df_NDSI = pd.read_csv(os.path.join(processed_folder_path, "NDSI.csv"), index_col=0)
# Read the NDVI file here -- loading NDSI.csv twice would make both frames identical.
df_NDVI = pd.read_csv(os.path.join(processed_folder_path, "NDVI.csv"), index_col=0)

In [None]:
def plot_data_density(dfs, labels):
    """Plot how many records each dataframe has per year and per month.

    Draws two figures, one line per dataframe: record counts grouped by the
    ``year`` column, then record counts grouped by the ``month`` column.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to compare. Each must have ``year`` and ``month`` columns
        holding integers or integer-like strings (e.g. ``"2004"``, ``"7"``).
    labels : iterable of str
        Legend label for each frame, in the same order as ``dfs``.
    """
    # (group column, figure title, x-axis label, explicit tick positions)
    figures = [
        ("year", "Frequency of years in data", "Year", None),
        ("month", "Frequency of months in data", "Month", range(1, 13)),
    ]

    for column, title, xlabel, ticks in figures:
        plt.figure(figsize=(14, 8))
        for df, label in zip(dfs, labels):
            counts = df.groupby(column).size()
            # Group keys may be strings, which sort lexicographically
            # ("1", "10", "11", "12", "2", ...). Cast to int and sort so
            # every count is drawn at its true numeric position.
            counts.index = counts.index.astype(int)
            counts = counts.sort_index()
            plt.plot(counts.index, counts.values, label=label)

        if ticks is not None:
            # Months are plotted at x = 1..12, so the ticks go there too.
            plt.xticks(ticks)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel("Frequency")
        plt.legend()
        plt.show()

In [None]:
# Derive string-valued year/month columns for each table from that table's
# OWN Date column (NDVI must not borrow the NDSI dates), then plot both.
for frame in (df_NDSI, df_NDVI):
    frame["Date"] = pd.to_datetime(frame["Date"])
    frame["year"] = frame["Date"].dt.year.astype(str)
    frame["month"] = frame["Date"].dt.month.astype(str)

plot_data_density([df_NDSI, df_NDVI], ["NDSI", "NDVI"])

### DGA data density over time

In [None]:
# Load the processed river-flow table and parse its timestamp column
df_DGA = pd.read_csv(os.path.join(processed_folder_path, "DGA.csv"), index_col=0)
df_DGA["date"] = pd.to_datetime(df_DGA["date"])

# year ends up as an integer column, month as a string column
df_DGA["year"] = df_DGA["date"].dt.year.astype(int)
df_DGA["month"] = df_DGA["date"].dt.month.astype(str)

plot_data_density([df_DGA], ["DGA"])