In [None]:
import pandas as pd
import numpy as np
import os
import datetime

In [None]:
# Locations, relative to the working directory, of the cleaned outputs and
# of the raw inputs that the cells below read from and write to.
processed_folder_path, unprocessed_folder_path = (
    os.path.join("data", stage) for stage in ("processed", "unprocessed")
)

In [None]:
def pandas_eda(df):
    """Print a quick exploratory summary of a DataFrame.

    Three sections are written to standard output, each followed by a
    line of 100 asterisks: the first five rows (``df.head()``), the
    structural report produced by ``df.info()``, and the per-column
    count of missing values (``df.isnull().sum()``).

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        None.
    """
    separator = "*" * 100

    # head() yields the leading rows, so the section is labelled as rows.
    print("First 5 rows")
    print(df.head())
    print(separator)

    print("Dataframe information")
    # info() writes its report itself and returns None; wrapping the call
    # in print() would append a stray "None" line to the output.
    df.info()
    print(separator)

    print("Missing values")
    print(df.isnull().sum())
    print(separator)

In [None]:
# Convert data files from NDSI/NDVI to formatted CSVs
data_folder = "Data_NDSI_NDVI"
data_files = ["NDSI.txt", "NDVI.txt"]

# Column labels passed to read_csv via `names`; because explicit names are
# given without a header argument, every line of the input is read as data.
# The same labels are applied to both files.
ndsi_ndvi_columns = [
    "Watershed", "Subsubwatershed", "Product", "Date",
    "Areaini", "Areareproj", "SurfNDSImax", "SurfNDSImin",
    "SurfNDSIavg", "NDSImax", "NDSImin", "NDSIavg",
    "Surfcloudmax", "Surfcloudmin", "Surfcloudavg",
    "Surfbadpixmax", "Surfbadpixmin", "Surfbadpixavg",
]

# Make sure the output folder exists so to_csv does not fail when the
# processed directory has not been created yet.
os.makedirs(processed_folder_path, exist_ok=True)

for data_file in data_files:
    df = pd.read_csv(
        os.path.join(unprocessed_folder_path, data_folder, data_file),
        delimiter="\t",
        index_col=False,
        names=ndsi_ndvi_columns,
    )
    df["Date"] = pd.to_datetime(df["Date"])
    pandas_eda(df)

    # Derive the output name from the file stem instead of a fixed-width
    # slice, so stems that are not exactly four characters long still work.
    output_name = os.path.splitext(data_file)[0] + ".csv"
    df.to_csv(os.path.join(processed_folder_path, output_name))



In [None]:
# Convert the raw river-flow file into a CSV whose separate day/month/year/hour
# columns are collapsed into a single datetime column.
flow_data_folder = "Data_RiverFlow"
flow_data_file = "DGA.txt"

date_columns = ["day", "month", "year", "hour"]

# Explicit names are supplied without a header argument, so every line of
# the tab-separated input is read as data.
df = pd.read_csv(
    os.path.join(unprocessed_folder_path, flow_data_folder, flow_data_file),
    delimiter="\t",
    index_col=False,
    names=["station_number", "day", "month", "year", "hour",
           "river_height", "river_flow", "information", "origin"],
)

# Assemble one timestamp per row from the component columns; to_datetime
# accepts a frame whose columns are named year/month/day/hour.
date = pd.to_datetime(df[date_columns])

# Replace the four component columns with the combined one, placed at
# position 1, i.e. directly after station_number.
df = df.drop(columns=date_columns)
df.insert(1, 'date', date)

# Make sure the output folder exists so to_csv does not fail when the
# processed directory has not been created yet.
os.makedirs(processed_folder_path, exist_ok=True)

# Derive the output name from the file stem instead of a fixed-width slice.
output_name = os.path.splitext(flow_data_file)[0] + ".csv"
df.to_csv(os.path.join(processed_folder_path, output_name))