# Cleaning data from the raw CSVs

### Imports and directory definitions

In [None]:
import os
from pathlib import Path
import pandas as pd

# We'll have the datasets path here for easy access.
# `__file__` only exists when this code runs as a script/module; inside a
# Jupyter/IPython kernel it is undefined, so fall back to the working
# directory (presumably the notebook's own folder -- verify for your setup).
try:
    CURRENT_DIR = os.path.dirname(os.path.realpath(__file__))
except NameError:
    CURRENT_DIR = os.getcwd()

# The data folder sits two levels above CURRENT_DIR.
DATASETS_DIR = os.path.join(Path(CURRENT_DIR).parent.parent, "data")

FIRST_DATA_DIR = os.path.join(DATASETS_DIR, "first")
REEDITION_DIR = os.path.join(DATASETS_DIR, "reedition")
NEW_DATA_DIR = os.path.join(DATASETS_DIR, "new")

# Then every dataset path will be defined here
#   SM = shallow moonquake, DM = deep moonquake, AI = artificial impact, M = meteorite
#   L = Lognonne 2003 catalog
#   a = arrivals, l = locations
#   n = new (generated output; the *_JSON variants point to the JSON export)

# Input files (re-edited raw catalogs).
SMl_DIR = os.path.join(REEDITION_DIR, "nakamura_1979_sm_locations.csv")
SMa_DIR = os.path.join(REEDITION_DIR, "nakamura_1983_sm_arrivals.csv")

DMl_DIR = os.path.join(REEDITION_DIR, "nakamura_2005_dm_locations.csv")
DMa_DIR = os.path.join(REEDITION_DIR, "nakamura_2005_dm_arrivals.csv")

AIl_DIR = os.path.join(REEDITION_DIR, "nakamura_1983_ai_locations.csv")
AIa_DIR = os.path.join(REEDITION_DIR, "nakamura_1983_ai_arrivals.csv")

Ma_DIR = os.path.join(REEDITION_DIR, "nakamura_1983_m_arrivals.csv")

L_DIR = os.path.join(REEDITION_DIR, "lognonne_2003_catalog.csv")

# Output files (cleaned datasets), each as CSV and JSON.
DMn_DIR = os.path.join(NEW_DATA_DIR, "Deep_Moonquakes_Areas.csv")
DMn_DIR_JSON = os.path.join(NEW_DATA_DIR, "Deep_Moonquakes_Areas.json")
SMn_DIR = os.path.join(NEW_DATA_DIR, "Shallow_Moonquakes_Areas.csv")
SMn_DIR_JSON = os.path.join(NEW_DATA_DIR, "Shallow_Moonquakes_Areas.json")
Mn_DIR = os.path.join(NEW_DATA_DIR, "Meteoroid_Impact_Areas.csv")
Mn_DIR_JSON = os.path.join(NEW_DATA_DIR, "Meteoroid_Impact_Areas.json")
Ln_DIR = os.path.join(NEW_DATA_DIR, "Catalogued_Events.csv")
Ln_DIR_JSON = os.path.join(NEW_DATA_DIR, "Catalogued_Events.json")

### Deep Moonquakes (DM) Areas Generation

In [None]:
# Columns of the generated dataset (names as produced by `new_columns`,
# types as enforced by `columns_dtypes`):
#   Area_ID                                  -> int
#   Longitude, Longitude_Error               -> float
#   Latitude, Latitude_Error                 -> float
#   Depth, Depth_Error                       -> float
#   12P_Mean, 12S_Mean                       -> float
#   14P_Mean, 14S_Mean                       -> float
#   15P_Mean, 15S_Mean                       -> float
#   16P_Mean, 16S_Mean                       -> float
# NOTE(review): the original note listed longitude/latitude in "grades"
# (presumably degrees) and depth in km -- confirm against the source catalog.

# Raw column name -> output column name.
new_columns = {
    "A": "Area_ID",
    "Long": "Longitude",
    "Long_err": "Longitude_Error",
    "Lat": "Latitude",
    "Lat_err": "Latitude_Error",
    "Depth": "Depth",
    "Depth_err": "Depth_Error",
    "12P": "12P_Mean",
    "12S": "12S_Mean",
    "14P": "14P_Mean",
    "14S": "14S_Mean",
    "15P": "15P_Mean",
    "15S": "15S_Mean",
    "16P": "16P_Mean",
    "16S": "16S_Mean",
}

# Output column name -> dtype enforced on the final frame.
columns_dtypes = {
    "Area_ID": int,
    "Longitude": float,
    "Longitude_Error": float,
    "Latitude": float,
    "Latitude_Error": float,
    "Depth": float,
    "Depth_Error": float,
    "12P_Mean": float,
    "12S_Mean": float,
    "14P_Mean": float,
    "14S_Mean": float,
    "15P_Mean": float,
    "15S_Mean": float,
    "16P_Mean": float,
    "16S_Mean": float,
}

dml_df = pd.read_csv(DMl_DIR)
dma_df = pd.read_csv(DMa_DIR)

# Outer-join locations with arrivals on the columns they share, ordered by
# area id. `pd.merge` returns a new frame, so the inputs are left untouched.
ndm_df = pd.merge(dml_df, dma_df, how="outer").sort_values(by="A")

ndm_df = (
    ndm_df.rename(columns=new_columns)
    .drop(["Assumed", "Side"], axis=1)
    # Blank-string cells become 0; rows that still contain NaN are discarded.
    .replace(["", " "], 0)
    .dropna()
    .astype(columns_dtypes)
)

ndm_df.to_csv(DMn_DIR, index=False, encoding="utf-8")
# `DataFrame.to_json` has no `encoding` parameter, and `index=False` is not
# accepted with the default orient; the "table" orient supports it and matches
# the JSON layout used for the catalogued-events export.
ndm_df.to_json(DMn_DIR_JSON, orient="table", index=False, indent=2)

### Events (Lognonné) Data Generation

In [None]:
import datetime
from constants.dir import *
import pandas as pd

ldf = pd.read_csv(L_DIR)

# Intermediate column name -> final (snake_case) name written to the outputs.
columns_renames = {
    "Type": "type",
    "Latitude": "latitude",
    "Longitude": "longitude",
    "Depth": "depth",
    "Delta-a": "delta_a",
    "Delta-b": "delta_b",
    "Phi": "phi",
    "Depth_Error": "depth_error",
    "Time_err": "time_error",
    "Date": "date",
    "12P": "p12",
    "12S": "s12",
    "14P": "p14",
    "14S": "s14",
    "15P": "p15",
    "15S": "s15",
    "16P": "p16",
    "16S": "s16",
    "12P_EC": "p12_ec",
    "12S_EC": "s12_ec",
    "14P_EC": "p14_ec",
    "14S_EC": "s14_ec",
    "15P_EC": "p15_ec",
    "15S_EC": "s15_ec",
    "16P_EC": "p16_ec",
    "16S_EC": "s16_ec",
}

# Dtypes enforced before the final rename, keyed by the intermediate names.
columns_dtypes = {
    "Type": str,
    "Latitude": float,
    "Longitude": float,
    "Depth": float,
    "Delta-a": float,
    "Delta-b": float,
    "Phi": float,
    "Depth_Error": float,
    "Date": "datetime64[ns]",
    "12P": float,
    "12S": float,
    "14P": float,
    "14S": float,
    "15P": float,
    "15S": float,
    "16P": float,
    "16S": float,
    "12P_EC": float,
    "12S_EC": float,
    "14P_EC": float,
    "14S_EC": float,
    "15P_EC": float,
    "15S_EC": float,
    "16P_EC": float,
    "16S_EC": float,
}


def sepparate_dates(x):
    """Split a ``YYMMDDHHMM`` digit string into ``YY-MM-DD HH:MM``."""
    return f"{x[0:2]}-{x[2:4]}-{x[4:6]} {x[6:8]}:{x[8:]}"


# Getting rid of / fixing detected and documented errors: only rows whose
# numeric date is above 1000000000 are kept. `.copy()` makes the filtered
# result an independent frame, so the column assignments below do not trigger
# pandas' SettingWithCopyWarning.
ldf = ldf[ldf["Date"] > 1000000000].copy()

# Seconds as a zero-padded two-character string (e.g. 5 -> "05").
seconds = ldf["Seconds"].astype(int).astype(str).str.rjust(2, "0")

# Assemble "19YY-MM-DD HH:MM:SS" and parse it into real timestamps.
ldf["Date"] = "19" + ldf["Date"].astype(str).apply(sepparate_dates) + ":" + seconds
ldf["Date"] = pd.to_datetime(ldf["Date"], format="%Y-%m-%d %H:%M:%S")

ldf = (
    ldf.sort_values(by="Date")
    # First bring the raw abbreviations in line with the keys of
    # `columns_dtypes`, then enforce the dtypes.
    .rename(columns={"Long": "Longitude", "Lat": "Latitude", "Depth_err": "Depth_Error"})
    .astype(columns_dtypes)
    # Missing values are exported as 0.
    .fillna(0)
    .rename(columns=columns_renames)
    # The seconds are already folded into the date column.
    .drop(["Seconds"], axis=1)
)

ldf.to_csv(Ln_DIR, index=False, encoding="utf-8")
ldf.to_json(Ln_DIR_JSON, indent=2, orient="table")