In [22]:
# Import libraries
import glob
import os
from datetime import datetime, timedelta
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.fft import fft
from scipy.integrate import cumulative_trapezoid

try:
    from scipy.integrate import cumtrapz
except ImportError:  # SciPy >= 1.14 removed the old name
    cumtrapz = cumulative_trapezoid

# Locations of the raw challenge data, relative to the notebook's working directory
lunar_test_data_directory = './space_apps_2024_seismic_detection/data/lunar/test/data'
lunar_training_data_directory = './space_apps_2024_seismic_detection/data/lunar/training/data/S12_GradeA/'

mars_test_data_directory = './space_apps_2024_seismic_detection/data/mars/test/data'
mars_training_data_directory = './space_apps_2024_seismic_detection/data/mars/training/data'

# Single lunar training recording used by the example calls further down
lunar_training_file = os.path.join(
    lunar_training_data_directory,
    'xa.s12.00.mhz.1970-01-19HR00_evid00002.csv',
)

# l_training_df = pd.read_csv(lunar_training_file)
# # l_train_df = pd.read_csv()


In [10]:
# data cleaning tools

def clean_csv(file_path, output_path, log_file_path):
    """
    Cleans one CSV file, writes the result, and appends a removal log.

    Rows are removed in three passes, in this order:
      1. rows containing any missing value;
      2. rows whose timestamp repeats an earlier row (the first occurrence is
         kept), using whichever of the two known time columns is present;
      3. rows whose 'velocity(m/s)' equals -1, only when 'lunar' appears in
         file_path (case-insensitive).

    Parameters:
        file_path: path of the CSV to read.
        output_path: path the cleaned CSV is written to (without the index).
        log_file_path: text file the log is appended to; one entry per
            removed row, or "No data removed." when nothing was dropped.
    """
    df = pd.read_csv(file_path)

    log_entries = []

    def log_rows(label, rows):
        # One log line per row, recording its original index and full contents.
        for index, row in rows.iterrows():
            log_entries.append(f"{label}: Index {index}, Data: {row.to_dict()}")

    # 1. Rows with any missing value
    missing_mask = df.isnull().any(axis=1)
    log_rows("Missing Data Removed", df[missing_mask])
    df = df[~missing_mask]

    # 2. Repeated timestamps. The relative-time column is checked first; only
    #    one column is ever used for de-duplication.
    time_columns = (
        'time(%Y-%m-%dT%H:%M:%S.%f)',
        'time_abs(%Y-%m-%dT%H:%M:%S.%f)',
    )
    time_column = next((name for name in time_columns if name in df.columns), None)
    if time_column is not None:
        # keep='first' flags exactly the rows that get dropped, so the log no
        # longer reports the retained first occurrence as removed.
        duplicate_mask = df.duplicated(subset=time_column, keep='first')
        log_rows("Duplicate Time Removed", df[duplicate_mask])
        df = df[~duplicate_mask]

    # 3. Velocity of -1 is treated as "unknown" for lunar files only.
    #    (-1 == -1.0 numerically, so a single comparison covers both spellings.)
    if 'lunar' in file_path.lower() and 'velocity(m/s)' in df.columns:
        unknown_mask = df['velocity(m/s)'] == -1
        log_rows("Unknown Velocity Removed (velocity=-1 or -1.0)", df[unknown_mask])
        df = df[~unknown_mask]

    # Save cleaned data to the output path
    df.to_csv(output_path, index=False)

    # Append this file's section to the shared log
    with open(log_file_path, 'a') as log_file:
        log_file.write(f"Cleaning log for file: {file_path}\n")
        if log_entries:
            for entry in log_entries:
                log_file.write(entry + '\n')
        else:
            log_file.write("No data removed.\n")
        log_file.write("\n")

# Walk a directory tree and clean every CSV file found in it
def clean_data_in_directory(input_directory, output_directory, log_file_path):
    """
    Runs clean_csv on every '.csv' file under input_directory.

    Each cleaned file is written below output_directory at the same relative
    path it had below input_directory; missing subdirectories are created.
    All files share the log at log_file_path, and a confirmation line is
    printed per file.
    """
    for current_dir, _subdirs, filenames in os.walk(input_directory):
        for filename in filenames:
            if not filename.endswith('.csv'):
                continue

            source_csv = os.path.join(current_dir, filename)

            # Mirror the input layout inside the output directory
            destination_csv = os.path.join(
                output_directory,
                os.path.relpath(source_csv, input_directory),
            )
            os.makedirs(os.path.dirname(destination_csv), exist_ok=True)

            clean_csv(source_csv, destination_csv, log_file_path)
            print(f"Cleaned file saved to: {destination_csv}")

In [11]:
# Data cleaning
# Put desired clean output directories here
clean_data_directory = './clean_data'
lunar_test_clean_directory = os.path.join(clean_data_directory, 'lunar/test')
lunar_training_clean_directory = os.path.join(clean_data_directory, 'lunar/training')
mars_test_clean_directory = os.path.join(clean_data_directory, 'mars/test')
mars_training_clean_directory = os.path.join(clean_data_directory, 'mars/training')

# Ensure the clean directories exist
os.makedirs(lunar_test_clean_directory, exist_ok=True)
os.makedirs(lunar_training_clean_directory, exist_ok=True)
os.makedirs(mars_test_clean_directory, exist_ok=True)
os.makedirs(mars_training_clean_directory, exist_ok=True)

# Path to cleaning log file
log_file_path = './cleaning_log.txt'

# Empty the log before populating it. clean_csv appends, so without this a
# re-run would stack a second copy of the log onto the first. The `with`
# block closes the handle immediately instead of leaking it.
with open(log_file_path, 'w'):
    pass

# Clean data in each of the specified directories
clean_data_in_directory(lunar_test_data_directory, lunar_test_clean_directory, log_file_path)
clean_data_in_directory(lunar_training_data_directory, lunar_training_clean_directory, log_file_path)
clean_data_in_directory(mars_test_data_directory, mars_test_clean_directory, log_file_path)
clean_data_in_directory(mars_training_data_directory, mars_training_clean_directory, log_file_path)


Cleaned file saved to: ./clean_data/lunar/test/S16_GradeA/xa.s16.00.mhz.1972-11-14HR00_evid00081.csv
Cleaned file saved to: ./clean_data/lunar/test/S16_GradeA/xa.s16.00.mhz.1974-12-25HR00_evid00174.csv
Cleaned file saved to: ./clean_data/lunar/test/S16_GradeA/xa.s16.00.mhz.1975-03-26HR00_evid00186.csv
Cleaned file saved to: ./clean_data/lunar/test/S16_GradeA/xa.s16.00.mhz.1973-07-31HR00_evid00123.csv
Cleaned file saved to: ./clean_data/lunar/test/S16_GradeA/xa.s16.00.mhz.1974-05-19HR00_evid00146.csv
Cleaned file saved to: ./clean_data/lunar/test/S16_GradeA/xa.s16.00.mhz.1972-11-08HR00_evid00080.csv
Cleaned file saved to: ./clean_data/lunar/test/S16_GradeA/xa.s16.00.mhz.1975-02-19HR00_evid00180.csv
Cleaned file saved to: ./clean_data/lunar/test/S16_GradeA/xa.s16.00.mhz.1977-04-17HR00_evid00249.csv
Cleaned file saved to: ./clean_data/lunar/test/S16_GradeA/xa.s16.00.mhz.1972-11-06HR00_evid00079.csv
Cleaned file saved to: ./clean_data/lunar/test/S16_GradeA/xa.s16.00.mhz.1977-06-02HR00_evid

In [12]:
# create list of dataframes of clean data csv
# TODO: Not enough memory to use this

def populate_df_list(input_directory):
    """
    Loads every '.csv' file found under input_directory into a DataFrame.

    Returns the DataFrames as a list. A file that cannot be read is reported
    with a printed message and skipped.
    """
    frames = []
    for folder, _subfolders, names in os.walk(input_directory):
        csv_paths = (
            os.path.join(folder, name)
            for name in names
            if name.endswith('.csv')
        )
        for csv_path in csv_paths:
            try:
                frames.append(pd.read_csv(csv_path))
            except Exception as e:
                print(f"Error reading {csv_path}: {e}")
    return frames

# lunar_test_df_list = populate_df_list('./clean_data/lunar/test')
# lunar_training_df_list = populate_df_list(lunar_training_clean_directory)
# mars_test_df_list = populate_df_list(mars_test_clean_directory)
# mars_training_df_list = populate_df_list(mars_training_clean_directory)

In [25]:
# TODO: alternative data tools
def convert_velocity_to_displacement(input_csv, output_csv):
    """
    Converts velocity data from CSV to displacement and writes to a new CSV.

    Reads 'time_rel(sec)' and 'velocity(m/s)' from input_csv, integrates the
    velocity over time with the cumulative trapezoidal rule, and writes every
    original column plus a new 'displacement(m)' column to output_csv.
    """
    df = pd.read_csv(input_csv)

    time = df['time_rel(sec)'].values
    velocity = df['velocity(m/s)'].values

    # Integrate velocity to obtain displacement. `cumulative_trapezoid` is the
    # current name of the removed `cumtrapz`. initial=0 prepends a zero so the
    # result has one value per input row and starts at zero displacement.
    displacement = cumulative_trapezoid(velocity, time, initial=0)

    displacement_df = df.copy()
    displacement_df['displacement(m)'] = displacement

    displacement_df.to_csv(output_csv, index=False)

    print(f"Displacement data written to {output_csv}")
    
def convert_velocity_to_acceleration(input_csv, output_csv):
    """
    Derives acceleration from a velocity CSV and saves it to a new CSV.

    The output holds every input column plus 'acceleration(m/s^2)', computed
    as the numerical derivative of 'velocity(m/s)' with respect to
    'time_rel(sec)'.
    """
    source_df = pd.read_csv(input_csv)

    seconds = source_df['time_rel(sec)'].values
    velocity_ms = source_df['velocity(m/s)'].values

    # Passing the time values as coordinates makes np.gradient use the actual
    # sample spacing rather than assuming unit steps.
    dv_dt = np.gradient(velocity_ms, seconds)

    result_df = source_df.copy()
    result_df['acceleration(m/s^2)'] = dv_dt
    result_df.to_csv(output_csv, index=False)

    print(f"Acceleration data written to {output_csv}")


# convert_velocity_to_displacement(lunar_training_file, './lunar_displacement.csv')
# convert_velocity_to_acceleration(lunar_training_file, './lunar_acceleration.csv')

def plot_displacement_over_time(input_csv):
    """
    Draws displacement against absolute time for a displacement CSV.

    Expects the columns 'time_abs(%Y-%m-%dT%H:%M:%S.%f)' and 'displacement(m)'
    in input_csv. The figure is shown, not returned.
    """
    data = pd.read_csv(input_csv)

    timestamps = pd.to_datetime(data['time_abs(%Y-%m-%dT%H:%M:%S.%f)'])
    displacement_m = data['displacement(m)'].values

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(timestamps, displacement_m, label='Displacement Over Time', color='b')
    ax.set_xlabel('Time (Day Hour:Minute)')
    ax.set_ylabel('Displacement (meters)')
    ax.set_title('Displacement Over Time')
    plt.xticks(rotation=45)  # slanted tick labels so the timestamps stay readable
    ax.grid(True)
    ax.legend()
    fig.tight_layout()
    plt.show()

# plot_displacement_over_time('lunar_displacement.csv')

def plot_acceleration_over_time(input_csv):
    """
    Draws acceleration against absolute time for an acceleration CSV.

    Expects the columns 'time_abs(%Y-%m-%dT%H:%M:%S.%f)' and
    'acceleration(m/s^2)' in input_csv. The figure is shown, not returned.
    """
    data = pd.read_csv(input_csv)

    timestamps = pd.to_datetime(data['time_abs(%Y-%m-%dT%H:%M:%S.%f)'])
    acceleration_ms2 = data['acceleration(m/s^2)'].values

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(timestamps, acceleration_ms2, label='Acceleration Over Time', color='r')
    ax.set_xlabel('Time (Day Hour:Minute)')
    ax.set_ylabel('Acceleration (m/s²)')
    ax.set_title('Acceleration Over Time')
    plt.xticks(rotation=45)  # slanted tick labels so the timestamps stay readable
    ax.grid(True)
    ax.legend()
    fig.tight_layout()
    plt.show()

# Example usage:
# plot_acceleration_over_time('lunar_acceleration.csv')

def plot_frequency_spectrum(input_csv):
    """
    Plots the amplitude spectrum of the velocity trace in a CSV file.

    Reads 'time_rel(sec)' and 'velocity(m/s)' from input_csv, applies an FFT
    to the velocity, and plots the magnitude of the first N // 2 bins (the
    non-negative frequencies). The frequency axis is derived from the first
    sample interval only, so uniform sampling is assumed.

    Raises:
        ValueError: if the file has fewer than two samples, or if the first
            two time values do not increase (no usable sample spacing).
    """
    df = pd.read_csv(input_csv)
    time = df['time_rel(sec)'].values
    velocity = df['velocity(m/s)'].values

    N = len(velocity)
    if N < 2:
        # time[1] would be out of range; fail with a message that names the cause
        raise ValueError(
            f"Need at least two samples to compute a spectrum, got {N} in {input_csv}"
        )

    sample_spacing = time[1] - time[0]
    if sample_spacing <= 0:
        # A zero spacing would produce inf/NaN frequencies on the axis
        raise ValueError(
            f"Non-increasing time values at the start of {input_csv}; "
            "cannot derive the sample spacing"
        )

    # Apply fast fourier transform to velocity data
    velocity_fft = fft(velocity)
    frequency = np.fft.fftfreq(N, d=sample_spacing)

    # Plot frequency spectrum (first half only: the non-negative frequencies)
    half = N // 2
    plt.figure(figsize=(10, 6))
    plt.plot(frequency[:half], np.abs(velocity_fft)[:half], label='Frequency Spectrum')
    plt.xlabel('Frequency (Hz)')
    plt.ylabel('Amplitude')
    plt.title('Frequency Spectrum of Velocity Data')
    plt.grid(True)
    plt.legend()
    plt.show()

# Example usage
# plot_frequency_spectrum(lunar_training_file)