# Data Organization for Seismic Wave Analysis

This notebook handles the organization of seismic wave data for model training. The main objectives are:

1. Split the filtered dataset into training (80%) and testing (20%) sets
2. Move files to their respective directories
3. Verify the data distribution

## Directory Structure
- `data/procesed/normalized_filtered_dataset/`: Contains the preprocessed and filtered seismic data (the source of the split)
- `data/procesed/used_data/training/`: Destination for training data (80%)
- `data/procesed/used_data/testing/`: Destination for testing data (20%)

In [2]:
import pandas as pd 
import os 
import shutil
import numpy as np
from obspy import read
import matplotlib.pyplot as plt
import scipy
from sklearn.model_selection import train_test_split

In [3]:
# Root of the SeismicWaves project. Every path below is derived from it, so the
# notebook can run on another machine by setting the SEISMIC_WAVES_ROOT
# environment variable instead of editing four hard-coded strings.
# The default reproduces the original absolute paths exactly.
project_root = os.environ.get(
    'SEISMIC_WAVES_ROOT',
    '/mnt/c/Users/Usuario/Documents/Studies/GicoProject/SeismicWaves',
)

# NOTE: 'procesed' is spelled as in the original path strings; do not "correct" it here.
processed_dir = os.path.join(project_root, 'data', 'procesed')

filtered_dataset_path = os.path.join(processed_dir, 'normalized_filtered_dataset')
train_dataset_path = os.path.join(processed_dir, 'used_data', 'training')
test_dataset_path = os.path.join(processed_dir, 'used_data', 'testing')
data_csv_path = os.path.join(project_root, 'data', 'raw', 'VT_P_training.csv')

# os.listdir returns entries in arbitrary order; sorting gives a stable listing
# so that anything derived from it (e.g. a seeded train/test split) is reproducible.
files = sorted(os.listdir(filtered_dataset_path))
print(f'Total files in filtered dataset: {len(files)}')

Total files in filtered dataset: 2476


In [8]:
# Build the list of file stems (name without the '.mseed' extension).
# - Only '.mseed' entries are kept, so stray directory entries never reach the split.
# - os.path.splitext strips just the trailing extension; the previous
#   str.replace('.mseed', '') could interpret '.' as a regex wildcard on pandas
#   versions where `regex` defaults to True, and replaced matches anywhere in the name.
# - Sorting fixes the row order, which is required for random_state=42 to yield
#   the same split on every run (os.listdir order is arbitrary).
mseed_stems = [
    os.path.splitext(name)[0]
    for name in sorted(files)
    if name.endswith('.mseed')
]
df = pd.DataFrame(mseed_stems, columns=['filename'])

# 80% training / 20% testing, seeded for reproducibility
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# every file must land in exactly one of the two sets
assert len(train_set) + len(test_set) == len(df)

# print the number of files in each set
print(f'Total files in training set: {len(train_set)}')
print(f'Total files in testing set: {len(test_set)}')

Total files in training set: 1980
Total files in testing set: 496


In [9]:
def move_files(df, source_path, dest_path):
    """
    Move the '.mseed' files named in a dataframe from one directory to another.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'filename' column holding file names without the
        '.mseed' extension.
    source_path : str
        Directory the files currently live in.
    dest_path : str
        Directory to move the files into. It is created (including parents)
        when it does not exist yet.

    Returns
    -------
    None
        One line is printed per file: either the move that was performed or
        a notice that the source file does not exist.
    """
    # shutil.move raises if the destination directory is missing
    os.makedirs(dest_path, exist_ok=True)

    # Only the 'filename' column is needed, so iterate over it directly
    for filename in df['filename']:
        mseed_name = f'{filename}.mseed'
        source_file = os.path.join(source_path, mseed_name)
        dest_file = os.path.join(dest_path, mseed_name)
        if os.path.exists(source_file):
            shutil.move(source_file, dest_file)
            print(f'Moved {source_file} to {dest_file}')
        else:
            # Missing sources are reported, not raised, so one gap does not abort the batch
            print(f'File {source_file} does not exist')

In [None]:
# Relocate each split out of the filtered dataset folder:
# the training set first, then the testing set.
for split_df, split_dir in (
    (train_set, train_dataset_path),
    (test_set, test_dataset_path),
):
    move_files(split_df, filtered_dataset_path, split_dir)

In [5]:
# Re-list the three directories so the counts reflect what is on disk right now.
files = os.listdir(filtered_dataset_path)
files_train = os.listdir(train_dataset_path)
files_test = os.listdir(test_dataset_path)

# Report how many entries each directory currently holds.
print(f'Total files in filtered dataset: {len(files)}')
print(f'Total files in training dataset: {len(files_train)}')
print(f'Total files in test dataset: {len(files_test)}')


Total files in filtered dataset: 2476
Total files in training dataset: 317
Total files in test dataset: 496


In [44]:
# Preview the first five rows of the training split (index = row position in the pre-split frame).
train_set.head(n=5)

Unnamed: 0,filename
51,1062222
198,1220330
765,4301304
631,4190209
332,2131016
