In [1]:
import pandas as pd
import os
from datetime import datetime, timedelta

def concat_pkl_to_csv_with_timestamp(directory, output_csv_file):
    """
    Reads all .pkl files in the given directory, adds a timestamp column to simulate a 500 Hz sampling rate,
    and concatenates them into one .csv file.

    :param directory: The directory containing .pkl files.
    :param output_csv_file: The path to the output .csv file.
    """
    pkl_files = [file for file in os.listdir(directory) if file.endswith('.pkl')]
    df_list = []

    for file in pkl_files:
        file_path = os.path.join(directory, file)

        df = pd.read_pickle(file_path)

        # Assuming each file represents 10 seconds of data at 500 Hz
        # Creating a timestamp range for this duration
        num_samples = len(df)
        start_time = datetime(2020, 1, 1)  # Arbitrary start date and time
        timestamps = [start_time + timedelta(seconds=2/1000) * i for i in range(num_samples)]

        # Add the timestamp column
        df['date'] = timestamps
        df_list.append(df)

    final_df = pd.concat(df_list, ignore_index=True)
    final_df.to_csv(output_csv_file, index=False)

# Example usage
directory = '/home/noam.koren/multiTS/NFT/data/ecg/pkl_files'  # Replace with your directory path
output_csv_file = '/home/noam.koren/multiTS/NFT/data/ecg/add_freq.csv'
concat_pkl_to_csv_with_timestamp(directory, output_csv_file)


In [4]:
import pandas as pd
import os
from datetime import datetime, timedelta

def concat_pkl_to_csv_with_timestamp_and_rename(directory, output_csv_file):
    """
    Reads all .pkl files in the given directory, renames the first column to "OT", adds a timestamp column 
    to simulate a 500 Hz sampling rate, and concatenates them into one .csv file.

    :param directory: The directory containing .pkl files.
    :param output_csv_file: The path to the output .csv file.
    """
    pkl_files = [file for file in os.listdir(directory) if file.endswith('.pkl')]
    df_list = []

    for file in pkl_files:
        file_path = os.path.join(directory, file)
        df = pd.read_pickle(file_path)
        
        # Rename the first column to "OT"
        first_column_name = df.columns[0]
        df.rename(columns={first_column_name: 'OT'}, inplace=True)

        # Assuming each file represents 10 seconds of data at 500 Hz
        # Creating a timestamp range for this duration
        num_samples = len(df)
        start_time = datetime(2020, 1, 1)  # Arbitrary start date and time
        timestamps = [start_time + timedelta(seconds=2/1000) * i for i in range(num_samples)]

        # Add the timestamp column
        df['date'] = timestamps
        df_list.append(df)

    final_df = pd.concat(df_list, ignore_index=True)
    final_df.to_csv(output_csv_file, index=False)

# Example usage
directory = '/home/noam.koren/multiTS/NFT/data/ecg/pkl_files'  # Replace with your directory path
output_csv_file = '/home/noam.koren/multiTS/NFT/data/ecg/add_freq.csv'
concat_pkl_to_csv_with_timestamp_and_rename(directory, output_csv_file)


In [3]:
import pandas as pd
import os
from datetime import datetime

def concat_pkl_to_csv_with_date(directory, output_csv_file):
    """
    Reads all .pkl files in the given directory, adds a date column to each, and concatenates them into one .csv file.

    :param directory: The directory containing .pkl files.
    :param output_csv_file: The path to the output .csv file.
    """
    pkl_files = [file for file in os.listdir(directory) if file.endswith('.pkl')]
    df_list = []

    for file in pkl_files:
        file_path = os.path.join(directory, file)

        # Get the file's modification time and convert it to a date
        mod_time = os.path.getmtime(file_path)
        date_stamp = datetime.fromtimestamp(mod_time).date()

        df = pd.read_pickle(file_path)
        # Add a date column with the date_stamp
        df['date'] = date_stamp
        df_list.append(df)
        # Rename the first column to "OT"
        first_column_name = df.columns[0]
        df.rename(columns={first_column_name: 'OT'}, inplace=True)

    final_df = pd.concat(df_list, ignore_index=True)
    final_df.to_csv(output_csv_file, index=False)

# Example usage
directory = '/home/noam.koren/multiTS/NFT/data/ecg/pkl_files'  # Replace with your directory path
output_csv_file = '/home/noam.koren/multiTS/NFT/data/ecg/concated_600.csv'
concat_pkl_to_csv_with_date(directory, output_csv_file)


In [6]:
import pandas as pd
import os
from datetime import datetime
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

def concat_pkl_to_csv_with_date_and_preprocessing(directory, output_csv_file):
    """
    Reads all .pkl files in the given directory, adds a date column to each, imputes missing values,
    normalizes the data, and concatenates them into one .csv file.

    :param directory: The directory containing .pkl files.
    :param output_csv_file: The path to the output .csv file.
    """
    pkl_files = [file for file in os.listdir(directory) if file.endswith('.pkl')]
    df_list = []

    # Initialize the imputer and scaler
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    scaler = StandardScaler()

    for file in pkl_files:
        file_path = os.path.join(directory, file)

        # Get the file's modification time and convert it to a date
        mod_time = os.path.getmtime(file_path)
        date_stamp = datetime.fromtimestamp(mod_time).date()

        df = pd.read_pickle(file_path)

        # Assuming '-' represents missing values; replace with NaN
        df.replace('-', np.nan, inplace=True)

        # Impute missing values
        df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

        # Normalize the data, except for the date column
        df_imputed.loc[:, df_imputed.columns != 'date'] = scaler.fit_transform(df_imputed.loc[:, df_imputed.columns != 'date'])

        # Add a date column with the date_stamp
        df_imputed['date'] = date_stamp

        # Rename the first column to "OT"
        first_column_name = df_imputed.columns[0]
        df_imputed.rename(columns={first_column_name: 'OT'}, inplace=True)

        df_list.append(df_imputed)

    final_df = pd.concat(df_list, ignore_index=True)
    final_df.to_csv(output_csv_file, index=False)

# Example usage
directory = '/home/noam.koren/multiTS/NFT/data/ecg/pkl_files'  # Replace with your directory path
output_csv_file = '/home/noam.koren/multiTS/NFT/data/ecg/concated_600.csv'
concat_pkl_to_csv_with_date_and_preprocessing(directory, output_csv_file)


In [1]:
import pandas as pd
import os
from datetime import datetime

def process_files(files, n_points):
    """Process a list of files to create a single DataFrame."""
    dfs = []
    for file in files:
        # Load the pickle file
        df = pd.read_pickle(file)
        
        # Take the first n_points
        # df = df.head(n_points)
        
        # Rename the first column to 'OT'
        first_column = df.columns[0]
        df.rename(columns={first_column: 'OT'}, inplace=True)
        
        # Add a 'date' column with the current date
        df['date'] = datetime.now().strftime('%Y-%m-%d')
        
        # Append to the list of DataFrames
        dfs.append(df)
        
    # Concatenate all DataFrames
    return pd.concat(dfs, ignore_index=True)

def main(directory, output_pkl_file, n_points):
    # Get all .pkl files sorted
    all_files = sorted([os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.pkl')])
    
    # Split into train, validation, and test sets
    train_files = all_files[:420]
    val_files = all_files[420:420+60]
    test_files = all_files[420+60:600]
    
    # Process each set
    train_df = process_files(train_files, n_points)
    val_df = process_files(val_files, n_points)
    test_df = process_files(test_files, n_points)
    
    # Save to new pickle files
    train_df.to_pickle(os.path.join(output_pkl_file, 'train.pkl'))
    val_df.to_pickle(os.path.join(output_pkl_file, 'validation.pkl'))
    test_df.to_pickle(os.path.join(output_pkl_file, 'test.pkl'))

# Example usage
directory = '/home/noam.koren/multiTS/NFT/data/ecg/pkl_files'
output_pkl_file = '/home/noam.koren/multiTS/NFT/data/ecg/transformer_ecg_data'
n_points = 2000
main(directory, output_pkl_file, n_points)


In [2]:
import pandas as pd
import os
import numpy as np
from datetime import datetime

def process_files(files, n_points):
    """Process a list of files to create a single DataFrame with imputed values."""
    dfs = []
    for file in files:
        # Load the pickle file
        df = pd.read_pickle(file)
        
        # Take the first n_points
        if n_points is not None:
            df = df.head(n_points)
        
        # Rename the first column to 'OT'
        first_column = df.columns[0]
        df.rename(columns={first_column: 'OT'}, inplace=True)
        
        # Add a 'date' column with the current date
        df['date'] = datetime.now().strftime('%Y-%m-%d')
        
        # Impute missing values
        # Here we use median for the example, adjust according to your needs
        # Assuming '-' represents missing values; replace with NaN
        df.replace('-', np.nan, inplace=True)
        df.fillna(df.median(), inplace=True)
        
        # Append to the list of DataFrames
        dfs.append(df)
        
    # Concatenate all DataFrames
    return pd.concat(dfs, ignore_index=True)

def main(directory, output_pkl_file, n_files, n_points):
    # Get all .pkl files sorted
    all_files = sorted([os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.pkl')])
    
    # Split into train, validation, and test sets
    n_train_files, n_vali_files = int(0.7*n_files), int(0.1*n_files)
    train_files = all_files[:n_train_files]
    val_files = all_files[n_train_files:n_train_files+n_vali_files]
    test_files = all_files[n_train_files+n_vali_files:]
    
    # Process each set
    train_df = process_files(train_files, n_points)
    val_df = process_files(val_files, n_points)
    test_df = process_files(test_files, n_points)
    
    # Save to new pickle files
    train_df.to_pickle(os.path.join(output_pkl_file, 'train.pkl'))
    val_df.to_pickle(os.path.join(output_pkl_file, 'validation.pkl'))
    test_df.to_pickle(os.path.join(output_pkl_file, 'test.pkl'))

# Adjust the paths according to your setup
directory = '/home/noam.koren/multiTS/NFT/data/ecg/pkl_files'
output_pkl_file = '/home/noam.koren/multiTS/NFT/data/ecg/transformer_ecg_data'
n_points = None
n_files = 300
main(directory, output_pkl_file, n_files, n_points)


  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), 