In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:

def save_to_excel(df, file_path):
    # Ensure the file path ends with .xlsx
    if not file_path.endswith('.xlsx'):
        file_path += '.xlsx'
    
    if os.path.exists(file_path):
        try:
            existing_df = pd.read_excel(file_path)
            combined_df = pd.concat([existing_df, df], ignore_index=True)
        except Exception as e:
            print(f"Error reading existing file: {e}")
            print("Creating a new file instead.")
            combined_df = df
    else:
        combined_df = df
    with pd.ExcelWriter(file_path, engine='openpyxl', mode='w') as writer:
        combined_df.to_excel(writer, index=False)

    print(f"Data successfully saved to {file_path}")

def create_seasonality_series(n_variables=n_variables, n_timesteps=n_timesteps, seasonality_period=50, noise_level=0.1, save_data=False):
        
    # Create time vector
    time = np.arange(n_timesteps)

    # Generate synthetic multivariate time series data
    data = np.zeros((n_timesteps, n_variables))
    for i in range(n_variables):
        # Add high seasonality with different phases and amplitudes
        amplitude = np.random.uniform(0.5, 1.5)
        phase = np.random.uniform(0, 2 * np.pi)
        data[:, i] = amplitude * np.sin(2 * np.pi * time / seasonality_period + phase)

        # Add some noise
        data[:, i] += np.random.normal(0, noise_level, n_timesteps)

    # Convert to a pandas DataFrame for visualization and manipulation
    columns = [f"Variable_{i+1}" for i in range(n_variables)]
    synthetic_data = pd.DataFrame(data, columns=columns)

    # Plot the generated time series data
    synthetic_data.plot(figsize=(12, 6))
    plt.title("Synthetic Multivariate Time Series with High Seasonality")
    plt.xlabel("Time")
    plt.ylabel("Values")
    plt.grid()
    plt.show()

    if save_data:
        directory = '/home/noam.koren/multiTS/NFT/data/seasonality/'
        file_path = os.path.join(directory, 'seasonality.pkl')
        os.makedirs(directory, exist_ok=True)
        synthetic_data.to_pickle(file_path)

def process_component_info(seasonality_trend_info, series_name):
    component_info = pd.DataFrame(seasonality_trend_info)
    component_info['Series_Name'] = series_name

    averages = component_info.select_dtypes(include='number').mean()
    averages_row = {col: averages[col] if col in averages.index else None for col in component_info.columns}
    averages_row['Series_Name'] = series_name
    averages_row['Series'] = 'Average'
    averages_row['Seasonality_Precetage'] = averages_row['Seasonality_Dominance'] / (averages_row['Seasonality_Dominance']  + averages_row['Trend_Dominance'])
    averages_df = pd.DataFrame([averages_row])

    component_info = pd.concat([component_info, averages_df], ignore_index=True)

    cols = ['Series_Name'] + [col for col in component_info.columns if col != 'Series_Name']
    component_info = component_info[cols]

    print("Averaged Component Info:")
    print(averages_df[cols].head(6))

    return component_info, averages_df[cols]

def create_time_series(
    num_series, 
    num_points,
    trend_amplitude=0.5, 
    noise_level=1, 
    seasonal_amplitude=1, # (0.5, 1.5)
    save_data=False,
    series_name=None,
    compute_components=True  # parameter to compute seasonality and trend proportions
    ):
    time = np.arange(num_points)
    data = {}
    seasonality_trend_info = []

    for i in range(num_series):
        trend = trend_amplitude * (time / num_points)
        
        phase = np.random.uniform(0, 2 * np.pi)
        seasonal = (seasonal_amplitude * (i+1)) * np.sin(2 * np.pi * time / (50 + i * 5) + i * phase)
    
        noise = np.random.normal(0, noise_level, num_points)
        
        series = trend + seasonal + noise
        data[f"Series_{i+1}"] = series
        
        if compute_components:
            total_variance = np.var(series)
            seasonal_variance = np.var(seasonal)
            trend_variance = np.var(trend)
            noise_variance = np.var(noise)
        

            seasonality_ratio = seasonal_variance / total_variance
            trend_ratio = trend_variance / total_variance
            trend_to_seasonality_ratio = trend_variance / seasonal_variance if seasonal_variance > 0 else np.inf
            seasonality_to_trend_ratio = seasonal_variance / trend_variance if trend_variance > 0 else np.inf
            noise_ratio = noise_variance / total_variance
            
            trend_dominance = max(0, 1 - (noise_variance / (noise_variance + trend_variance)))
            seasonality_dominance = max(0, 1 - (noise_variance / (noise_variance + seasonal_variance)))

            seasonality_trend_info.append({
                'Series': f"Series_{i+1}",
                'Seasonality_Dominance': seasonality_dominance,
                'Trend_Dominance': trend_dominance,
                'Seasonality_Ratio': seasonality_ratio,
                'Trend_Ratio': trend_ratio,
                'Trend_to_Seasonality_Ratio': trend_to_seasonality_ratio,
                'Seasonality_to_Trend_Ratio': seasonality_to_trend_ratio,
                'Noise_Ratio': noise_ratio
            })

    synthetic_data = pd.DataFrame(data, index=time)
    # synthetic_data.plot(figsize=(10, 6), title="Synthetic Multivariate Time Series")
    # plt.xlabel("Time")
    # plt.ylabel("Value")
    # plt.show()
    
    component_info = process_component_info(seasonality_trend_info, series_name=series_name)

    
    if save_data:
        directory = f'/home/noam.koren/multiTS/NFT/data/{series_name}/'
        file_path = os.path.join(directory, f'{series_name}.pkl')
        os.makedirs(directory, exist_ok=True)
        synthetic_data.to_pickle(file_path)
        
        save_to_excel(component_info, '/home/noam.koren/multiTS/NFT/models/tests/analyse_data/data_components.xlsx')

n_variables=5
n_timesteps=1000
  
# for seasonal_amplitude in [4, 5, 6, 7, 8, 9, 10]:
#     for trend_amplitude in [0.50]:
#         # create_time_series(
#         #     num_series=n_variables, 
#         #     num_points=n_timesteps,
#         #     trend_amplitude=trend_amplitude, 
#         #     noise_level=0.1, 
#         #     seasonal_amplitude=seasonal_amplitude,
#         #     save_data=True,
#         #     series_name=f'seasonal_{seasonal_amplitude}_trend_{trend_amplitude}'
#         #     )
         

In [21]:
def get_components(y):

    # Step 2: Apply Fast Fourier Transform (FFT)
    n = len(y)  # Length of the signal
    fft_vals = np.fft.fft(y)  # FFT
    frequencies = np.fft.fftfreq(n)  # Frequency bins

    # Step 3: Identify dominant frequencies (filtering)
    # Sort by the magnitude of the FFT values and retain top frequencies
    magnitude = np.abs(fft_vals)
    threshold = 0.05 * max(magnitude)  # Keep frequencies with > 5% of max amplitude
    filtered_fft_vals = fft_vals.copy()
    filtered_fft_vals[magnitude < threshold] = 0  # Zero out low-amplitude components

    # Step 4: Reconstruct the seasonality using Inverse FFT (IFFT)
    seasonality = np.fft.ifft(filtered_fft_vals).real
    
    # Step 5: Filter out high frequencies (low-pass filter)
    # Define a cutoff frequency: retain only the low frequencies
    cutoff = 0.05  # Adjust this value based on your data (0 < cutoff < 1)
    filtered_fft_vals = fft_vals.copy()
    filtered_fft_vals[np.abs(frequencies) > cutoff] = 0  # Zero out high-frequency components

    # Step 4: Reconstruct the trend using Inverse FFT (IFFT)
    trend = np.fft.ifft(filtered_fft_vals).real
    
    noise = y - seasonality - trend
    
    return seasonality, trend, noise
 
def plot_componentes(y, seasonality, trend, noise):
    plt.figure(figsize=(12, 6))
    plt.plot(y, label='Original Series', color='blue')
    plt.plot(seasonality, label='Extracted Seasonality', color='red')
    plt.plot(trend, label='Extracted Trend', color='orange')
    plt.plot(noise, label='Extracted Noise', color='green')
    plt.title('Components Extraction using Fourier Transform')
    plt.legend()
    plt.show()
 
def calculate_components_dominance(data, data_name, save_Excel=False):
    seasonality_trend_info = []
    for column in data.columns:
        y = data[column]
            
        seasonal, trend, noise = get_components(y)

        total_variance = y.var()
        seasonal_variance = seasonal.var()
        trend_variance = trend.var()
        noise_variance = noise.var()

        seasonality_dominance = max(0, 1 - (noise_variance / (noise_variance + seasonal_variance)))
        trend_dominance = max(0, 1 - (noise_variance / (noise_variance + trend_variance)))
        seasonality_precetage = seasonality_dominance / (seasonality_dominance + trend_dominance) if (seasonality_dominance + trend_dominance) != 0 else 0
        
        print(seasonality_dominance, trend_dominance, seasonality_precetage)
        
        seasonality_trend_info.append({
            'Series': column,
            'Seasonality_Dominance': seasonality_dominance,
            'Trend_Dominance': trend_dominance,
            'Seasonality_Precetage': seasonality_precetage,
            'total_variance': total_variance,
            'seasonal_variance': seasonal_variance,
            'trend_variance': trend_variance,
            'noise_variance': noise_variance,
            })
        
    component_info, averages_df = process_component_info(seasonality_trend_info, series_name=data_name)
    if save_Excel:
        save_to_excel(averages_df, f'/home/noam.koren/multiTS/NFT/models/tests/analyse_data/real_data_components.xlsx')

    return component_info, averages_df

# pkl_dir = "/home/noam.koren/multiTS/NFT/data/ecg/pkl_files/"
# for pkl_file in os.listdir(pkl_dir):
#     if pkl_file.endswith(".pkl"):  # Check if the file is a .pkl file
#         file_path = os.path.join(pkl_dir, pkl_file)  # Full path to the file
#         data = pd.read_pickle(file_path)
#         component_info, averages_df = calculate_components_dominance(data, pkl_file[:-4], save_Excel=True)

# for dataset in ['electricity', 'exchange', 'illness', 'traffic']:
#     data = pd.read_pickle(f'/home/noam.koren/multiTS/NFT/data/{dataset}/{dataset}_no_date.pkl')
#     component_info, averages_df = calculate_components_dominance(data, dataset, save_Excel=True)


data = pd.read_pickle('/home/noam.koren/multiTS/NFT/data/electricity/mini_elctricity.pkl')
component_info, averages_df = calculate_components_dominance(data, dataset, save_Excel=True)

# for n in ['AE000041196', 'AEM00041194', 'AEM00041217']:
#     data = pd.read_pickle(f'/home/noam.koren/multiTS/NFT/data/noaa/noaa_ghcn/noaa_pkl/{n}.pkl')
#     component_info, averages_df = calculate_components_dominance(data, n, save_Excel=True)

# pkl_dir = "/home/noam.koren/multiTS/NFT/data/noaa/noaa_ghcn/years/embedded/AEM00041217/"
# for pkl_file in os.listdir(pkl_dir):
#     if pkl_file.endswith(".pkl"):  # Check if the file is a .pkl file
#         file_path = os.path.join(pkl_dir, pkl_file)  # Full path to the file
#         data = pd.read_pickle(file_path)
#         component_info, averages_df = calculate_components_dominance(data, pkl_file[:-4], save_Excel=True)
        
# csv_dir = '/home/noam.koren/multiTS/NFT/data/chorales/chorales_csvs'
# for csv_file in os.listdir(csv_dir):
#     if csv_file.endswith(".csv"):  # Check if the file is a .pkl file
#         file_path = os.path.join(csv_dir, csv_file)  # Full path to the file
#         data = pd.read_csv(file_path)
#         component_info, averages_df = calculate_components_dominance(data, csv_file[:-4], save_Excel=True)

0.6700328749294397 0.48294455439953865 0.5811326899255888
0.6502333031670686 0.4724909227162899 0.5791567405214452
0.5231599939667442 0.38765877440337126 0.5743842926106192
0.7342898078158848 0.5798748395248386 0.5587502367391758
0.6000845445516669 0.5141361008621911 0.5385688615819678
0.6151335306721881 0.5136674407678737 0.5449441896629792
0.5450501129949544 0.48267332843129074 0.5303470671434227
0.5508879870299195 0.5175580792723631 0.515597374920807
0.5566806945241101 0.5102677561264948 0.5217503190380536
0.5089783068591609 0.4905006394291781 0.509243650153213
0.5162333656083318 0.4972897729160535 0.5093454169777805
0.4929739447565967 0.500620748749596 0.4961519500642585
0.4847735186754468 0.4886229056813002 0.4980227033356953
0.4846313975190054 0.5014236933636711 0.49148511274880496
0.5308772509608652 0.5543197593064286 0.4891989619747527
0.4297391717667568 0.468692271242498 0.4783216071861479
0.5026610586240976 0.5433308498612615 0.48055922282608504
0.4777681166355363 0.534984973

In [22]:
pkl_path = '/home/noam.koren/multiTS/NFT/data/electricity/electricity_no_date.pkl'
d = pd.read_pickle(pkl_path)
column_indices_to_extract = [20, 44, 92, 64, 58, 71, 47, 13, 21, 22, 3, 5, 10, 16, 7, 15, 2, 69, 4, 6, 1, 29, 0, 8, 25, 55, 122, 125, 131, 99, 84, 9, 50, 113, 117, 118]
mini_elctricity = d.iloc[:, column_indices_to_extract]
mini_elctricity.to_pickle('/home/noam.koren/multiTS/NFT/data/electricity/mini_elctricity.pkl')




In [13]:
print(d.columns)
d['0', '1']

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '311', '312', '313', '314', '315', '316', '317', '318', '319', 'OT'],
      dtype='object', length=321)


KeyError: ('0', '1')

In [None]:
# import os
# import pandas as pd

# csv_dir = '/home/noam.koren/multiTS/NFT/data/chorales/chorales_csvs'

# data_names = []
# data_lengths = []

# for csv_file in os.listdir(csv_dir):
#     if csv_file.endswith(".csv"):  # Check if the file is a .csv file
#         file_path = os.path.join(csv_dir, csv_file)  # Full path to the file
#         data = pd.read_csv(file_path)
#         data_names.append(csv_file[:-4])  # Remove .csv extension
#         data_lengths.append(len(data))  # Get length of the data

# output_df = pd.DataFrame({
#     'Data Name': data_names,
#     'Data Length': data_lengths
# })

# output_excel_path = '/home/noam.koren/multiTS/NFT/data/chorales/chorales_data_summary.xlsx'
# output_df.to_excel(output_excel_path, index=False)

# output_excel_path


'/home/noam.koren/multiTS/NFT/data/chorales/chorales_data_summary.xlsx'