## Input: the folder of already-labeled CSV files. Output: class-balanced windows for LC and FR with their corresponding labels.

In [1]:
import pandas as pd
import os
import random

# Folders holding the labeled CSV files of each dataset split
folder_path_training = "training"
folder_path_validation = "validation"
folder_path_testing = "testing"


def _list_csv_files(folder_path):
    """Return the sorted names of the CSV files directly inside `folder_path`.

    `os.listdir` returns every directory entry, so sub-directories and
    non-CSV files are filtered out here; otherwise they would later be handed
    to `pd.read_csv` and abort the run.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        Alphabetically sorted list of file names (not full paths).

    Raises:
        FileNotFoundError: If `folder_path` does not exist.
    """
    return sorted(
        entry
        for entry in os.listdir(folder_path)
        if entry.lower().endswith(".csv")
        and os.path.isfile(os.path.join(folder_path, entry))
    )


files_training = _list_csv_files(folder_path_training)
files_validation = _list_csv_files(folder_path_validation)
files_testing = _list_csv_files(folder_path_testing)

In [2]:
# Number of consecutive rows that make up one sliding window.
# Per the note below, a window of N rows spans N/10 seconds
# (presumably the data is sampled at 10 Hz - TODO confirm).
window_size = 50 # N/10 seconds

In [None]:
def _build_split_windows(folder_path, file_names, window_size, fr_label=4.1):
    """Build the LC and FR sliding windows for every CSV file of one split.

    For each file the 'fileTime' column is dropped and NaN values are replaced
    with -1. A window is `window_size` consecutive rows (without the 'labels'
    column) and is classified by the label of its LAST row:

    * LC window: the last label is numeric and below `fr_label`; that label is
      stored as the window's target.
    * FR window: the last label equals `fr_label` and no row of the window
      carries the unlabeled marker "-".

    Per file, the FR windows are randomly down-sampled to the number of LC
    windows found in that same file. If a file has fewer FR than LC windows,
    a warning is printed and all FR windows are kept.

    Args:
        folder_path: Directory containing the CSV files.
        file_names: Names of the files (relative to `folder_path`) to process.
        window_size: Number of rows per window.
        fr_label: Numeric label identifying FR rows.

    Returns:
        Tuple `(x_lc, y_lc, x_fr, y_fr)` of lists accumulated over all files:
        `x_*` hold one 2-D array per window, `y_*` hold one float per window.

    Raises:
        KeyError: If a file lacks the 'fileTime' or 'labels' column.
        ValueError: If the last label of a window is neither "-" nor numeric.
    """
    all_x_lc, all_y_lc, all_x_fr, all_y_fr = [], [], [], []

    for file_name in file_names:
        print(f"Processing file: {file_name}")
        df = pd.read_csv(os.path.join(folder_path, file_name))
        df = df.drop(columns=['fileTime'])

        # NOTE(review): this also fills missing values in 'labels', so a row
        # with an empty label becomes -1 and ends up as an LC window with
        # target -1.0. Kept as-is; confirm this is intended.
        df = df.fillna(-1)

        # Split labels and features ONCE per file instead of once per window.
        raw_labels = df['labels'].tolist()
        features = df.drop(columns=['labels']).values

        x_lc, y_lc, x_fr = [], [], []
        last_unlabeled = -1  # index of the most recent "-" row seen so far

        # `end` is the index of the window's last row.
        for end, raw in enumerate(raw_labels):
            if raw == "-":
                last_unlabeled = end
                continue  # a window ending on an unlabeled row is never used

            start = end - window_size + 1
            if start < 0:
                continue  # not enough history for a full window yet

            # Compare numerically: pandas yields strings ("4.1") when the
            # column contains "-" but floats when it does not, so a string
            # comparison would silently drop every FR window of such a file.
            label = float(raw)
            is_fr = abs(label - fr_label) <= 1e-9

            if is_fr:
                # FR windows must be fully labeled: no "-" inside the window.
                if last_unlabeled < start:
                    x_fr.append(features[start:end + 1].copy())
            elif label < fr_label:
                # copy() keeps every stored window an independent array.
                x_lc.append(features[start:end + 1].copy())
                y_lc.append(label)

        all_x_lc.extend(x_lc)
        all_y_lc.extend(y_lc)

        num_x_lc = len(x_lc)
        print(f"Total number of x_lc windows: {num_x_lc}")

        # Balance the classes per file: keep as many FR windows as LC windows.
        # NOTE: no random seed is set here, so the selection differs between
        # runs unless the caller seeds `random` beforehand.
        if len(x_fr) >= num_x_lc:
            x_fr_balanced = random.sample(x_fr, num_x_lc)
        else:
            print(f"Warning: Not enough x_fr windows to balance classes. Found only {len(x_fr)} windows.")
            x_fr_balanced = x_fr

        all_x_fr.extend(x_fr_balanced)
        all_y_fr.extend([fr_label] * len(x_fr_balanced))

    return all_x_lc, all_y_lc, all_x_fr, all_y_fr


# Training split
(all_x_lc_training, all_y_lc_training,
 all_x_fr_training, all_y_fr_training) = _build_split_windows(
    folder_path_training, files_training, window_size)

# Validation split
(all_x_lc_validation, all_y_lc_validation,
 all_x_fr_validation, all_y_fr_validation) = _build_split_windows(
    folder_path_validation, files_validation, window_size)

# Testing split
(all_x_lc_testing, all_y_lc_testing,
 all_x_fr_testing, all_y_fr_testing) = _build_split_windows(
    folder_path_testing, files_testing, window_size)

Processing file: user11LC.csv
Total number of x_lc windows: 1541
Processing file: user14LC.csv
Total number of x_lc windows: 2440
Processing file: user15LC.csv
Total number of x_lc windows: 2091
Processing file: user17LC.csv
Total number of x_lc windows: 3000
Processing file: user1LC.csv
Total number of x_lc windows: 2225
Processing file: user20LC.csv
Total number of x_lc windows: 2672
Processing file: user21LC.csv
Total number of x_lc windows: 2355
Processing file: user22LC.csv
Total number of x_lc windows: 2635
Processing file: user23LC.csv
Total number of x_lc windows: 2094
Processing file: user24LC.csv
Total number of x_lc windows: 2094
Processing file: user26LC.csv
Total number of x_lc windows: 1072
Processing file: user28LC.csv
Total number of x_lc windows: 2435
Processing file: user29LC.csv
Total number of x_lc windows: 2863
Processing file: user30LC.csv
Total number of x_lc windows: 2412
Processing file: user32LC.csv
Total number of x_lc windows: 2810
Processing file: user33LC.

In [4]:
# Persist every window/label list as its own pickle file inside `folder_data`
import pickle

folder_data = "data"
if not os.path.exists(folder_data):
    os.makedirs(folder_data)

# File stem -> object to store; insertion order is the order of the writes.
arrays_to_save = {
    'x_lc_training': all_x_lc_training,
    'y_lc_training': all_y_lc_training,
    'x_fr_training': all_x_fr_training,
    'y_fr_training': all_y_fr_training,
    'x_lc_validation': all_x_lc_validation,
    'y_lc_validation': all_y_lc_validation,
    'x_fr_validation': all_x_fr_validation,
    'y_fr_validation': all_y_fr_validation,
    'x_lc_testing': all_x_lc_testing,
    'y_lc_testing': all_y_lc_testing,
    'x_fr_testing': all_x_fr_testing,
    'y_fr_testing': all_y_fr_testing,
}

# Each file is named <stem><window_size>.pkl (x_lc_training50.pkl when
# window_size is 50), so runs with different window sizes do not overwrite
# each other.
for pkl_stem, pkl_payload in arrays_to_save.items():
    with open(f"{folder_data}/{pkl_stem}{window_size}.pkl", 'wb') as f:
        pickle.dump(pkl_payload, f)