## Training - Test Sets

In [1]:
import gzip
import pandas as pd

# Load the four exported whole-set tables (per the file names: mean, median,
# min and max) into one DataFrame each. The paths are relative to the working
# directory and use Windows-style separators.
mean_df, median_df, min_df, max_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'CSV\Exports\datasets\whole_set\o01_final_mean_table.csv',
        r'CSV\Exports\datasets\whole_set\o02_final_median_table.csv',
        r'CSV\Exports\datasets\whole_set\o03_final_min_table.csv',
        r'CSV\Exports\datasets\whole_set\o04_final_max_table.csv',
    )
)

In [2]:
# Share of the data assigned to the training set, expressed as a fraction
# between 0 and 1 (not a percentage). Whatever is left over forms the test set.
training_percentage = 0.7

# Split Mean to Training - Test set 

In [3]:
# Defensive re-sort: the table is expected to be ordered already, but the split
# below relies on the rows being ordered by 'subject_id' (then 'Time_Zone').
mean_df = mean_df.sort_values(by=['subject_id', 'Time_Zone'])

# Distinct subject IDs in ascending order (the frame is sorted, so first
# occurrences are ascending). Rows with a missing 'subject_id' belong to
# neither set, which matches what the previous groupby-based loop did.
ordered_subject_ids = mean_df['subject_id'].dropna().drop_duplicates()

# Total number of unique subject IDs
unique_subject_ids = len(ordered_subject_ids)

# Number of subjects (not rows) that go to the training set; int() truncates
train_subject_ids_count = int(training_percentage * unique_subject_ids)

# The first `train_subject_ids_count` subjects form the training set and the
# remaining ones the test set, so a subject's rows never straddle both sets.
# max(..., 0) keeps a negative count from slicing from the end.
train_subject_ids = ordered_subject_ids.iloc[:max(train_subject_ids_count, 0)].to_numpy()
in_training = mean_df['subject_id'].isin(train_subject_ids)
has_subject_id = mean_df['subject_id'].notna()

# Select rows with boolean masks instead of growing the DataFrames with
# pd.concat inside a loop (which re-copied the accumulated rows for every
# subject). Row order and dtypes of mean_df are preserved.
train_df = mean_df[in_training].reset_index(drop=True)
test_df = mean_df[has_subject_id & ~in_training].reset_index(drop=True)

# Export the training and test sets to CSV files
train_df.to_csv(r'CSV\Exports\datasets\Train_test_sets\o01_mean_train_.csv', index=False)
test_df.to_csv(r'CSV\Exports\datasets\Train_test_sets\o01_mean_test_.csv', index=False)

# Display the 'row_count' value of the last row of the training set
if train_df.empty:
    # Nothing to report; avoids an IndexError when no subject went to training
    display("The training set is empty -> there is no last row to report")
else:
    display("The last row of the training set is -> " + str(train_df["row_count"].iloc[-1]))

# Delete dataframes and variables to free memory
del (ordered_subject_ids, unique_subject_ids, train_subject_ids_count,
     train_subject_ids, in_training, has_subject_id, train_df, test_df)

'The last row of the training set is -> 40688'

# Split Median to Training - Test set 

In [4]:
# Defensive re-sort: the table is expected to be ordered already, but the split
# below relies on the rows being ordered by 'subject_id' (then 'Time_Zone').
median_df = median_df.sort_values(by=['subject_id', 'Time_Zone'])

# Distinct subject IDs in ascending order (the frame is sorted, so first
# occurrences are ascending). Rows with a missing 'subject_id' belong to
# neither set, which matches what the previous groupby-based loop did.
ordered_subject_ids = median_df['subject_id'].dropna().drop_duplicates()

# Total number of unique subject IDs
unique_subject_ids = len(ordered_subject_ids)

# Number of subjects (not rows) that go to the training set; int() truncates
train_subject_ids_count = int(training_percentage * unique_subject_ids)

# The first `train_subject_ids_count` subjects form the training set and the
# remaining ones the test set, so a subject's rows never straddle both sets.
# max(..., 0) keeps a negative count from slicing from the end.
train_subject_ids = ordered_subject_ids.iloc[:max(train_subject_ids_count, 0)].to_numpy()
in_training = median_df['subject_id'].isin(train_subject_ids)
has_subject_id = median_df['subject_id'].notna()

# Select rows with boolean masks instead of growing the DataFrames with
# pd.concat inside a loop (which re-copied the accumulated rows for every
# subject). Row order and dtypes of median_df are preserved.
train_df = median_df[in_training].reset_index(drop=True)
test_df = median_df[has_subject_id & ~in_training].reset_index(drop=True)

# Export the training and test sets to CSV files
train_df.to_csv(r'CSV\Exports\datasets\Train_test_sets\o02_median_train_.csv', index=False)
test_df.to_csv(r'CSV\Exports\datasets\Train_test_sets\o02_median_test_.csv', index=False)

# This number is used as the split point in the RapidMiner filter operator
if train_df.empty:
    # Nothing to report; avoids an IndexError when no subject went to training
    display("The training set is empty -> there is no last row to report")
else:
    display("The last row of the training set is -> " + str(train_df["row_count"].iloc[-1]))

# Delete dataframes and variables to free memory.
del (ordered_subject_ids, unique_subject_ids, train_subject_ids_count,
     train_subject_ids, in_training, has_subject_id, train_df, test_df)

'The last row of the training set is -> 40688'

# Split Min to Training - Test set 

In [5]:
# Defensive re-sort: the table is expected to be ordered already, but the split
# below relies on the rows being ordered by 'subject_id' (then 'Time_Zone').
min_df = min_df.sort_values(by=['subject_id', 'Time_Zone'])

# Distinct subject IDs in ascending order (the frame is sorted, so first
# occurrences are ascending). Rows with a missing 'subject_id' belong to
# neither set, which matches what the previous groupby-based loop did.
ordered_subject_ids = min_df['subject_id'].dropna().drop_duplicates()

# Total number of unique subject IDs
unique_subject_ids = len(ordered_subject_ids)

# Number of subjects (not rows) that go to the training set; int() truncates
train_subject_ids_count = int(training_percentage * unique_subject_ids)

# The first `train_subject_ids_count` subjects form the training set and the
# remaining ones the test set, so a subject's rows never straddle both sets.
# max(..., 0) keeps a negative count from slicing from the end.
train_subject_ids = ordered_subject_ids.iloc[:max(train_subject_ids_count, 0)].to_numpy()
in_training = min_df['subject_id'].isin(train_subject_ids)
has_subject_id = min_df['subject_id'].notna()

# Select rows with boolean masks instead of growing the DataFrames with
# pd.concat inside a loop (which re-copied the accumulated rows for every
# subject). Row order and dtypes of min_df are preserved.
train_df = min_df[in_training].reset_index(drop=True)
test_df = min_df[has_subject_id & ~in_training].reset_index(drop=True)

# Export the training and test sets to CSV files
train_df.to_csv(r'CSV\Exports\datasets\Train_test_sets\o03_min_train_.csv', index=False)
test_df.to_csv(r'CSV\Exports\datasets\Train_test_sets\o03_min_test_.csv', index=False)

# This number is used as the split point in the RapidMiner filter operator
if train_df.empty:
    # Nothing to report; avoids an IndexError when no subject went to training
    display("The training set is empty -> there is no last row to report")
else:
    display("The last row of the training set is -> " + str(train_df["row_count"].iloc[-1]))

# Delete dataframes and variables to free memory.
del (ordered_subject_ids, unique_subject_ids, train_subject_ids_count,
     train_subject_ids, in_training, has_subject_id, train_df, test_df)

'The last row of the training set is -> 40688'

# Split Max to Training - Test set 

In [6]:
# Defensive re-sort: the table is expected to be ordered already, but the split
# below relies on the rows being ordered by 'subject_id' (then 'Time_Zone').
max_df = max_df.sort_values(by=['subject_id', 'Time_Zone'])

# Distinct subject IDs in ascending order (the frame is sorted, so first
# occurrences are ascending). Rows with a missing 'subject_id' belong to
# neither set, which matches what the previous groupby-based loop did.
ordered_subject_ids = max_df['subject_id'].dropna().drop_duplicates()

# Total number of unique subject IDs
unique_subject_ids = len(ordered_subject_ids)

# Number of subjects (not rows) that go to the training set; int() truncates
train_subject_ids_count = int(training_percentage * unique_subject_ids)

# The first `train_subject_ids_count` subjects form the training set and the
# remaining ones the test set, so a subject's rows never straddle both sets.
# max(..., 0) keeps a negative count from slicing from the end.
train_subject_ids = ordered_subject_ids.iloc[:max(train_subject_ids_count, 0)].to_numpy()
in_training = max_df['subject_id'].isin(train_subject_ids)
has_subject_id = max_df['subject_id'].notna()

# Select rows with boolean masks instead of growing the DataFrames with
# pd.concat inside a loop (which re-copied the accumulated rows for every
# subject). Row order and dtypes of max_df are preserved.
train_df = max_df[in_training].reset_index(drop=True)
test_df = max_df[has_subject_id & ~in_training].reset_index(drop=True)

# Export the training and test sets to CSV files
train_df.to_csv(r'CSV\Exports\datasets\Train_test_sets\o04_max_train_.csv', index=False)
test_df.to_csv(r'CSV\Exports\datasets\Train_test_sets\o04_max_test_.csv', index=False)

# This number is used as the split point in the RapidMiner filter operator
if train_df.empty:
    # Nothing to report; avoids an IndexError when no subject went to training
    display("The training set is empty -> there is no last row to report")
else:
    display("The last row of the training set is -> " + str(train_df["row_count"].iloc[-1]))

# Delete dataframes and variables to free memory.
del (ordered_subject_ids, unique_subject_ids, train_subject_ids_count,
     train_subject_ids, in_training, has_subject_id, train_df, test_df)

'The last row of the training set is -> 40688'