In [15]:
import numpy as np
import pandas as pd

REAL_DATA = '../Processed Data/Aruba_17/pre_processed_data.csv'
FAKE_DATA = '../Predictions/Aruba_17_completed_prediction_419.txt'

# Load the pre-processed real event log.
df = pd.read_csv(REAL_DATA)

# Split the combined 'Device_Status' value at '_' into separate device-id and
# device-status columns, then drop the combined column.
# NOTE(review): this assumes every value contains exactly one underscore;
# any other shape makes the two-column assignment fail — confirm against the data.
df[['Device ID', 'Device Status']] = df['Device_Status'].str.split('_', expand=True)
df = df.drop(columns='Device_Status')

# Split 'Activity' at its LAST underscore into the activity name and its
# status suffix. Rows whose activity is missing or has no underscore keep
# their original 'Activity' value and get None as 'Activity Status'.
# The object cast keeps the .str accessor usable even if the column was read
# as all-NaN floats.
activity = df['Activity'].astype(object)
has_status = activity.str.contains('_', regex=False, na=False)
parts = activity.where(has_status).str.rsplit('_', n=1)

df['Activity Status'] = None
df.loc[has_status, 'Activity Status'] = parts.str[1]
df.loc[has_status, 'Activity'] = parts.str[0]

# Keep only the columns of interest, in a fixed order.
df = df[['Date', 'Time', 'Device ID', 'Device Status', 'Activity', 'Activity Status']]

# Persist the re-shaped table for the analysis cells below.
df.to_csv('../Processed Data/Aruba_17/updated_processed_data.csv', index=False)


In [16]:

# From here on REAL_DATA refers to the re-shaped CSV (separate activity and
# status columns) rather than the original pre-processed file.
REAL_DATA = '../Processed Data/Aruba_17/updated_processed_data.csv'

# Load the generated and the real event tables as DataFrames.
fake_data_df = pd.read_csv(FAKE_DATA)
data_df = pd.read_csv(REAL_DATA)

def activity_counts(data, activity_column):
    """Return the halved number of occurrences of each value in one column.

    Args:
        data: DataFrame holding the events.
        activity_column: Positional (integer) index of the column to count.

    Returns:
        A Series indexed by the distinct column values, ordered by descending
        frequency, whose values are the occurrence counts divided by 2.

    NOTE(review): the division by 2 presumably accounts for each activity
    being logged twice (e.g. a begin and an end row) — confirm with the data.
    """
    selected = data.iloc[:, activity_column]
    return selected.value_counts() / 2
def activity_proportions(counts):
    """Normalize counts so that they sum to 1.

    Args:
        counts: Series (or array) of per-activity counts.

    Returns:
        The same container with every entry divided by the total of all
        entries.
    """
    total = counts.sum()
    return counts / total
def activity_entropy(proportions):
    """Compute the Shannon entropy, in bits, of a discrete distribution.

    Evaluates ``-sum(p * log2(p))`` over the given proportions.

    Args:
        proportions: Series, array or list of probabilities.

    Returns:
        The entropy as a NumPy float.
    """
    probs = np.asarray(proportions, dtype=float)
    # A zero probability contributes nothing to the entropy, but evaluating
    # 0 * log2(0) yields NaN; drop such entries (and NaNs) before the log.
    probs = probs[probs > 0]
    entropy = -np.sum(probs * np.log2(probs))
    return entropy

# Positional index of the 'Activity' column in both the real and generated frames.
ACTIVITY_COLUMN = 4

# Preview the activity labels of the generated data.
print(fake_data_df.iloc[:, ACTIVITY_COLUMN])

real_counts = activity_counts(data_df, ACTIVITY_COLUMN)
generated_counts = activity_counts(fake_data_df, ACTIVITY_COLUMN)

real_proportions = activity_proportions(real_counts)
generated_proportions = activity_proportions(generated_counts)

real_entropy = activity_entropy(real_proportions)
generated_entropy = activity_entropy(generated_proportions)

# Report every activity seen in either dataset: the real activities first (in
# their frequency order), followed by any that occur only in the generated
# data. An activity missing from one side is reported as 0 for that side.
all_activities = real_counts.index.union(generated_counts.index, sort=False)

comparison_data = [
    {"Metric": "Entropy", "Real Data": real_entropy, "Generated Data": generated_entropy}
]

comparison_data.extend(
    {
        "Metric": f"Count ({activity})",
        "Real Data": real_counts.get(activity, 0),
        "Generated Data": generated_counts.get(activity, 0),
    }
    for activity in all_activities
)

comparison_data.extend(
    {
        "Metric": f"Proportion ({activity})",
        "Real Data": real_proportions.get(activity, 0),
        "Generated Data": generated_proportions.get(activity, 0),
    }
    for activity in all_activities
)

# Build the table in one step; concatenating onto an empty frame would force
# object dtype on the numeric columns and is deprecated in recent pandas.
comparison_df = pd.DataFrame(
    comparison_data, columns=["Metric", "Real Data", "Generated Data"]
)

print(comparison_df)

0             Housekeeping
1               Leave_Home
2               Leave_Home
3               Leave_Home
4         Meal_Preparation
                ...       
432734          Leave_Home
432735          Leave_Home
432736          Leave_Home
432737          Leave_Home
432738          Leave_Home
Name: Activity, Length: 432739, dtype: object
                           Metric Real Data Generated Data
0                         Entropy  2.353971       2.002979
1                   Count (Relax)    2918.5        59522.0
2        Count (Meal_Preparation)    1605.5        86054.0
3              Count (Leave_Home)     431.0        28965.0
4              Count (Enter_Home)     431.0           46.0
5                Count (Sleeping)     401.0         5492.0
6                  Count (Eating)     257.0            8.0
7                    Count (Work)     171.0            0.0
8           Count (Bed_to_Toilet)     157.0            0.0
9             Count (Wash_Dishes)      65.0            0.0
10      