In [2]:
import pandas as pd
import os
from IPython.display import display

# Folder that holds the sensor CSV exports (adjust if needed)
folder_path = '.'


def _names_containing(names, keyword):
    """Return the entries of *names* whose lower-cased form contains *keyword*."""
    return [name for name in names if keyword in name.lower()]


# Every directory entry whose name ends with '.csv' (the suffix test is case-sensitive)
file_list = list(filter(lambda name: name.endswith('.csv'), os.listdir(folder_path)))

# Group membership is a case-insensitive substring match on the file name;
# a name containing both keywords ends up in both groups.
baseline_files = _names_containing(file_list, 'baseline')
frequency_files = _names_containing(file_list, 'frequency')

# Sensor IDs that the summaries in this script are restricted to
sensor_ids = [
    'power', '12', '11', '28', '26', '27', '29',
    'tvoc', 'pm2p5', 'pm10p0', 'dewpoint',
]

# Function to process each file and compute mean & std values
def process_file(file_path):
    """Summarise the selected sensors of one CSV file as "mean (std)" strings.

    The file must provide 'sensor_id', 'value' and 'time' columns. Rows whose
    'time' cannot be parsed as a datetime are discarded before aggregating.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A DataFrame with the columns 'sensor_id' and 'mean (std)', one row per
        selected sensor found in the file (both figures rounded to two
        decimals), or None when a required column is missing.
    """
    # Read sensor IDs as text: `sensor_ids` holds strings, so a column that
    # pandas inferred as integers (e.g. a file with numeric IDs only) would
    # match nothing in the `isin` filter below.
    data = pd.read_csv(file_path, dtype={'sensor_id': str})

    # Ensure required columns exist
    if not {'sensor_id', 'value', 'time'}.issubset(data.columns):
        print(f"Error: Required columns missing in {file_path}")
        return None

    # Convert 'time' to datetime; unparsable entries become NaT and are dropped
    data['time'] = pd.to_datetime(data['time'], errors='coerce')
    data = data.dropna(subset=['time'])

    # Compute mean and std for the selected sensors
    selected = data[data['sensor_id'].isin(sensor_ids)]
    stats = selected.groupby('sensor_id')['value'].agg(['mean', 'std']).round(2)

    # Build the label column from the two columns directly rather than with a
    # row-wise apply, so an empty result still yields a plain (empty) column.
    stats['mean (std)'] = [
        f"{mean} ({std})" for mean, std in zip(stats['mean'], stats['std'])
    ]
    return stats[['mean (std)']].reset_index()

# Summarise every CSV file, tagging each per-file table with its source name
all_means = []
for file_name in file_list:
    file_path = os.path.join(folder_path, file_name)
    mean_values = process_file(file_path)
    if mean_values is None:
        # process_file produced no table for this file; skip it
        continue
    mean_values['file_name'] = file_name
    all_means.append(mean_values)

# Stack the per-file tables and split the rows by group. Nothing is defined
# here when no file produced a table.
if all_means:
    all_means_df = pd.concat(all_means, ignore_index=True)
    baseline_df = all_means_df.loc[all_means_df['file_name'].isin(baseline_files)]
    frequency_df = all_means_df.loc[all_means_df['file_name'].isin(frequency_files)]

# Load and combine all sensor data
def load_combined_data(files):
    """Load the given CSV files from `folder_path` and stack them into one frame.

    Args:
        files: File names relative to `folder_path`; names that do not exist
            on disk are skipped.

    Returns:
        One DataFrame holding the rows of all loaded files under a fresh
        0..n-1 index, or an empty DataFrame when no file was loaded.
    """
    paths = [os.path.join(folder_path, name) for name in files]
    # Read sensor IDs as text in every file. Left to type inference, a file
    # with numeric IDs only would contribute integers while other files
    # contribute strings, and filtering the combined frame against string IDs
    # would silently miss the integer rows.
    data_frames = [
        pd.read_csv(path, dtype={'sensor_id': str})
        for path in paths
        if os.path.exists(path)
    ]
    if not data_frames:
        return pd.DataFrame()
    return pd.concat(data_frames, ignore_index=True)

def _overall_stats(combined):
    """Return per-sensor mean/std (2 decimals) plus a 'mean (std)' label column."""
    selected = combined[combined['sensor_id'].isin(sensor_ids)]
    summary = selected.groupby('sensor_id')['value'].agg(['mean', 'std']).round(2)
    summary['mean (std)'] = summary.apply(
        lambda row: f"{row['mean']} ({row['std']})", axis=1
    )
    return summary


all_baseline_combined = load_combined_data(baseline_files)
all_frequency_combined = load_combined_data(frequency_files)

# Overall mean and std per group, pooled over all files of that group;
# a group with no loaded data is skipped.
if not all_baseline_combined.empty:
    overall_baseline_stats = _overall_stats(all_baseline_combined)
    print("\nOverall Baseline Stats:")
    display(overall_baseline_stats[['mean (std)']])

if not all_frequency_combined.empty:
    overall_frequency_stats = _overall_stats(all_frequency_combined)
    print("\nOverall Frequency Stats:")
    display(overall_frequency_stats[['mean (std)']])

# Extract sensor data recorded after 50 minutes
def extract_after_50_minutes(files):
    """Collect selected-sensor rows recorded 50 or more minutes into each file.

    For every file the cut-off is its earliest valid timestamp (taken over all
    rows, not only the selected sensors) plus 50 minutes; rows at or after the
    cut-off are kept. Files lacking the 'sensor_id', 'value' or 'time' column,
    and files without any parsable timestamp, contribute nothing.

    Args:
        files: CSV file names relative to `folder_path`.

    Returns:
        A DataFrame of the kept rows from all files, each tagged with a
        'file_name' column, or an empty DataFrame when no rows were kept.
    """
    filtered_entries = []
    for file_name in files:
        file_path = os.path.join(folder_path, file_name)
        # Sensor IDs are read as text so they compare equal to the string
        # entries of `sensor_ids` even when a file holds numeric IDs only.
        data = pd.read_csv(file_path, dtype={'sensor_id': str})
        if not {'sensor_id', 'value', 'time'}.issubset(data.columns):
            continue

        # Convert time column to datetime; unparsable entries are dropped
        data['time'] = pd.to_datetime(data['time'], errors='coerce')
        data = data.dropna(subset=['time'])
        if data.empty:
            continue

        # Threshold is the first timestamp in the file plus 50 minutes
        threshold_time = data['time'].min() + pd.Timedelta(minutes=50)
        keep = data['sensor_id'].isin(sensor_ids) & (data['time'] >= threshold_time)
        data_filtered = data[keep].copy()
        data_filtered['file_name'] = file_name
        filtered_entries.append(data_filtered)

    if not filtered_entries:
        return pd.DataFrame()
    return pd.concat(filtered_entries, ignore_index=True)

# Gather, per group, the selected-sensor rows from 50 minutes into each file onward
baseline_after_50_df = extract_after_50_minutes(files=baseline_files)
frequency_after_50_df = extract_after_50_minutes(files=frequency_files)

# Compute the mean and std of the values recorded from 50 minutes onward
def compute_last_stats(df, group_name):
    """Print and display per-sensor "mean (std)" figures for *df*.

    Values are grouped by 'sensor_id'; mean and standard deviation of 'value'
    are rounded to two decimals and shown as one "mean (std)" column under a
    heading that names *group_name*. An empty *df* produces no output.
    """
    if df.empty:
        return

    values_by_sensor = df.groupby('sensor_id')['value']
    summary = values_by_sensor.agg(['mean', 'std']).round(2)
    summary['mean (std)'] = summary.apply(
        lambda row: f"{row['mean']} ({row['std']})", axis=1
    )
    print(f"\nMean and Standard Deviation of After 50 Minutes Data for {group_name}:")
    display(summary[['mean (std)']])

# Report the from-50-minutes-onward statistics for each group in turn
compute_last_stats(df=baseline_after_50_df, group_name="Baseline")
compute_last_stats(df=frequency_after_50_df, group_name="Frequency")



Overall Baseline Stats:


Unnamed: 0_level_0,mean (std)
sensor_id,Unnamed: 1_level_1
11,25.64 (0.99)
12,24.93 (0.52)
26,21.08 (0.19)
27,35.48 (7.06)
28,1019.66 (3.74)
29,685.67 (95.47)
dewpoint,5.62 (3.01)
pm10p0,0.86 (0.45)
pm2p5,0.73 (0.36)
power,1106.6 (17.28)



Overall Frequency Stats:


Unnamed: 0_level_0,mean (std)
sensor_id,Unnamed: 1_level_1
11,16.87 (3.36)
12,16.36 (3.69)
26,21.1 (0.2)
27,35.76 (7.03)
28,1019.59 (3.47)
29,781.25 (174.52)
dewpoint,5.44 (3.12)
pm10p0,0.83 (0.39)
pm2p5,0.74 (0.39)
power,450.91 (255.7)



Mean and Standard Deviation of After 50 Minutes Data for Baseline:


Unnamed: 0_level_0,mean (std)
sensor_id,Unnamed: 1_level_1
11,25.68 (0.87)
12,24.83 (0.4)
26,21.11 (0.09)
27,36.1 (7.17)
28,1019.32 (3.58)
29,772.68 (34.87)
dewpoint,5.62 (3.01)
pm10p0,0.86 (0.45)
pm2p5,0.73 (0.36)
power,1106.45 (12.96)



Mean and Standard Deviation of After 50 Minutes Data for Frequency:


Unnamed: 0_level_0,mean (std)
sensor_id,Unnamed: 1_level_1
11,20.28 (2.45)
12,19.96 (2.93)
26,21.11 (0.13)
27,36.84 (7.0)
28,1019.56 (3.59)
29,983.24 (64.43)
dewpoint,5.44 (3.12)
pm10p0,0.83 (0.39)
pm2p5,0.74 (0.39)
power,685.17 (263.73)
