In [68]:
import pandas as pd
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display
import datetime
import numpy as np
import seaborn as sns
from scipy.stats import pearsonr

def _extract_10min_data(df, field, day, start_hour, end_hour):
    """Average ``field`` into 10-minute buckets for one day's time window.

    Rows of ``df`` whose ``__datetime__`` lies between ``start_hour``:00 and
    ``end_hour``:00 UTC on ``day`` are kept. Both bounds are inclusive, so a
    sample stamped exactly at the end hour forms its own final bucket. Rows
    with a missing ``field`` value are discarded first so that an all-NaN
    bucket cannot leak a NaN mean into the correlation or the plot.

    Args:
        df: Frame holding a tz-aware (UTC) ``__datetime__`` column and ``field``.
        field: Name of the numeric column to average.
        day: ``datetime.date`` of the window.
        start_hour: First hour of the window (0-23).
        end_hour: Last hour of the window (0-23).

    Returns:
        DataFrame with columns ``time_interval`` (bucket start), ``value``
        (bucket mean), ``minute`` (minutes since the window start),
        ``time_label`` (``HH:MM``) and ``day`` (``YYYY-MM-DD``). Empty when
        no usable rows fall inside the window.
    """
    start_dt = pd.to_datetime(f"{day} {start_hour}:00:00").tz_localize('UTC')
    end_dt = pd.to_datetime(f"{day} {end_hour}:00:00").tz_localize('UTC')
    in_window = (df['__datetime__'] >= start_dt) & (df['__datetime__'] <= end_dt)

    window = df.loc[in_window, ['__datetime__', field]].dropna(subset=[field])
    window = window.assign(time_interval=window['__datetime__'].dt.floor('10min'))

    buckets = (
        window
        .groupby('time_interval')[field]
        .mean()
        .reset_index()
        .rename(columns={field: 'value'})
    )
    buckets['minute'] = (buckets['time_interval'] - start_dt).dt.total_seconds() / 60
    buckets['time_label'] = buckets['time_interval'].dt.strftime('%H:%M')
    buckets['day'] = day.strftime('%Y-%m-%d')
    return buckets


def _print_correlation(label, buckets):
    """Print the Pearson correlation between elapsed minutes and bucket means.

    Prints an explanatory message instead when fewer than two buckets exist or
    when all bucket means are identical (the coefficient is undefined then).
    """
    if len(buckets) < 2:
        print(f"{label} - Not enough data for correlation.")
        return
    if buckets['value'].nunique() < 2:
        print(f"{label} - Correlation undefined (values are constant).")
        return
    r, p = pearsonr(buckets['minute'], buckets['value'])
    print(f"{label} - Pearson correlation (10-min): r = {r:.3f}, p = {p:.3f}")


def compare_time_window(csv_file, date_col='created_at'):
    """Display a widget UI comparing a numeric field over the same hours on two days.

    The CSV is loaded once; ``date_col`` is parsed as UTC datetimes and rows
    whose timestamp cannot be parsed are dropped. The UI lets the user pick a
    base date, a start/end hour and a numeric column. Pressing "Compare" then,
    for the base date and the following day, averages the column into
    10-minute buckets, prints the Pearson correlation of value against elapsed
    minutes, and draws both days on one line plot.

    Args:
        csv_file: Path (or anything ``pandas.read_csv`` accepts) of the data.
        date_col: Name of the timestamp column in the CSV.

    Returns:
        None. Problems (unreadable file, no valid timestamps, no numeric
        columns) are reported with ``print`` and end the call early.
    """
    try:
        df = pd.read_csv(csv_file)
        df['__datetime__'] = pd.to_datetime(df[date_col], utc=True, errors='coerce')
        df = df.dropna(subset=['__datetime__'])
    except Exception as e:
        print(f"Error loading data: {e}")
        return

    # Without at least one valid timestamp there is no maximum date to seed
    # the date picker with, so stop here instead of failing while building it.
    if df.empty:
        print(f"No rows with a valid timestamp in '{date_col}'.")
        return

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        print("No numeric fields found.")
        return

    # Widgets
    base_date = widgets.DatePicker(description='Base date:', value=df['__datetime__'].max().date())
    start_hour = widgets.IntSlider(min=0, max=23, value=7, description='Start hour:')
    end_hour = widgets.IntSlider(min=0, max=23, value=8, description='End hour:')
    field_select = widgets.Dropdown(options=numeric_cols, description='Field to analyze:')
    compare_button = widgets.Button(description='Compare', button_style='info')
    output = widgets.Output()

    def on_compare(b):
        """Button callback: recompute both windows and redraw inside ``output``."""
        with output:
            output.clear_output()

            base_date_val = base_date.value
            start_hour_val = start_hour.value
            end_hour_val = end_hour.value
            selected_field = field_select.value

            # The picker yields None once the user clears it.
            if base_date_val is None:
                print("Please select a base date.")
                return

            if start_hour_val >= end_hour_val:
                print("Start hour must be before end hour.")
                return

            try:
                days = (base_date_val, base_date_val + datetime.timedelta(days=1))
                extracted = [
                    (
                        day.strftime('%Y-%m-%d'),
                        _extract_10min_data(df, selected_field, day, start_hour_val, end_hour_val),
                    )
                    for day in days
                ]

                plot_data_list = []
                for label, buckets in extracted:
                    if buckets.empty:
                        print(f"{label} - No data available in selected time window.")
                        continue
                    _print_correlation(label, buckets)
                    plot_data_list.append(buckets)

                if not plot_data_list:
                    print("No data available for plotting.")
                    return

                combined = pd.concat(plot_data_list, ignore_index=True)
                fig, ax = plt.subplots(figsize=(14, 6))
                sns.lineplot(
                    data=combined, x='minute', y='value', hue='day',
                    marker='o', linewidth=2, ax=ax,
                )

                # One tick every 10 minutes, labelled with the wall-clock time.
                total_minutes = (end_hour_val - start_hour_val) * 60
                x_ticks = np.arange(0, total_minutes + 1, 10)
                x_labels = []
                for tick in x_ticks:
                    hours, minutes = divmod(int(tick), 60)
                    x_labels.append(f"{start_hour_val + hours}:{minutes:02d}")

                ax.set_xticks(x_ticks)
                ax.set_xticklabels(x_labels, rotation=45)
                ax.set_xlabel(f'Time of Day ({start_hour_val}:00 – {end_hour_val}:00)')
                ax.set_ylabel(selected_field)
                ax.set_title(f'{selected_field} (10-min avg) vs Time')
                ax.grid(True, alpha=0.3)
                ax.legend(title='Date', bbox_to_anchor=(1.05, 1), loc='upper left')
                fig.tight_layout()
                plt.show()

            except Exception as e:
                print(f"Error during analysis: {e}")

    compare_button.on_click(on_compare)

    display(widgets.VBox([
        widgets.HTML("<h2>Compare time window</h2>"),
        widgets.HBox([base_date, start_hour, end_hour]),
        field_select,
        compare_button,
        output
    ]))


In [69]:
# Build and display the interactive comparison UI for this CSV export.
compare_time_window(csv_file='2881821.csv', date_col='created_at')

VBox(children=(HTML(value='<h2>Compare time window</h2>'), HBox(children=(DatePicker(value=datetime.date(2025,…