In [1]:
import pandas as pd
import multiprocessing
import os
import glob
import concurrent.futures
import pickle

In [2]:
def read_file(file_path):
    file_extension = os.path.splitext(file_path)[1][1:].lower()  # חותכים את ה- "." מהסיומת
    read_func = getattr(pd, f"read_{file_extension}")
    return read_func(file_path)
    
def save_file(df, file_path):
    file_extension = os.path.splitext(file_path)[1][1:].lower()  # חותכים את ה- "." מהסיומת
    save_func = getattr(df, f"to_{file_extension}")
    save_func(file_path)

In [3]:
df = read_file('./time_series.csv')
df.replace("not_a_number", pd.NA, inplace=True)
df = df.dropna()
df = df.dropna(subset=["value"])

In [4]:
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%d/%m/%Y %H:%M')

for day, group in df.groupby(df["timestamp"].dt.day):
    filename = f"data_day_{day:02d}.csv" 
    group.to_csv(filename, index=False)
    print(f"Saved {filename}")


Saved data_day_01.csv
Saved data_day_02.csv
Saved data_day_03.csv
Saved data_day_04.csv
Saved data_day_05.csv
Saved data_day_06.csv
Saved data_day_07.csv
Saved data_day_08.csv
Saved data_day_09.csv
Saved data_day_10.csv
Saved data_day_11.csv
Saved data_day_12.csv
Saved data_day_13.csv
Saved data_day_14.csv
Saved data_day_15.csv
Saved data_day_16.csv
Saved data_day_17.csv
Saved data_day_18.csv
Saved data_day_19.csv
Saved data_day_20.csv
Saved data_day_21.csv
Saved data_day_22.csv
Saved data_day_23.csv
Saved data_day_24.csv
Saved data_day_25.csv
Saved data_day_26.csv
Saved data_day_27.csv
Saved data_day_28.csv
Saved data_day_29.csv
Saved data_day_30.csv


In [5]:
import pandas as pd

def process_hourly_average(day):
    try:
        filename = f"data_day_{day:02d}.csv"
        df_day = read_file(filename)

        df_day["timestamp"] = pd.to_datetime(df_day["timestamp"],errors='coerce')

        df_day["hour"] = df_day["timestamp"].dt.strftime("%Y-%m-%d %H:00")  # פורמט YYYY-MM-DD HH:00

        hourly_avg = df_day.groupby("hour")["value"].mean().reset_index()

        all_hours = pd.date_range(start=f"2025-06-{day:02d} 00:00", periods=24, freq="H").strftime("%Y-%m-%d %H:00")
        all_hours_df = pd.DataFrame(all_hours, columns=["hour"])

        hourly_avg = pd.merge(all_hours_df, hourly_avg, on="hour", how="left")
        hourly_avg["value"].fillna(0, inplace=True)  

        avg_filename = f"hourly_avg_day_{day:02d}.csv"
        hourly_avg.to_csv(avg_filename, index=False)
        print(f"Processed hourly averages for {filename} -> {avg_filename}")
    except Exception as e:
        print(f"Error processing {day}: {e}")

       
# df = pd.read_csv("data.csv") 
df["timestamp"] = pd.to_datetime(df["timestamp"])
unique_days = df["timestamp"].dt.day.unique()
print(unique_days)
print(pickle.dumps(process_hourly_average))  

with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(process_hourly_average, unique_days, chunksize=1)

def load_csv(filename):
    return read_file(filename)

hourly_avg_files = [f"hourly_avg_day_{day:02d}.csv" for day in unique_days]
df_combined = pd.concat(map(load_csv, hourly_avg_files), ignore_index=True)

df_combined["hour"] = pd.to_datetime(df_combined["hour"], errors="coerce")
df_combined["hour_only"] = df_combined["hour"].dt.hour

df_grouped = df_combined.groupby("hour_only").agg({
    "hour":"min",
    "value": "mean" 
}).reset_index()

df_combined["hour"] = df_combined["hour"].dt.strftime("%Y-%m-%d %H:00")  
df_grouped = df_grouped.drop(columns=["hour_only"])

save_file(df_grouped, "final_hourly_averages.csv")
print("Merged all hourly averages into final_hourly_averages.csv")

[28  1 10 23  5 26  6  7 19 13  4 14 20 16 24 15  3 27  8 12 25 29 30 22
 18 21  2 11 17  9]
b"\x80\x04\x95'\x00\x00\x00\x00\x00\x00\x00\x8c\x08__main__\x94\x8c\x16process_hourly_average\x94\x93\x94."
Processed hourly averages for data_day_14.csv -> hourly_avg_day_14.csv
Processed hourly averages for data_day_20.csv -> hourly_avg_day_20.csv
Processed hourly averages for data_day_07.csv -> hourly_avg_day_07.csv
Processed hourly averages for data_day_13.csv -> hourly_avg_day_13.csv
Processed hourly averages for data_day_10.csv -> hourly_avg_day_10.csv
Processed hourly averages for data_day_19.csv -> hourly_avg_day_19.csv
Processed hourly averages for data_day_16.csv -> hourly_avg_day_16.csv
Processed hourly averages for data_day_05.csv -> hourly_avg_day_05.csv
Processed hourly averages for data_day_06.csv -> hourly_avg_day_06.csv
Processed hourly averages for data_day_26.csv -> hourly_avg_day_26.csv
Processed hourly averages for data_day_23.csv -> hourly_avg_day_23.csv
Processed hourly a