## Import necessary libraries

In [3]:
import pandas as pd
import numpy as np
import os
import glob

## Eagle-I data processing

In [None]:

from datetime import timedelta

# Directory that is scanned for the input CSV files (one file per call to process_file)
input_directory = r"F:\onedrive\OneDrive - University of Central Florida\combied_eaglei"
# Directory used to build the path of each "<county>_daily_outage.csv" output file
output_directory = r"F:\onedrive\OneDrive - University of Central Florida\compound_paper_data\SAIDI_raw"
# Collects one record per processed file; process_file appends to this list
customer_track = []
# Ensure the output directory exists. exist_ok=True replaces the separate
# exists() check, so there is no window between checking and creating.
os.makedirs(output_directory, exist_ok=True)

# Function to process each file
def process_file(filepath, *, save=False):
    """Turn one CSV of timestamped readings into daily event totals.

    Steps performed:

    1. Read the CSV and drop every row that has a missing value.
    2. Keep only rows whose ``sum`` is strictly above the file's 85th
       percentile of ``sum``.
    3. Group the remaining rows into events: consecutive rows (ordered by
       ``run_start_time``) belong to the same event while they are at most
       60 minutes apart.
    4. For each event compute its start, end, peak ``sum`` (``total_max``),
       duration in hours and ``cust_hour = total_max * duration``; events
       lasting one hour or less are discarded.
    5. Sum the surviving events per calendar day of their start time.

    The 85th-percentile threshold is also appended to the module-level
    ``customer_track`` list together with the county name.

    Args:
        filepath: Path of a CSV file containing at least the columns
            ``run_start_time``, ``sum``, ``county`` and ``Total Customers``.
            (Presumably one county per file -- the county and customer count
            are taken from a single row; confirm against the data.)
        save: When true, write the daily table to
            ``<output_directory>/<county>_daily_outage.csv``. Defaults to
            false, which matches the previous behaviour where the write was
            commented out.

    Returns:
        A DataFrame indexed by day (``Time``) with the columns ``total_max``,
        ``total_cust``, ``duration`` and ``cust_hour``. The frame is empty
        (and nothing is recorded in ``customer_track``) when the file has no
        complete rows or no row exceeds the threshold.
    """
    daily_columns = ['total_max', 'total_cust', 'duration', 'cust_hour']

    df = pd.read_csv(filepath, parse_dates=['run_start_time'])
    data_clean = df.dropna(axis=0)
    if data_clean.empty:
        # Nothing usable in this file: no threshold, county or events exist.
        return pd.DataFrame(columns=daily_columns)

    # 85th percentile of 'sum'; doubles as the event threshold and as the
    # value recorded in customer_track.
    customer = data_clean['sum'].quantile(0.85)
    data = data_clean[data_clean['sum'] > customer].sort_values(by='run_start_time')
    if data.empty:
        # E.g. every 'sum' value is identical, so none is strictly above the
        # percentile. Previously this path crashed with a KeyError.
        return pd.DataFrame(columns=daily_columns)

    # A gap of more than 60 minutes to the previous row starts a new event.
    # The first row's diff is NaT, which compares False and so stays in event 0.
    starts_new_event = data['run_start_time'].diff() > pd.Timedelta(minutes=60)
    event_ids = starts_new_event.cumsum().to_numpy()

    events = data.groupby(event_ids).agg(
        start_time=('run_start_time', 'min'),
        end_time=('run_start_time', 'max'),
        total_max=('sum', 'max'),
        county=('county', 'first'),
        total_cust=('Total Customers', 'first'),
    )

    # Same source row as before: the first row of the chronologically last event.
    county = events['county'].iloc[-1]
    total_cust = events['total_cust'].iloc[-1]

    events['duration'] = (events['end_time'] - events['start_time']).dt.total_seconds() / 3600
    # .copy() so the new column below is added to an independent frame.
    events = events[events['duration'] > 1].copy()
    events['cust_hour'] = events['total_max'] * events['duration']

    # Daily totals keyed by event start; the non-numeric county column is
    # dropped by numeric_only=True.
    daily = (
        events.drop(columns=['end_time'])
        .rename(columns={'start_time': 'Time'})
        .set_index('Time')
        .resample('D')
        .sum(numeric_only=True)
    )

    daily['total_max'] = daily['total_max'].fillna(0)
    # The per-day sum of total_cust is meaningless; overwrite it with the
    # single customer count on every row.
    daily['total_cust'] = total_cust

    if save:
        output_filepath = os.path.join(output_directory, f"{county}_daily_outage.csv")
        daily.to_csv(output_filepath, index=True)

    # NOTE(review): 'anme' looks like a typo for 'name'; it is kept unchanged
    # because code outside this function may read the key as spelled.
    customer_track.append({'anme': county, 'customers': customer})
    return daily

# Run process_file on every "*.csv" file found directly inside input_directory
csv_pattern = os.path.join(input_directory, "*.csv")
for filepath in glob.glob(csv_pattern):
    process_file(filepath)
