In [None]:
import requests
import zipfile
import io
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from datetime import datetime

In [None]:
# Read the raw AC production workbook from the relative data directory.
production_df = pd.read_excel(
    io="../data/raw/Gyongoshalasz_09-10_ AC_Production.xlsx",
)

In [None]:
# Preview the first rows of the frame exactly as read from the workbook.
production_df.head()

In [None]:
# The sheet is read as a single column whose header and values are
# semicolon-delimited strings, so both are split apart by hand here.
current_col = production_df.columns[0]
header_parts = current_col.split(';')

# Break every "<timestamp>;<power>" string into separate positional columns
split_data = production_df.iloc[:, 0].str.split(';', expand=True)

# Build the cleaned frame in one pass:
#   1. keep the first two fields and give them meaningful names,
#   2. drop the first row, which holds format info [YYYY-MM-DD hh:mm];[kW],
#   3. parse timestamps and power readings into proper dtypes.
production_df_fixed = (
    split_data[[0, 1]]
    .rename(columns={0: 'datetime', 1: 'power_kw'})
    .iloc[1:]
    .reset_index(drop=True)
    .assign(
        datetime=lambda frame: pd.to_datetime(frame['datetime']),
        power_kw=lambda frame: pd.to_numeric(frame['power_kw']),
    )
)

print("Fixed DataFrame:")
production_df_fixed.head()


In [None]:
# Derive a megawatt column from the kilowatt readings (1 MW = 1000 kW).
production_df_fixed["power_mw"] = production_df_fixed["power_kw"].div(1000)

In [None]:
# Display the cleaned frame via the notebook's rich repr of the bare expression.
production_df_fixed

In [None]:
# Check the cleaned series for completeness: NaN values per column, gaps on
# the expected regular time grid, and duplicated timestamps.
# (pandas is already imported in the notebook's first cell.)

# Index by timestamp, sorted, so the index can be compared against a regular grid
df_check = production_df_fixed.set_index('datetime').sort_index()

# Check for missing values
print("Missing values per column:")
print(df_check.isnull().sum())
print()

# Check for gaps in timeseries
print("Timeseries analysis:")
print(f"Date range: {df_check.index.min()} to {df_check.index.max()}")
print(f"Total records: {len(df_check)}")
print()

# Expected sampling interval (assumed to be 15 minutes for this dataset).
# 'min' is used instead of the 'T' alias, which is deprecated in pandas 2.2+
# and emits a FutureWarning there.
expected_freq = '15min'
expected_range = pd.date_range(start=df_check.index.min(), end=df_check.index.max(), freq=expected_freq)

# Grid timestamps that are absent from the data. The missing count is taken
# from this set rather than from a difference of lengths: duplicate or
# off-grid timestamps in the data would otherwise cancel out real gaps (or
# even make the reported number negative).
missing_times = expected_range.difference(df_check.index)

print(f"Expected records with 15min intervals: {len(expected_range)}")
print(f"Actual records: {len(df_check)}")
print(f"Missing records: {len(missing_times)}")
print()

# Find missing timestamps
if len(missing_times) > 0:
    print(f"Missing timestamps ({len(missing_times)}):")
    print(missing_times[:10])  # Show first 10 missing
    if len(missing_times) > 10:
        print(f"... and {len(missing_times) - 10} more")
else:
    print("No missing timestamps found!")

# Check for duplicate timestamps (every occurrence after the first is flagged)
duplicates = df_check.index.duplicated()
if duplicates.any():
    print(f"\nDuplicate timestamps found: {duplicates.sum()}")
    print(df_check.index[duplicates])
else:
    print("\nNo duplicate timestamps found.")
