In [None]:
import csv
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import textwrap

In [None]:
# Locate the repository root: if the working directory is the notebooks/
# folder, the root is its parent; otherwise the working directory is used.
ROOT = Path.cwd()
ROOT = ROOT.parent if ROOT.name == "notebooks" else ROOT

# Data folder directly under the root (created when it does not exist yet)
DATA_DIR = ROOT / "data"
DATA_DIR.mkdir(exist_ok=True)

In [None]:
def analyze_csv_ranges(file_path, delim=",", ts_col="ts"):
    """
    Load a delimited text file and print a per-column summary.

    Prints the frame's ``describe()`` table and the overall row/column counts,
    then for every column: min / max / mean (numeric columns only), a preview
    of the first and last three entries, total / unique / missing counts, the
    row position of the first non-missing value, and the fraction of missing
    values.

    Parameters
    ----------
    file_path : str or path-like
        File to read with ``pandas.read_csv``.
    delim : str, default ","
        Field delimiter passed to ``pandas.read_csv``.
    ts_col : str, default "ts"
        Column whose value is printed next to the first non-missing row of
        each column. When the file has no such column only the row number is
        printed.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when the file could not be read (the
        error is printed rather than raised).
    """
    # Reading is best-effort: report the problem and hand back None
    try:
        df = pd.read_csv(file_path, delimiter=delim)
    except Exception as e:
        print(f"Error reading file: {e}")
        return None

    print(df.describe())
    print("-" * 80)
    print(f"File: {file_path}")
    print(f"Total rows: {len(df)}")
    print(f"Total columns: {len(df.columns)}")
    print("-" * 80)

    # Looked up once; files without this column still get a full report
    has_ts = ts_col in df.columns

    for i, column in enumerate(df.columns):
        print("-" * 80)
        print(f"Column {i}: {column}")
        print("-" * 80)
        col_data = df[column]
        total_count = len(col_data)
        missing_count = col_data.isna().sum()

        # Position of the first non-missing entry. argmax on the boolean mask
        # returns the first True; None marks a column with no data at all.
        present = col_data.notna().to_numpy()
        first_data = int(present.argmax()) if present.any() else None

        if pd.api.types.is_numeric_dtype(col_data):  # Check if data is numeric
            clean_data = col_data.dropna()
            if len(clean_data) > 0:
                min_val = clean_data.min()
                max_val = clean_data.max()
                print(f"Minimim Value: {min_val}")
                print(f"Maximum Value: {max_val}")
                print(f"Mean: {clean_data.mean():.2f}")
            else:
                print("All NaN or empty")
        else:
            print("Type: Text/Categorcal")

        print("Preview of Column:")
        print(col_data.head(3))
        print(col_data.tail(3))
        print(f"Total values: {total_count}")
        print(f"Unique values: {col_data.nunique()}")
        print(f"Missing values: {missing_count}")

        if first_data is None:
            print("First value at: no non-missing values in this column")
        elif has_ts:
            print(f"First value at: row number: {first_data}, on the {df[ts_col].iloc[first_data]}")
        else:
            print(f"First value at: row number: {first_data}")

        # Guard the division so a header-only file does not trigger a
        # divide-by-zero warning
        fraction_missing = missing_count / total_count if total_count else float("nan")
        print(f"Fraction of No Readings {fraction_missing} ")

    return df

In [None]:
# Print the per-column summary of the combined dataset and keep the loaded frame
df = analyze_csv_ranges(DATA_DIR.joinpath("combined_data.csv"))

In [None]:
# Histogram grid of the combined frame. tight_layout keeps neighbouring
# subplot titles and tick labels from overlapping, and ending on plt.show()
# stops the cell from echoing the array of Axes objects as its output.
df.hist(figsize=(12, 12), bins=20)
plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix of the combined frame with the "ts" column left out
df.drop("ts", axis=1).corr()

In [None]:
# Same per-column summary for the positive-generation extract.
# NOTE(review): the earlier note here said both "no zeros" and "positive or
# zero generation" - confirm whether zero-generation rows are kept in this file.
df_positive_generation = analyze_csv_ranges(DATA_DIR.joinpath("combined_data_positive_gen.csv"))

In [None]:
# Histogram grid for the positive-generation frame
axes = df_positive_generation.hist(bins=20, figsize=(12, 12))

for ax in axes.flat:
    # Break long subplot titles onto several lines (at most 20 characters each)
    ax.set_title(textwrap.fill(ax.get_title(), width=20), fontsize=11)
    # Only the y-axis is labelled; x-axis labels are left at their defaults
    ax.set_ylabel("Frequency", fontsize=10)
    # Smaller tick labels on both axes for readability
    ax.tick_params(axis="both", labelsize=9)

plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix of the positive-generation frame with the "ts" column left out
df_positive_generation.drop("ts", axis="columns").corr()

In [None]:
# Per-column summary of the SyslabWind file; the returned frame is the cell's output
analyze_csv_ranges(DATA_DIR.joinpath("SyslabWind_15min.csv"))

### I would choose the Aircon instead of Gaia, since Aircon has 10% missing readings and Gaia has 25% missing readings.
Larger breaks:
- July 30 - Aug 4
- Aug 8-22
- Oct 16-17
- and every once in a while there's a hole for a few hours

In [None]:
# Per-column summary of the SyslabWeather file; the returned frame is the cell's output
analyze_csv_ranges(DATA_DIR.joinpath("SyslabWeather_15min.csv"))