# In [4]: (Jupyter notebook cell 4)
import pandas as pd
import io

def write_eda_report(
    df: pd.DataFrame,
    out: io.TextIOBase,
    *,
    unique_listing_threshold: int = 20,
) -> None:
    """Write a plain-text exploratory data analysis report for *df* to *out*.

    The report contains, in order: a header, the first five rows, the
    ``DataFrame.info()`` summary, the shape, the column names, per-column
    unique-value counts, summary statistics, missing-value counts, the
    column dtypes and, when a ``Temp`` column is present, a unit note.

    Args:
        df: The dataset to describe.
        out: A writable text stream (an open text file, ``io.StringIO``, ...).
        unique_listing_threshold: Columns with strictly fewer distinct
            values than this also get their distinct values listed.
    """
    # Header
    out.write("Exploratory Data Analysis Report\n")
    out.write("=" * 40 + "\n\n")

    # First 5 rows
    out.write("First 5 rows of the dataset:\n")
    out.write(df.head().to_string() + "\n\n")

    # Dataset info: DataFrame.info() prints instead of returning a string,
    # so it is captured through an in-memory buffer.
    out.write("Dataset Info:\n")
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    out.write(info_buffer.getvalue() + "\n")

    # Shape
    n_rows, n_cols = df.shape
    out.write(f"Dataset contains {n_rows} rows and {n_cols} columns.\n\n")

    # Column names (str() so that non-string labels do not break the join)
    out.write("Column names:\n")
    out.write(", ".join(map(str, df.columns)) + "\n\n")

    # Unique values; unique() keeps NaN, so a missing value counts as one
    # distinct value here.
    out.write("Unique values in each column:\n")
    for col in df.columns:
        unique_vals = df[col].unique()
        out.write(f"{col}: {len(unique_vals)} unique values\n")
        if len(unique_vals) < unique_listing_threshold:
            out.write(f"  Values: {unique_vals}\n")
    out.write("\n")

    # Summary statistics. Without any numeric column, describe() would fall
    # back to describing the non-numeric columns (or raise on a frame with
    # no columns at all), which contradicts the section heading.
    out.write("Summary statistics for numerical columns:\n")
    if df.select_dtypes(include="number").shape[1] > 0:
        out.write(df.describe().to_string() + "\n\n")
    else:
        out.write("(no numerical columns)\n\n")

    # Missing values
    out.write("Missing values in each column:\n")
    out.write(df.isnull().sum().to_string() + "\n\n")

    # Data types
    out.write("Data types of each column:\n")
    out.write(df.dtypes.to_string() + "\n\n")

    # Temperature unit clarification. Only emitted when the column exists.
    # NOTE(review): the Kelvin statement is a fixed analyst note; the value
    # range is not checked here -- confirm it holds for the loaded dataset.
    if "Temp" in df.columns:
        out.write("Note on 'Temp' column:\n")
        out.write("The values in the 'Temp' column appear to be in Kelvin (K), based on the observed range.\n")
        out.write("To convert to Celsius: Temp_C = Temp - 273.15\n")


# Runs when executed as a script or as a notebook cell (where __name__ is
# "__main__"); `df` stays at module level so later cells can keep using it.
if __name__ == "__main__":
    # Load the dataset
    df = pd.read_csv("data/druid_ready.csv")

    # Save the report to a text file
    with open("eda_report.txt", "w", encoding="utf-8") as f:
        write_eda_report(df, f)
