# Companies House EDA

In [6]:
from pathlib import Path
import zipfile
import pandas as pd
import matplotlib.pyplot as plt
# Directory scanned for the raw *.zip archives; the relative path is resolved
# against the notebook's current working directory.
RAW_DIR = Path("../data/raw/ch")

In [7]:
def load_all_parts(raw_dir: Path) -> pd.DataFrame:
    """Read one CSV from every ``*.zip`` in *raw_dir* and stack the results.

    Archives are visited in sorted path order. For each archive the first
    member whose name ends in ``.csv`` is read with ``dtype=str``; any
    further CSV members are ignored, and archives without a CSV member are
    skipped.

    Returns:
        The per-archive frames concatenated under a fresh integer index, or
        an empty ``DataFrame`` when no CSV was found at all.
    """
    parts = []
    for archive_path in sorted(raw_dir.glob("*.zip")):
        with zipfile.ZipFile(archive_path) as archive:
            first_csv = next(
                (member for member in archive.namelist() if member.endswith(".csv")),
                None,
            )
            if first_csv is None:
                continue
            with archive.open(first_csv) as handle:
                part = pd.read_csv(handle, dtype=str, low_memory=False)
        parts.append(part)
    if not parts:
        return pd.DataFrame()
    return pd.concat(parts, ignore_index=True)

In [8]:
# Load every archive under RAW_DIR into one DataFrame and report the row count.
df = load_all_parts(RAW_DIR)
print(f"Total records: {len(df):,}")

Total records: 5,655,315


In [None]:
# Plot how many companies were incorporated on each date.
if "IncorporationDate" in df.columns:
    # NOTE(review): Companies House bulk data publishes dates day-first
    # (DD/MM/YYYY) -- confirm against the data product spec. Without
    # dayfirst=True pandas can read ambiguous values such as 03/04/2015
    # month-first, or infer a month-first format from the first row and
    # coerce every date with day > 12 to NaT.
    dates = pd.to_datetime(
        df["IncorporationDate"], errors="coerce", dayfirst=True
    )
    print("Incorporation date range:", dates.min(), "to", dates.max())
    # value_counts() drops NaT, so unparseable dates are left out of the plot.
    dates.value_counts().sort_index().plot()
    plt.title("Company incorporations over time")
    plt.xlabel("Date")
    plt.ylabel("Count")
    plt.tight_layout()

In [None]:
# Histogram of company-name lengths in characters.
if "CompanyName" in df.columns:
    # Drop missing names first: astype(str) would turn NaN into the literal
    # string "nan" and count every missing name as a 3-character name.
    name_lengths = df["CompanyName"].dropna().astype(str).str.len()
    name_lengths.hist(bins=50)
    plt.title("Company name length distribution")
    plt.xlabel("Length")
    plt.ylabel("Frequency")
    plt.tight_layout()