In [2]:
from google.colab import drive

# Make Google Drive visible to this runtime before any project file is touched.
drive.mount("/content/drive")

# Project layout on Drive: every sub-folder hangs off the project root.
project_path = "/content/drive/MyDrive/Colab Notebooks/report"
file_path = f"{project_path}/data"
module_path = f"{project_path}/modules"
utils_path = f"{project_path}/utils"
tests_path = f"{project_path}/tests"

Mounted at /content/drive


In [3]:
import json
import os
from getpass import getpass

# SECURITY: this cell used to write a live API key in plain text. Any key that
# was ever saved in the notebook must be treated as leaked and revoked.
# The key is now read from the OPENAI_API_KEY environment variable, falling
# back to an interactive prompt whose input is not echoed and is not stored in
# the notebook source.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("API key: ")

config = {
    "OPENAI_API_KEY": api_key,
    "MODEL_NAME": "x-ai/grok-4.1-fast",
}

# Same destination file and same keys as before, so anything that reads
# config.json keeps working. `project_path` comes from the setup cell above.
with open(f"{project_path}/config/config.json", "w", encoding="utf-8") as config_file:
    json.dump(config, config_file, indent=4)


Overwriting /content/drive/MyDrive/Colab Notebooks/report/config/config.json


In [4]:
%%writefile "{utils_path}/overview_generator.py"
import json

class OverviewGenerator:
    """
    Builds the prompt for the dataset Overview section and asks the LLM for it.

    Responsible for:
    - Preparing the Overview prompt
    - Calling LLMClient to get the Overview text

    Args:
        llm_client: Object exposing
            ``chat(system_prompt=..., user_prompt=..., temperature=...)``.
            Whatever it returns is handed back by ``generate`` unchanged.
    """

    def __init__(self, llm_client):
        self.llm_client = llm_client

        self.system_prompt = """
        You are a data analyst assistant.
        You are given a dataset profile in JSON format, which includes:
        - metadata: general info (rows, columns, missing values)
        - schema: column-level info (name, type, unique values, sample values)
        - sample_rows: few example rows
        - statistics: numeric/categorical summaries

        Your task:
        1. Generate a short, clear overview (3-5 sentences) describing the dataset.
        2. Focus on the meaning and context: what the dataset represents, what the entities/records are.
        3. Include only the most important insights.
        4. Mention missing values or imbalanced target if relevant.
        5. Format: a short paragraph + a bullet list of key insights.
        6. Return ONLY the formatted text.
        """

    def generate(self, dataset_profile: dict) -> str:
        """
        Serialize the dataset profile and return the LLM-generated overview.

        Args:
            dataset_profile: Profile dict to embed in the user prompt as
                compact JSON.

        Returns:
            The value returned by ``llm_client.chat``.
        """
        # Compact separators keep the prompt short. ``default=str`` makes values
        # json cannot encode natively (dates, Decimal, numpy scalars, ...) fall
        # back to their string form instead of raising TypeError.
        profile_json = json.dumps(
            dataset_profile, separators=(',', ':'), default=str
        )
        user_prompt = f"Dataset profile:\n{profile_json}"

        return self.llm_client.chat(
            system_prompt=self.system_prompt,
            user_prompt=user_prompt,
            temperature=0
        )


Overwriting /content/drive/MyDrive/Colab Notebooks/report/utils/overview_generator.py


In [5]:
%%writefile "{utils_path}/data_health_base.py"
from abc import ABC, abstractmethod

class BaseHealthCheck(ABC):
    """Common interface that every data health check implements."""

    @abstractmethod
    def run(self, df) -> "HealthCheckResult":
        """
        Execute the check against ``df``.

        Subclasses must override this and return a HealthCheckResult;
        the base implementation only raises NotImplementedError.
        """
        raise NotImplementedError


class HealthCheckResult:
    """Value object describing the outcome of a single health check."""

    def __init__(self, name: str, status: str, details: dict):
        # status is one of: 'healthy' | 'warning' | 'critical'
        self.name, self.status, self.details = name, status, details

    def __repr__(self):
        # Rendered as "<name> (<status>) -> <details>".
        return "{} ({}) -> {}".format(self.name, self.status, self.details)


class HealthReport:
    """Ordered collection of the results produced by individual health checks."""

    def __init__(self):
        self.results = []

    def add(self, result: HealthCheckResult):
        """Append one check result, keeping insertion order."""
        self.results.append(result)

    def to_dict(self):
        """Return ``{"checks": [...]}`` with one name/status/details dict per result."""
        serialized = []
        for entry in self.results:
            serialized.append(
                {"name": entry.name, "status": entry.status, "details": entry.details}
            )
        return {"checks": serialized}

    def __repr__(self):
        # One line per stored result, using each result's own repr.
        return "\n".join(map(repr, self.results))


class HealthValidator:
    """Runs a list of health checks against a dataset and gathers their results."""

    def __init__(self, checks: list):
        self.checks = checks

    def run(self, df) -> HealthReport:
        """Run every configured check on ``df``, in order, and return the filled HealthReport."""
        collected = HealthReport()
        for health_check in self.checks:
            collected.add(health_check.run(df))
        return collected

Overwriting /content/drive/MyDrive/Colab Notebooks/report/utils/data_health_base.py


In [23]:
%%writefile "{utils_path}/checks.py"
from utils.data_health_base import BaseHealthCheck, HealthCheckResult
import pandas as pd

class EmptyDatasetCheck(BaseHealthCheck):
    """
    Checks whether the dataset is empty:
    - No rows
    - OR zero columns

    Both situations are reported as "critical"; the message distinguishes a
    frame that still has columns but no rows from one with no columns.
    """

    def run(self, df: pd.DataFrame) -> HealthCheckResult:
        """
        Inspect the shape of ``df`` and report whether it holds any data.

        Returns:
            HealthCheckResult whose details contain ``row_count``,
            ``column_count``, ``is_empty`` and a human-readable ``message``.
        """
        is_empty = df.empty
        row_count = len(df)
        column_count = len(df.columns)

        # ``DataFrame.empty`` is already True whenever there are zero rows, so
        # the "columns but no rows" case must be tested first; checked after
        # ``is_empty`` it could never be reached.
        if row_count == 0 and column_count > 0:
            status = "critical"
            message = "Dataset has columns but contains zero rows."
        elif is_empty:
            status = "critical"
            message = "Dataset is completely empty. No analysis can be performed."
        else:
            status = "healthy"
            message = "Dataset has valid rows and columns."

        return HealthCheckResult(
            name="Empty Dataset Check",
            status=status,
            details={
                "row_count": row_count,
                "column_count": column_count,
                "is_empty": is_empty,
                "message": message
            }
        )

class NullRatioCheck(BaseHealthCheck):
    """
    Calculates the null ratio (0.0-1.0) for each column.

    The overall status is driven by the column with the highest ratio:
    "critical" at or above ``critical_threshold``, "warning" at or above
    ``warning_threshold``, otherwise "healthy".
    """

    def __init__(self, warning_threshold=0.2, critical_threshold=0.5):
        self.warning_threshold = warning_threshold
        self.critical_threshold = critical_threshold

    def run(self, df) -> HealthCheckResult:
        """
        Compute per-column null ratios for ``df`` and grade the worst one.

        Returns:
            HealthCheckResult whose details contain ``null_ratio_per_column``
            and both thresholds.
        """
        # With zero rows the mean of the null mask is NaN for every column.
        # Report 0.0 instead so the details stay comparable and serialize to
        # standard JSON.
        null_ratios = df.isna().mean().fillna(0.0).to_dict()
        max_null = max(null_ratios.values(), default=0)

        if max_null >= self.critical_threshold:
            status = "critical"
        elif max_null >= self.warning_threshold:
            status = "warning"
        else:
            status = "healthy"

        return HealthCheckResult(
            name="Null Ratio Check",
            status=status,
            details={
                "null_ratio_per_column": null_ratios,
                "warning_threshold": self.warning_threshold,
                "critical_threshold": self.critical_threshold
            }
        )

class DuplicateRowsCheck(BaseHealthCheck):
    """
    Flags rows that repeat an earlier row of the dataset.

    The result details carry:
        - duplicate_count
        - duplicate_percentage (stored as a fraction of all rows, rounded to 4 places)
        - sample_duplicates (first 5 duplicate rows)
    """

    def __init__(self, warning_threshold=0.05, critical_threshold=0.2):
        self.warning_threshold = warning_threshold
        self.critical_threshold = critical_threshold

    def run(self, df: pd.DataFrame) -> HealthCheckResult:
        """Count duplicated rows in ``df`` and grade their share against the thresholds."""
        is_repeat = df.duplicated()
        n_repeats = int(is_repeat.sum())

        total_rows = len(df)
        if total_rows > 0:
            repeat_share = float(n_repeats / total_rows)
        else:
            repeat_share = 0

        # Start from the best grade and downgrade as thresholds are crossed.
        status = "healthy"
        if repeat_share >= self.critical_threshold:
            status = "critical"
        elif repeat_share >= self.warning_threshold:
            status = "warning"

        # A handful of the offending rows, as plain records.
        example_rows = df[is_repeat].head().to_dict(orient="records")

        details = {
            "duplicate_count": n_repeats,
            "duplicate_percentage": round(repeat_share, 4),
            "warning_threshold": self.warning_threshold,
            "critical_threshold": self.critical_threshold,
            "sample_duplicates": example_rows,
        }
        return HealthCheckResult(
            name="Duplicate Rows Check",
            status=status,
            details=details,
        )

class OutlierIQRCheck(BaseHealthCheck):
    """
    Detects outliers in numeric columns using the IQR method.

    A value counts as an outlier when it lies below ``Q1 - 1.5*IQR`` or above
    ``Q3 + 1.5*IQR``. Numeric columns with two or fewer distinct values are
    skipped. The status is driven by the column with the largest share of
    outlier rows.

    Args:
        warning_threshold: Outlier share at or above which the status is "warning".
        critical_threshold: Outlier share at or above which the status is "critical".
        sample_size: Number of example rows kept per column when
            ``include_samples`` is True.
        include_samples: When True, add ``outlier_samples_per_column`` to the
            result details. Off by default, which keeps the details identical
            to what this check has always returned.
    """

    def __init__(self, warning_threshold=0.05, critical_threshold=0.1, sample_size=5,
                 include_samples=False):

        self.warning_threshold = warning_threshold
        self.critical_threshold = critical_threshold
        self.sample_size = sample_size
        self.include_samples = include_samples

    def run(self, df: pd.DataFrame) -> HealthCheckResult:
        """
        Count IQR outliers per numeric column of ``df`` and grade the worst column.

        Returns:
            HealthCheckResult whose details contain ``outlier_count_per_column``
            and both thresholds (plus the sample rows when requested).
        """
        numeric_cols = df.select_dtypes(include='number').columns
        # Only columns with enough variety (more than two distinct values).
        numeric_cols = [c for c in numeric_cols if df[c].nunique() > 2]
        outlier_counts = {}
        outlier_samples = {}

        for col in numeric_cols:
            q1, q3 = df[col].quantile([0.25, 0.75])
            iqr = q3 - q1
            lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr
            mask = (df[col] < lower) | (df[col] > upper)
            outlier_counts[col] = int(mask.sum())
            # Sample rows are only materialized on request; previously they
            # were built for every column and then thrown away.
            if self.include_samples:
                outlier_samples[col] = (
                    df.loc[mask].head(self.sample_size).to_dict(orient='records')
                )

        total_rows = len(df)
        max_outlier_pct = max(
            (count / total_rows for count in outlier_counts.values()), default=0
        )

        if max_outlier_pct >= self.critical_threshold:
            status = "critical"
        elif max_outlier_pct >= self.warning_threshold:
            status = "warning"
        else:
            status = "healthy"

        details = {"outlier_count_per_column": outlier_counts}
        if self.include_samples:
            details["outlier_samples_per_column"] = outlier_samples
        details["warning_threshold"] = self.warning_threshold
        details["critical_threshold"] = self.critical_threshold

        return HealthCheckResult(
            name="Outlier IQR Check",
            status=status,
            details=details
        )

Overwriting /content/drive/MyDrive/Colab Notebooks/report/utils/checks.py


In [15]:
%%writefile "{utils_path}/data_health_generator.py"
import json

class DataHealthGenerator:
    """
    Builds the prompt for the Data Health section and asks the LLM for it.

    Responsible for:
    - Preparing the Data Health prompt
    - Calling LLMClient to generate the final report text

    Args:
        llm_client: Object exposing
            ``chat(system_prompt=..., user_prompt=..., temperature=...)``.
            Whatever it returns is handed back by ``generate`` unchanged.
    """

    def __init__(self, llm_client):
        self.llm_client = llm_client

        self.system_prompt = """
        You are a senior data quality analyst.

        You will receive a JSON object that contains:
        - A list of data quality checks
        - Each check includes:
          - name: the check name
          - status: "healthy" | "warning" | "critical"
          - details: metrics and detected issues

        Your task:
        1. Analyze all checks and produce a clear, concise Data Quality Report.
        2. Keep the tone professional and factual.
        3. Highlight only the important issues (missing values, outliers, duplicates, inconsistent data types, empty dataset, rare categories…).
        4. Organize your answer in this format:

        ### Overall Quality Status:
        - A short (1–2 sentence) summary of the dataset's health.

        ### Key Issues:
        - Bullet points describing detected problems (from all checks).
        - Each point should explain:
          - what the issue is
          - where it occurs (columns)
          - why it matters

        ### Recommended Fixes:
        - For each issue category,
          give 1–2 practical suggestions for how to fix it.

        Output Format:
        - Clean markdown text only.
        - Do NOT generate JSON.
        - Do NOT invent issues not present in the input.
        """

    def generate(self, health_report: dict) -> str:
        """
        Serialize the health report and return the LLM-generated analysis.

        Args:
            health_report: Report dict to embed in the user prompt as compact JSON.

        Returns:
            The value returned by ``llm_client.chat``.
        """
        # Compact separators keep the prompt short. ``default=str`` makes values
        # json cannot encode natively (e.g. timestamps inside sample rows) fall
        # back to their string form instead of raising TypeError.
        report_json = json.dumps(health_report, separators=(',', ':'), default=str)
        user_prompt = (
            "Data Quality Checks JSON:\n"
            f"{report_json}"
        )

        return self.llm_client.chat(
            system_prompt=self.system_prompt,
            user_prompt=user_prompt,
            temperature=0
        )


Writing /content/drive/MyDrive/Colab Notebooks/report/utils/data_health_generator.py


In [19]:
%%writefile "{module_path}/data_health_module.py"
from utils.checks import NullRatioCheck, OutlierIQRCheck, EmptyDatasetCheck, DuplicateRowsCheck
from utils.data_health_base import HealthValidator

from utils.data_health_generator import DataHealthGenerator
from utils.llm_client import LLMClient

class DataHealthModule:
    """
    Produces the Data Health section of the report.

    Handles:
    1. Running the health checks (null ratio, IQR outliers, empty dataset,
       duplicate rows) through a HealthValidator
    2. Handing the resulting report to DataHealthGenerator
    3. Returning the generated text
    """

    def __init__(self, llm_client: LLMClient):
        self.null_ratio_check = NullRatioCheck()
        self.outlier_check = OutlierIQRCheck()
        self.empty_dataset_check = EmptyDatasetCheck()
        self.duplicate_rows_check = DuplicateRowsCheck()

        # The validator runs the checks in exactly this order.
        ordered_checks = [
            self.null_ratio_check,
            self.outlier_check,
            self.empty_dataset_check,
            self.duplicate_rows_check,
        ]
        self.health_validator = HealthValidator(ordered_checks)
        self.data_health_generator = DataHealthGenerator(llm_client=llm_client)

    def get_data_health(self, df) -> str:
        """
        Generate the Data Health section for ``df``.

        The checks are run through the HealthValidator, the report is
        converted to a dict, and that dict is passed to DataHealthGenerator,
        whose output is returned.
        """
        report_as_dict = self.health_validator.run(df).to_dict()
        return self.data_health_generator.generate(report_as_dict)

Writing /content/drive/MyDrive/Colab Notebooks/report/modules/data_health_module.py


In [24]:
%%writefile "{project_path}/main.py"
import os
import sys
import pandas as pd
import json

# Resolve the project root from this file's own location instead of a
# hard-coded absolute path, so the script keeps working if the project folder
# is moved or mounted elsewhere. The root is added to sys.path so the
# `utils`, `modules` and `config` packages imported below can be found.
project_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(project_path)

from utils.data_loader import CSVLoader
from utils.llm_client import LLMClient
from config.config import MODEL_NAME
from modules.overview_module import OverviewModule
from modules.data_health_module import DataHealthModule



# Console display: never truncate the column list, and let pandas pick the width.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

def main(data_path=None):
    """
    Load a CSV dataset, run the Data Health pipeline and print the LLM report.

    Args:
        data_path: Path of the CSV file to analyse. When omitted, defaults to
            ``data/Housing.csv`` under ``project_path``.
    """
    if data_path is None:
        data_path = os.path.join(project_path, "data", "Housing.csv")

    loader = CSVLoader(data_path)
    df = loader.load()

    llm_client = LLMClient(model=MODEL_NAME)

    # The Overview section is currently switched off. To bring it back:
    # overview_module = OverviewModule(llm_client=llm_client)
    # overview_text = overview_module.get_overview(df, file_path=data_path)
    # print(overview_text)

    data_health_module = DataHealthModule(llm_client=llm_client)
    data_health_text = data_health_module.get_data_health(df)
    print(data_health_text)


if __name__ == "__main__":
    main()

Overwriting /content/drive/MyDrive/Colab Notebooks/report/main.py


In [25]:
# Run the project entry point through the shell; {project_path} is filled in
# from the notebook variable of the same name.
!python3 "{project_path}/main.py"

### Overall Quality Status:

### Key Issues:
- **Outliers detected via IQR method**: Present in price (15 outliers), area (12), bedrooms (12), bathrooms (1), stories (41), and parking (12) columns; these represent notable portions of the 545 rows and can distort statistical summaries, model training, and predictive accuracy.

### Recommended Fixes:
- **Outliers**:
  - Investigate outliers contextually (e.g., via domain knowledge or visualization) and consider removal, capping at IQR bounds, or transformation (e.g., log scaling) for skewed distributions.
  - Apply robust scaling methods like Winsorization to limit extreme values while retaining data volume.
