# Data Quality Analysis Notebook

This notebook demonstrates data quality monitoring and analysis techniques for our data pipeline.

In [None]:
import pandas as pd
import plotly.express as px
from datetime import datetime, timedelta
import boto3
import great_expectations as ge

## 1. Data Freshness Analysis

In [None]:
def analyze_data_freshness(table_name):
    """Plot hourly record counts for the last 7 days of ``table_name``.

    Args:
        table_name: Name of the table to inspect, optionally qualified with
            dots (e.g. ``schema.table``). Each dot-separated part must be a
            plain identifier, because the name is interpolated into the SQL
            text and cannot be passed as a bound parameter.

    Returns:
        A Plotly line figure with ``hour`` on the x-axis and
        ``record_count`` on the y-axis.

    Raises:
        ValueError: If ``table_name`` is not a plain (optionally dotted)
            identifier.

    Note:
        The query runs against ``connection``, which is not defined in this
        function and must already exist in the surrounding notebook scope.
    """
    # Reject anything that is not a simple identifier chain so that quotes,
    # whitespace, semicolons or comments can never reach the SQL text.
    if not isinstance(table_name, str) or not all(
        part.isidentifier() for part in table_name.split(".")
    ):
        raise ValueError(f"Invalid table name: {table_name!r}")

    query = f"""
    SELECT 
        date_trunc('hour', created_at) as hour,
        count(*) as record_count
    FROM {table_name}
    WHERE created_at >= current_date - interval '7 days'
    GROUP BY 1
    ORDER BY 1
    """

    # Execute query and create visualization
    df = pd.read_sql(query, connection)
    fig = px.line(df, x='hour', y='record_count', title=f'Data Freshness - {table_name}')
    return fig

## 2. Data Quality Metrics

In [None]:
def calculate_quality_metrics(df):
    """Summarize basic data-quality statistics for ``df``.

    Args:
        df: The pandas DataFrame to profile.

    Returns:
        A DataFrame indexed by the columns of ``df`` with four columns:
        ``total_rows`` and ``duplicate_percentage`` (frame-wide values
        repeated on every row), plus ``null_percentage`` and
        ``unique_values`` (computed per column).
    """
    # Frame-wide scalars; pandas broadcasts them across the per-column index.
    row_total = len(df)
    duplicate_share = df.duplicated().mean() * 100

    # Per-column Series, indexed by column name.
    null_share = df.isnull().mean() * 100
    distinct_counts = df.nunique()

    return pd.DataFrame(
        {
            'total_rows': row_total,
            'null_percentage': null_share,
            'duplicate_percentage': duplicate_share,
            'unique_values': distinct_counts,
        }
    )

## 3. Anomaly Detection

In [None]:
def detect_anomalies(df, column, window=24, threshold=3):
    """Return rows whose value deviates strongly from its rolling mean.

    A row is flagged when the absolute difference between ``df[column]`` and
    the rolling mean exceeds ``threshold`` times the rolling standard
    deviation.

    Args:
        df: DataFrame holding the series to inspect.
        column: Name of the numeric column to test.
        window: Number of consecutive rows in each rolling window.
        threshold: Number of rolling standard deviations a value must exceed
            to count as an anomaly. Defaults to 3, the previously hard-coded
            cutoff.

    Returns:
        The subset of ``df`` rows flagged as anomalous (possibly empty).

    Note:
        The rolling window ends at, and includes, the row being tested, so
        an outlier inflates its own mean and standard deviation; small
        windows may therefore need a lower ``threshold``. The first
        ``window - 1`` rows have no rolling statistics (NaN) and are never
        flagged.
    """
    values = df[column]
    rolling = values.rolling(window=window)
    rolling_mean = rolling.mean()
    rolling_std = rolling.std()

    # Comparisons against NaN are False, which keeps the warm-up rows out.
    is_anomaly = (values - rolling_mean).abs() > (threshold * rolling_std)
    return df[is_anomaly]