In [4]:
import pandas as pd
import time
from datetime import datetime
import numpy as np
from pathlib import Path


def log_timing(operation_name, start_time, timing_data):
    """Record how long an operation took and return the moment it finished.

    Args:
        operation_name: Label placed at the front of the timing entry.
        start_time: ``time.time()`` reading taken when the operation began.
        timing_data: List that receives the formatted entry (appended in place).

    Returns:
        The ``time.time()`` reading taken inside this call, so the caller can
        reuse it as the start of the next measurement.
    """
    finished_at = time.time()
    elapsed = finished_at - start_time
    entry = "{}: {:.2f} seconds".format(operation_name, elapsed)
    timing_data.append(entry)
    return finished_at


# Ratings that count as "high performer" in Query 3 and Query 5.
_HIGH_PERFORMER_RATINGS = ["Exceptional", "Exceeds Expectations"]


def _dept_experience_salary(df):
    """Query 1: salary statistics per (department, experience_level) pair."""
    return (
        df.groupby(["department", "experience_level"])["salary"]
        .agg(["count", "mean", "std", "min", "max"])
        .round(2)
        .reset_index()
    )


def _retention_analysis(df):
    """Query 2: tenure statistics and mean salary per department.

    Note: converts ``join_date`` to datetimes and adds a ``tenure_days``
    column to ``df`` in place, so the measured work matches the benchmark's
    original in-place column assignments.
    """
    df["join_date"] = pd.to_datetime(df["join_date"])
    df["tenure_days"] = (pd.Timestamp.now() - df["join_date"]).dt.days
    return (
        df.groupby("department")
        .agg({"tenure_days": ["mean", "min", "max", "count"], "salary": "mean"})
        .round(2)
    )


def _performance_metrics(df):
    """Query 3: performance indicators per department and salary quartile."""
    return (
        df.assign(
            is_high_performer=lambda x: x["last_rating"].isin(
                _HIGH_PERFORMER_RATINGS
            ),
            salary_tier=pd.qcut(df["salary"], q=4, labels=["Q1", "Q2", "Q3", "Q4"]),
        )
        # ``salary_tier`` is categorical. Pin ``observed=False`` (the
        # historical default) so every department/quartile combination is
        # reported regardless of pandas version, and so newer releases do
        # not emit a FutureWarning about the implicit default.
        .groupby(["department", "salary_tier"], observed=False)
        .agg(
            {
                "is_high_performer": "mean",
                "projects_completed": ["mean", "sum"],
                "remote_work_eligible": "mean",
                "bonus_eligible": "mean",
            }
        )
        .round(3)
    )


def _location_compensation(df):
    """Query 4: compensation statistics per (country, office) pair."""
    return (
        df.groupby(["country", "office"])
        .agg(
            {
                "salary": ["mean", "std", "count"],
                "stock_options": ["mean", "sum"],
                "bonus_eligible": "mean",
            }
        )
        .round(2)
    )


def _top_performers(df):
    """Query 5: highly rated employees within each department's top 10 salary ranks.

    Note: adds a ``salary_rank`` column (dense rank, highest salary first,
    computed per department) to ``df`` in place.
    """
    df["salary_rank"] = df.groupby("department")["salary"].rank(
        method="dense", ascending=False
    )
    is_top = df["last_rating"].isin(_HIGH_PERFORMER_RATINGS) & (
        df["salary_rank"] <= 10
    )
    return (
        df[is_top]
        .groupby("department")
        .agg(
            {"salary": ["count", "mean", "max"], "projects_completed": ["mean", "sum"]}
        )
        .round(2)
    )


def main():
    """Run the pandas benchmark queries and save timings and results.

    Reads ``data/employee_records.csv``, runs five aggregation queries while
    timing each step, writes the timings to a timestamped text file under
    ``performance_results/`` and the query results as CSV files under
    ``analysis_results/pandas/``.
    """
    # Initialize timing data
    timing_data = []
    overall_start = time.time()

    # Read CSV
    print("Starting Pandas analysis...")
    start_time = time.time()
    df = pd.read_csv("data/employee_records.csv")
    current_time = log_timing("Read CSV", start_time, timing_data)

    # Each query is timed from the end of the previous step (the value
    # returned by log_timing) to the moment its own log_timing call runs.

    # Query 1: Average salary by department and experience level
    dept_exp_salary = _dept_experience_salary(df)
    current_time = log_timing(
        "Query 1: Department-Experience Level Salary Analysis",
        current_time,
        timing_data,
    )

    # Query 2: Employee retention analysis (time since joining)
    retention_analysis = _retention_analysis(df)
    current_time = log_timing(
        "Query 2: Employee Retention Analysis", current_time, timing_data
    )

    # Query 3: Complex performance metrics
    performance_metrics = _performance_metrics(df)
    current_time = log_timing(
        "Query 3: Performance Metrics Analysis", current_time, timing_data
    )

    # Query 4: Location and compensation analysis
    location_comp = _location_compensation(df)
    current_time = log_timing(
        "Query 4: Location Compensation Analysis", current_time, timing_data
    )

    # Query 5: Advanced filtering and window calculations
    top_performers = _top_performers(df)
    log_timing("Query 5: Top Performers Analysis", current_time, timing_data)

    # Calculate overall execution time (excludes the result-saving below)
    overall_duration = time.time() - overall_start
    timing_data.append(f"\nTotal execution time: {overall_duration:.2f} seconds")

    # Save timing results
    output_dir = Path("performance_results")
    output_dir.mkdir(exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    timing_path = output_dir / f"pandas_timing_{timestamp}.txt"
    # Explicit encoding so the file does not depend on the platform default.
    with open(timing_path, "w", encoding="utf-8") as f:
        f.write("\n".join(timing_data))

    # Save analysis results
    results_dir = Path("analysis_results/pandas")
    results_dir.mkdir(parents=True, exist_ok=True)

    dept_exp_salary.to_csv(results_dir / "dept_exp_salary.csv", index=False)
    retention_analysis.to_csv(results_dir / "retention_analysis.csv")
    performance_metrics.to_csv(results_dir / "performance_metrics.csv")
    location_comp.to_csv(results_dir / "location_comp.csv")
    top_performers.to_csv(results_dir / "top_performers.csv")

    print(f"Analysis complete. Results saved in {results_dir}")
    print(f"Timing results saved in {output_dir}")


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Starting Pandas analysis...
