In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Exploration Notebook\n",
    "\n",
    "This notebook is intended for exploring the data generated and analyzed by the LithiumVision workflow.\n",
    "\n",
    "## Potential Tasks:\n",
    "1. Load `analyzed_structures.csv` or `top_candidates.csv`.\n",
    "2. Perform statistical analysis on properties like `energy_above_hull`, `formation_energy_per_atom`.\n",
    "3. Visualize distributions of various properties (histograms, scatter plots).\n",
    "4. Explore relationships between generated conditions and output properties.\n",
    "5. Investigate specific chemical systems in more detail."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from pathlib import Path\n",
    "\n",
    "# Configure matplotlib for inline plotting\n",
    "%matplotlib inline\n",
    "sns.set_theme(style=\"whitegrid\")\n",
    "\n",
    "# Define paths (adjust as needed).\n",
    "# Path('.') is the kernel's current working directory, so this assumes the\n",
    "# notebook is being run from 'LithiumVision/notebooks/'.\n",
    "project_root = Path('.').resolve().parent\n",
    "results_dir = project_root / 'results'\n",
    "data_dir = project_root / 'data'\n",
    "\n",
    "# Example: Load top candidates data\n",
    "top_candidates_file = results_dir / 'candidates' / 'top_candidates.csv'\n",
    "if top_candidates_file.exists():\n",
    "    df_candidates = pd.read_csv(top_candidates_file)\n",
    "    print(\"Top Candidates Data:\")\n",
    "    print(df_candidates.head())\n",
    "    print(\"\\nShape:\", df_candidates.shape)\n",
    "    print(\"\\nInfo:\")\n",
    "    df_candidates.info()\n",
    "else:\n",
    "    # Always (re)bind the name: without this, re-running the cell after the file\n",
    "    # has gone missing would keep using a DataFrame left over in the kernel from\n",
    "    # an earlier run. An empty frame keeps the name defined for later checks.\n",
    "    df_candidates = pd.DataFrame()\n",
    "    print(f\"File not found: {top_candidates_file}\")\n",
    "\n",
    "# Example: Load an analyzed_structures.csv file (you'll need to specify which one)\n",
    "# analyzed_file = data_dir / 'analyzed' / 'some_generation_run' / 'analyzed_structures.csv'\n",
    "# if analyzed_file.exists():\n",
    "#     df_analyzed = pd.read_csv(analyzed_file)\n",
    "#     print(\"\\nAnalyzed Structures Data (Example):\")\n",
    "#     print(df_analyzed.head())\n",
    "# else:\n",
    "#     print(f\"File not found: {analyzed_file}\")\n",
    "\n",
    "# Placeholder for further exploration (skipped when nothing was loaded)\n",
    "if not df_candidates.empty:\n",
    "    # Plot distribution of energy above hull for candidates.\n",
    "    # The notebook intro names this property `energy_above_hull`, so accept either\n",
    "    # spelling; `e_above_hull` is tried first to keep the previous behaviour.\n",
    "    hull_col = next(\n",
    "        (c for c in ('e_above_hull', 'energy_above_hull') if c in df_candidates.columns),\n",
    "        None,\n",
    "    )\n",
    "    if hull_col is not None:\n",
    "        plt.figure(figsize=(10, 6))\n",
    "        sns.histplot(df_candidates[hull_col].dropna(), kde=True)\n",
    "        plt.title('Distribution of Energy Above Hull for Top Candidates')\n",
    "        plt.xlabel('Energy Above Hull (eV/atom)')\n",
    "        plt.ylabel('Frequency')\n",
    "        plt.show()\n",
    "\n",
    "    # Display summary statistics\n",
    "    print(\"\\nSummary statistics for top candidates:\")\n",
    "    print(df_candidates.describe())\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}