In [4]:
import os
import pandas as pd
import requests # For fetching content from URLs
from pymatgen.core import Structure
from pymatgen.io.vasp.inputs import Poscar # For parsing POSCAR content
from typing import Dict, Optional

def parse_poscar_text_content(
    poscar_string_content: str,
    source_identifier: str = "N/A"
) -> Optional[Dict]:
    """
    Parses the string content of a single POSCAR.

    Args:
        poscar_string_content (str): The string content of the POSCAR.
        source_identifier (str): An identifier for this structure (e.g., URL or row info).

    Returns:
        Optional[Dict]: A dictionary with parsed structure information or None if parsing fails.
    """
    try:
        if not isinstance(poscar_string_content, str) or not poscar_string_content.strip():
            print(f"Info: POSCAR content for '{source_identifier}' is empty or not a string. Skipping.")
            return None

        poscar_obj = Poscar.from_string(poscar_string_content)
        structure = poscar_obj.structure

        return {
            "source_identifier": source_identifier,
            "comment": poscar_obj.comment,
            "formula": structure.composition.reduced_formula,
            "natoms": len(structure),
            "lattice_vectors": structure.lattice.matrix.tolist(),
            "sites": [{"species": site.species_string,
                       "xyz_frac": site.frac_coords.tolist(),
                       "xyz_cart": site.coords.tolist()}
                      for site in structure.sites],
            "selective_dynamics": poscar_obj.selective_dynamics,
            "velocities": poscar_obj.velocities.tolist() if poscar_obj.velocities is not None else None,
            "pymatgen_structure": structure # Store the full object for further use
        }
    except Exception as e:
        print(f"Error parsing POSCAR content from '{source_identifier}': {e}")
        # To see the problematic content, you could uncomment the next line
        # print(f"Content snippet:\n---\n{poscar_string_content[:500]}\n---")
        return None

def process_first_poscar_link_from_csv(
    csv_filepath: str,
    link_column_name: str = 'POSCAR_link'
) -> Optional[Dict]:
    """
    Reads the first row of a local CSV, gets a URL from the specified column,
    fetches POSCAR data from that URL, parses it, and returns the data.

    Args:
        csv_filepath (str): Path to the local CSV file.
        link_column_name (str): The name of the column containing the URL to the POSCAR.

    Returns:
        Optional[Dict]: Parsed POSCAR data from the first row's link, or None if any step fails.
    """
    if not os.path.exists(csv_filepath):
        print(f"Error: CSV file not found at '{csv_filepath}'")
        return None

    try:
        df = pd.read_csv(csv_filepath, nrows=1) # Read only the first data row (plus header)
    except pd.errors.EmptyDataError:
        print(f"Error: CSV file '{csv_filepath}' is empty or has no data rows.")
        return None
    except Exception as e:
        print(f"Error reading CSV file '{csv_filepath}': {e}")
        return None

    if df.empty:
        print(f"Warning: No data rows found in CSV file '{csv_filepath}'.")
        return None

    if link_column_name not in df.columns:
        print(f"Error: Column '{link_column_name}' not found in the CSV file.")
        print(f"Available columns: {df.columns.tolist()}")
        return None

    # Get the URL from the first row and the specified column
    poscar_url = df.iloc[0][link_column_name]

    if pd.isna(poscar_url) or not isinstance(poscar_url, str) or not poscar_url.strip():
        print(f"Error: URL in column '{link_column_name}' for the first row is missing, not a string, or empty.")
        return None

    print(f"Fetching POSCAR data from URL found in first row: {poscar_url}")

    try:
        response = requests.get(poscar_url, timeout=30) # Added a timeout
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)
        poscar_text = response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching content from URL '{poscar_url}': {e}")
        return None
    except Exception as e: # Catch any other unexpected error during fetch
        print(f"An unexpected error occurred while fetching from URL '{poscar_url}': {e}")
        return None

    if not poscar_text.strip():
        print(f"Warning: Content fetched from URL '{poscar_url}' is empty.")
        return None

    print(f"Successfully fetched content from {poscar_url}. Attempting to parse as POSCAR...")
    parsed_data = parse_poscar_text_content(poscar_text, source_identifier=poscar_url)

    return parsed_data


if __name__ == '__main__':
    # IMPORTANT: Replace this with the actual path to YOUR local CSV file
    your_local_csv_path = "/Users/abiralshakya/Downloads/materials_database.csv"
    # The column in your CSV that contains the direct links to POSCAR files
    link_column = 'POSCAR_link'

    if your_local_csv_path == "path/to/your/local/file.csv" or not os.path.exists(your_local_csv_path):
        print(f"Please update the 'your_local_csv_path' variable in the script (currently: '{your_local_csv_path}') with the correct path to your .csv file.")
        print("Script will exit as a valid CSV path is not provided.")
    else:
        print(f"--- Processing CSV file: '{your_local_csv_path}' ---")
        print(f"--- Target column for POSCAR URLs: '{link_column}' ---")
        print(f"--- Attempting to process the link from the FIRST ROW only ---")

        extracted_poscar_data = process_first_poscar_link_from_csv(
            your_local_csv_path,
            link_column_name=link_column
        )

        if extracted_poscar_data:
            print("\n--- Successfully Extracted and Parsed POSCAR Data from First Row's Link ---")
            print(f"Source URL: {extracted_poscar_data['source_identifier']}")
            print(f"Comment: {extracted_poscar_data['comment']}")
            print(f"Formula: {extracted_poscar_data['formula']}")
            print(f"Number of atoms: {extracted_poscar_data['natoms']}")
            print(f"Lattice Vectors (first vector): {extracted_poscar_data['lattice_vectors'][0]}")
            if extracted_poscar_data['sites']:
                print(f"First site: {extracted_poscar_data['sites'][0]['species']} @ {extracted_poscar_data['sites'][0]['xyz_frac']}")
            else:
                print("Sites: No site data found/parsed.")
            # You can print more details from extracted_poscar_data if needed
            # print(f"Full Pymatgen Structure Object: {extracted_poscar_data['pymatgen_structure']}") # This can be very verbose
        else:
            print("\nNo POSCAR data was extracted or an error occurred during processing the first row's link.")

--- Processing CSV file: '/Users/abiralshakya/Downloads/materials_database.csv' ---
--- Target column for POSCAR URLs: 'POSCAR_link' ---
--- Attempting to process the link from the FIRST ROW only ---
Fetching POSCAR data from URL found in first row: https://www.topologicalquantumchemistry.com/data/nonalloycompounds/SG-165/Mat-3/POSCAR
Error fetching content from URL 'https://www.topologicalquantumchemistry.com/data/nonalloycompounds/SG-165/Mat-3/POSCAR': HTTPSConnectionPool(host='www.topologicalquantumchemistry.com', port=443): Max retries exceeded with url: /data/nonalloycompounds/SG-165/Mat-3/POSCAR (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)')))

No POSCAR data was extracted or an error occurred during processing the first row's link.
