In [6]:
import os
import json
import unicodedata
import re
from lxml import etree
import csv

# === Constants ===
# Define the input directory where your XML files are located
INPUT_XML_DIR = "Elsevier_xml_data"
# Define the root output directory where DOI-named folders will be created
OUTPUT_ROOT_DIR = "elsevier_processed_articles"

# === Filename Generation ===
def doi_to_filename(doi):
    """Converts a DOI into a filesystem-safe filename."""
    return re.sub(r'[^\w\-_.]', '_', doi) + ".xml"

# === Namespaces for Elsevier XML ===
ns = {
    'ce': "http://www.elsevier.com/xml/common/elssce",
    'xocs': "http://www.elsevier.com/xml/xocs/dtd",
    'dc': "http://purl.org/dc/elements/1.1/",
    'prism': "http://prismstandard.org/namespaces/basic/2.0/"
}

# === Text Cleaning ===
def clean(text):
    """Cleans and normalizes text by stripping whitespace and normalizing unicode."""
    if not text:
        return ""
    text = unicodedata.normalize("NFKD", text.strip())
    return re.sub(r"\s+", " ", text)

def extract_texts(elem):
    """Extracts and cleans all text content from an XML element, including its descendants."""
    return clean(' '.join(t.strip() for t in elem.itertext() if t.strip()))

def clean_caption_by_removing_row_text(caption, rows):
    """
    Remove any table row content (as text) that accidentally appears in the caption.
    """
    caption_cleaned = caption
    for row in rows:
        row_text = " ".join(row)
        if row_text in caption_cleaned:
            caption_cleaned = caption_cleaned.replace(row_text, "")
    return clean(caption_cleaned)


# === Full Text Extraction Function ===
def extract_elsevier_article(xml_file_path):
    """
    Extracts metadata and sectional text from an Elsevier XML article.

    Args:
        xml_file_path (str): The path to the XML file.

    Returns:
        dict: A dictionary containing extracted DOI, title, abstract, and sections.
    """
    try:
        tree = etree.parse(xml_file_path)
        root = tree.getroot()
    except etree.XMLSyntaxError as e:
        print(f"Error parsing XML file {xml_file_path}: {e}")
        return None

    # Metadata
    doi_element = root.find(".//prism:doi", namespaces=ns)
    doi = clean(doi_element.text) if doi_element is not None else "N/A"

    title_element = root.find(".//dc:title", namespaces=ns)
    title = clean(title_element.text) if title_element is not None else "N/A"

    abstract_element = root.find(".//dc:description", namespaces=ns)
    abstract = clean(abstract_element.text) if abstract_element is not None else "N/A"

    # Sequential Section Parsing
    sections = {}
    current_section = "Introduction"  # Default section if none found
    sections[current_section] = []

    for elem in root.iter():
        tag = etree.QName(elem.tag).localname

        if tag == "section-title":
            section_title_text = extract_texts(elem)
            if section_title_text:
                current_section = section_title_text
                if current_section not in sections:
                    sections[current_section] = []

        elif tag == "para":
            para_text = extract_texts(elem)
            if para_text:
                sections.setdefault(current_section, []).append(para_text)
        elif tag == "abstract": # Sometimes abstract content can be nested within <abstract> tags
            # If abstract was not found via dc:description, try to get it from here
            if abstract == "N/A":
                abstract = extract_texts(elem)

    return {
        "doi": doi,
        "title": title,
        "abstract": abstract,
        "sections": sections
    }

# === Table Extraction Function (Updated with your provided logic) ===
def extract_elsevier_tables_from_xml(xml_path, output_dir):
    """
    Extracts tables and their captions from an Elsevier XML article and saves them as CSV and TXT files.
    This function uses a heuristic approach by looking for captions and then immediately following
    structures that resemble table rows (elements containing 'entry', 'td', 'cell', 'data').

    Args:
        xml_path (str): The path to the XML file.
        output_dir (str): The directory where extracted tables and captions will be saved.

    Returns:
        list: A list of tuples, each containing (caption_file_path, csv_file_path) for saved tables.
    """
    os.makedirs(output_dir, exist_ok=True)

    try:
        tree = etree.parse(xml_path)
        root = tree.getroot()
    except etree.XMLSyntaxError as e:
        print(f"Error parsing XML file {xml_path}: {e}")
        return []

    # === Regex for loose caption matching
    caption_pattern = re.compile(r"\b(?:table|able)\s*\d+(?:[\.:])?", re.IGNORECASE)

    # === Main Loop: extract caption + following table
    captions = []
    tables_data = []

    current_caption = None
    current_table_rows = []
    skip_elem_id = None  # <-- track caption element so we don't parse it as data

    for elem in root.iter():
        text = extract_texts(elem)

        # Step 1: Match caption
        if caption_pattern.match(text) and len(text.split()) > 3:
            if current_caption and current_table_rows:
                captions.append(current_caption)
                tables_data.append(current_table_rows)

            current_caption = text
            current_table_rows = []

            # Store element's unique id so we can skip it in next pass
            skip_elem_id = id(elem)
            continue

        # Step 2: Extract row data if not the caption element
        if current_caption is not None and id(elem) != skip_elem_id:
            row_values = []
            for child in elem:
                tag = etree.QName(child.tag).localname.lower()
                if tag in {"entry", "td", "cell", "data"}:
                    value = extract_texts(child)
                    if value:
                        row_values.append(value)
            if row_values:
                if not any(re.search(r'\d', val) for val in row_values):  # skip purely non-numeric rows
                    continue
                current_table_rows.append(row_values)

    # Catch the last table if the loop finishes and there's un-saved data
    if current_caption and current_table_rows:
        captions.append(current_caption)
        tables_data.append(current_table_rows)

    # === Save to files ===
    saved_files_info = []
    for i, (caption, rows) in enumerate(zip(captions, tables_data), 1):
        # Clean the caption by removing any row content
        cleaned_caption = clean_caption_by_removing_row_text(caption, rows)

        # Create filename tag
        match_id = re.search(r'\d+', caption)
        if match_id:
            table_id = match_id.group(0)
            tag = f"Table_{table_id}"
        else:
            tag = f"Table_{i}"

        tag = re.sub(r'[^\w\-_.]', '_', tag)

        # Save caption
        caption_file = os.path.join(output_dir, f"{tag}_caption.txt")
        try:
            with open(caption_file, "w", encoding="utf-8") as f:
                f.write(cleaned_caption)
            print(f"   Saved caption: {caption_file}")
        except IOError as e:
            print(f"Error saving caption file {caption_file}: {e}")
            continue

        # Save CSV
        csv_file = os.path.join(output_dir, f"{tag}.csv")
        try:
            with open(csv_file, "w", encoding="utf-8", newline="") as f:
                writer = csv.writer(f)
                if rows:
                    writer.writerows(rows)
            print(f"   Saved table: {csv_file}")
            saved_files_info.append((caption_file, csv_file))
        except IOError as e:
            print(f"Error saving CSV file {csv_file}: {e}")
            continue


    if not saved_files_info:
        print(f"   No tables successfully extracted from: {xml_path}")
    return saved_files_info


if __name__ == "__main__":
    os.makedirs(INPUT_XML_DIR, exist_ok=True)
    os.makedirs(OUTPUT_ROOT_DIR, exist_ok=True)

    xml_files = [f for f in os.listdir(INPUT_XML_DIR) if f.endswith(".xml")]

    if not xml_files:
        print(f"No XML files found in '{INPUT_XML_DIR}'. Please place your XML files there.")
    else:
        print(f"Found {len(xml_files)} XML files in '{INPUT_XML_DIR}'. Starting processing...")

        for xml_filename in xml_files:
            xml_file_path = os.path.join(INPUT_XML_DIR, xml_filename)
            print(f"\nProcessing '{xml_filename}'...")

            # --- Extract Full Text ---
            article_data = extract_elsevier_article(xml_file_path)

            if article_data and article_data["doi"] != "N/A":
                # Create DOI-specific folder
                doi_folder_name = re.sub(r'[^\w\-_.]', '_', article_data["doi"])
                article_output_dir = os.path.join(OUTPUT_ROOT_DIR, doi_folder_name)
                os.makedirs(article_output_dir, exist_ok=True)

                # --- Save Fulltext ---
                fulltext_path = os.path.join(article_output_dir, "fulltext.txt")
                try:
                    with open(fulltext_path, "w", encoding="utf-8") as f:
                        f.write(f"Title: {article_data['title']}\n\n")
                        f.write(f"DOI: {article_data['doi']}\n\n")
                        f.write(f"Abstract:\n{article_data['abstract']}\n\n")
                        for section, paras in article_data["sections"].items():
                            f.write(f"\n=== {section} ===\n")
                            for para in paras:
                                f.write(f"{para}\n\n")
                    print(f"✅ Full text saved to: {fulltext_path}")
                except IOError as e:
                    print(f"Error saving full text: {e}")

                # --- Extract Tables ---
                table_info = extract_elsevier_tables_from_xml(xml_file_path, article_output_dir)

                # Renaming to desired format: table1.csv, table1_caption.txt, etc.
                for i, (caption_path, csv_path) in enumerate(table_info, 1):
                    table_csv_final = os.path.join(article_output_dir, f"table{i}.csv")
                    table_caption_final = os.path.join(article_output_dir, f"table{i}_caption.txt")

                    try:
                        os.rename(csv_path, table_csv_final)
                        os.rename(caption_path, table_caption_final)
                        print(f"✅ Table {i} saved as {table_csv_final} and caption as {table_caption_final}")
                    except OSError as e:
                        print(f"❌ Failed to rename table files: {e}")

            else:
                print(f"❌ Failed to extract article data or DOI from '{xml_filename}'. Skipping processing.")

        print("\n🎉 Processing complete.")


Found 5877 XML files in 'Elsevier_xml_data'. Starting processing...

Processing '10.1006_jssc.2000.8777.xml'...
✅ Full text saved to: elsevier_processed_articles\10.1006_jssc.2000.8777\fulltext.txt
   No tables successfully extracted from: Elsevier_xml_data\10.1006_jssc.2000.8777.xml

Processing '10.1006_jssc.2000.8804.xml'...
✅ Full text saved to: elsevier_processed_articles\10.1006_jssc.2000.8804\fulltext.txt
   No tables successfully extracted from: Elsevier_xml_data\10.1006_jssc.2000.8804.xml

Processing '10.1006_jssc.2002.9611.xml'...
✅ Full text saved to: elsevier_processed_articles\10.1006_jssc.2002.9611\fulltext.txt
   No tables successfully extracted from: Elsevier_xml_data\10.1006_jssc.2002.9611.xml

Processing '10.1016_B978-0-08-044965-4.50006-7.xml'...
✅ Full text saved to: elsevier_processed_articles\10.1016_B978-0-08-044965-4.50006-7\fulltext.txt
   Saved caption: elsevier_processed_articles\10.1016_B978-0-08-044965-4.50006-7\Table_3_caption.txt
   Saved table: elsevier_p

In [2]:
import os
import csv
import re
import unicodedata

# Define where your processed articles are stored
OUTPUT_ROOT_DIR = "_03"  # Change if needed

# === Utility for text normalization ===
def clean(text):
    """Clean and normalize text."""
    if not text:
        return ""
    text = unicodedata.normalize("NFKD", text.strip())
    return re.sub(r"\s+", " ", text)

# === Main caption cleaner ===
def clean_table_captions_in_jupyter(output_root_dir):
    """
    For each DOI subfolder, read tableX.csv and clean tableX_caption.txt
    by removing only exact CSV cell values from the caption.
    """
    for doi_folder in os.listdir(output_root_dir):
        folder_path = os.path.join(output_root_dir, doi_folder)
        if not os.path.isdir(folder_path):
            continue

        print(f"\n📂 Processing folder: {doi_folder}")

        for file in os.listdir(folder_path):
            if not file.endswith(".csv"):
                continue

            table_csv_path = os.path.join(folder_path, file)
            table_id = os.path.splitext(file)[0]
            caption_path = os.path.join(folder_path, f"{table_id}_caption.txt")

            if not os.path.exists(caption_path):
                print(f"⚠️ Caption missing for {table_id}")
                continue

            # Load CSV content
            with open(table_csv_path, "r", encoding="utf-8") as f:
                reader = csv.reader(f)
                rows = list(reader)

            # Load original caption
            with open(caption_path, "r", encoding="utf-8") as f:
                caption = f.read()

            original_caption = caption

            # Remove exact string matches (no heuristics)
            for row in rows:
                for cell in row:
                    cell = cell.strip()
                    if cell and cell in caption:
                        caption = caption.replace(cell, "")

            # Normalize whitespace (only)
            cleaned_caption = re.sub(r"\s+", " ", caption).strip()

            # Overwrite cleaned caption
            with open(caption_path, "w", encoding="utf-8") as f:
                f.write(cleaned_caption)

            print(f"✅ Cleaned: {file} — removed {len(original_caption) - len(cleaned_caption)} characters")

# ✅ Run this in Jupyter
clean_table_captions_in_jupyter(OUTPUT_ROOT_DIR)



📂 Processing folder: 10.1007s12205-017-0945-7

📂 Processing folder: 10.1016j.actamat.2019.07.031
✅ Cleaned: table1.csv — removed 0 characters

📂 Processing folder: 10.1016j.aiepr.2018.04.001

📂 Processing folder: 10.1016j.apenergy.2015.03.120
✅ Cleaned: table1.csv — removed 0 characters
✅ Cleaned: table2.csv — removed 0 characters
✅ Cleaned: table3.csv — removed 0 characters
✅ Cleaned: table4.csv — removed 0 characters
✅ Cleaned: table5.csv — removed 17 characters

📂 Processing folder: 10.1016j.apenergy.2017.11.004
✅ Cleaned: table1.csv — removed 3 characters
✅ Cleaned: table2.csv — removed 2 characters
✅ Cleaned: table3.csv — removed 0 characters

📂 Processing folder: 10.1016j.apenergy.2018.09.087
✅ Cleaned: table1.csv — removed 4 characters

📂 Processing folder: 10.1016j.apm.2015.09.044
✅ Cleaned: table1.csv — removed 14 characters
✅ Cleaned: table2.csv — removed 17 characters

📂 Processing folder: 10.1016j.apm.2020.08.024
✅ Cleaned: table1.csv — removed 0 characters
✅ Cleaned: tabl

In [4]:
import os
import re
import nltk
from nltk.tokenize import sent_tokenize
import tiktoken
nltk.download('punkt')
encoding = tiktoken.get_encoding("cl100k_base")
OUTPUT_ROOT_DIR = "_03"  # Change if needed
# Define patterns to retain (add your patterns here)
# === Patterns to keep ===
retain_patterns = [
    # Material name or formula
    r"\bmaterial[s]?\b", r"\bsample[s]?\b", r"\bcompound[s]?\b",
    r"\bceramic\b", r"\bbulk\b", r"\bpolycrystalline\b", r"\bsingle crystal\b", r"\bnanoparticle[s]?\b",
    r"\bthin film\b", r"\bpellet\b", r"\bingot[s]?\b", r"\bpowder[s]?\b",
    r"\bgrain boundary\b", r"\bgrain size\b", r"\bnanostructure\b",



    # Thermoelectric keywords
    r"\bZT\b", r"\bdimensionless figure of merit\b", r"\bfigure of merit\s*[:=]?\s*\d+(\.\d+)?\b",
    r"\bSeebeck coefficient\b", r"\bthermopower\b", r"\bS\s*=\s*[-+]?\d+(\.\d+)?\s*(μV/K|uV/K|V/K)\b",
    r"\belectrical conductivity\b", r"\bσ\s*=\s*[-+]?\d+(\.\d+)?\s*(S/m|Ω⁻¹m⁻¹)\b",
    r"\belectrical resistivity\b", r"\bρ\s*=\s*[-+]?\d+(\.\d+)?\s*(μΩ·cm|Ω·m|Ω·cm)\b",
    r"\bpower factor\b", r"\bPF\s*=\s*[-+]?\d+(\.\d+)?\s*(μW/cm·K²|mW/m·K²|W/m·K²)\b",
    r"\bthermal conductivity\b", r"\bκ\s*=\s*[-+]?\d+(\.\d+)?\s*(W/mK|W/m·K)\b",
    r"\blattice thermal conductivity\b", r"\belectronic thermal conductivity\b",
    r"\bHall coefficient\b", r"\bcarrier mobility\b", r"\bcarrier concentration\b",

    
    # Temperatures + units
    r"\b\d{2,4}\s*(K|°C|kelvin|degrees Celsius|Celsius)\b", r"\bat room temperature\b",
    r"\btemperature range\b", r"\bmeasured from .* to .* K\b", r"\bT\s*=\s*[0-9.]+K\b",
    r"\bincreasing temperature\b", r"\bhigh temperature region\b", r"\blow temperature behavior\b",r"\bRoom temperature\b",r"\bRT",

    
    # Structural descriptors
    r"\bspace group\b", r"\bSG\s*[:=]?\s*\w+\b", r"\bsymmetry\b",
    r"\bcrystal structure\b", r"\blattice structure\b", r"\bunit cell\b",
    r"\blattice constant[s]?\b", r"\blattice parameter[s]?\b",
    r"\ba\s*=\s*\d+(\.\d+)?\s*(Å|angstrom|nm)\b", r"\bb\s*=\s*\d+(\.\d+)?\s*(Å|angstrom|nm)\b", r"\bc\s*=\s*\d+(\.\d+)?\s*(Å|angstrom|nm)\b",
    r"\bangstrom\b", r"\bÅ\b",
    r"\bperovskite\b", r"\bskutterudite\b", r"\bzinc blende\b", r"\brhombohedral\b", r"\borthorhombic\b",
    r"\btetragonal\b", r"\bcubic\b", r"\bhexagonal\b", r"\btriclinic\b", r"\bmonoclinic\b", r"\blayered structure\b", r"\bquasi-one-dimensional\b", r"\bquasi-two-dimensional\b", r"\bquasi-three-dimensional\b",


    # Doping / composition
    r"\bdoping\b", r"\bdopant[s]?\b", r"\bsubstitution\b", r"\bsubstituted\b",
    r"\bdoped with\b", r"\bdop(ed|ing) sample\b", r"\bnominal composition\b",
    r"\bcarrier type\b", r"\bp-type\b", r"\bn-type\b", r"\bdegenerate\b", r"\bintrinsic\b",
    r"\bchemical formula\b", r"\bcomposition\b", r"\bstoichiometry\b",
    r"x\s*=\s*[0-9.]+", r"y\s*=\s*[0-9.]+", r"\bsolid solution\b", r"\balloy\b",
    #ELEMENT NAMES / COMMON DOPANTS
    r"\bAg\b", r"\bSb\b", r"\bBi\b", r"\bTe\b", r"\bSe\b", r"\bPb\b", r"\bNi\b", r"\bCo\b",
    r"\bDy\b", r"\bYb\b", r"\bRe\b", r"\bLa\b", r"\bPr\b", r"\bSm\b", r"\bEu\b", r"\bHo\b",
    r"\bCu\b", r"\bSn\b", r"\bMg\b", r"\bZn\b", r"\bMn\b", r"\bAl\b", r"\bFe\b", r"\bSi\b", r"\bGe\b", r"\bIn\b", r"\bGa\b", r"\bCd\b", r"\bHg\b", r"\bTl\b", r"\bBi\b", r"\bTe\b",
    r"\bSe\b", r"\bPb\b", r"\bNi\b",
    #EXPERIMENTAL METHODS (STRUCTURAL & THERMO)
    r"\bXRD\b", r"\bX-ray diffraction\b", r"\bdiffraction pattern\b", r"\bRietveld\b",
    r"\bSEM\b", r"\bscanning electron microscopy\b", r"\bFESEM\b",
    r"\bEDS\b", r"\bEDX\b", r"\bTEM\b", r"\btransmission electron microscopy\b",
    r"\bHall effect\b", r"\btransport measurement[s]?\b", r"\bthermal transport\b",
    r"\blaser flash\b", r"\b4-probe\b", r"\bspark plasma\b", r"\bmelt spinning\b", r"\barc melting\b", r"\bsintering\b",
    r"\bthermal analysis\b", r"\bDSC\b", r"\bTGA\b", r"\bDTA\b",
    #PHYSICAL MEASUREMENTS & PHENOMENA
    r"\bphonon scattering\b", r"\bgrain boundary scattering\b", r"\bbipolar conduction\b",
    r"\bdegenerate semiconductor\b", r"\bsemiconducting behavior\b", r"\bband gap\b", r"\bFermi level\b"

]
retain_re = [re.compile(pat, re.IGNORECASE) for pat in retain_patterns]

def is_material_related(sentence):
    return any(pat.search(sentence) for pat in retain_re)

def filter_material_sentences(text):
    sentences = sent_tokenize(text)
    return [s for s in sentences if is_material_related(s)]

# Process each DOI folder in 'elsevier_processed_articles'
for doi_folder in os.listdir(OUTPUT_ROOT_DIR):
    folder_path = os.path.join(OUTPUT_ROOT_DIR, doi_folder)
    if not os.path.isdir(folder_path):
        continue

    fulltext_path = os.path.join(folder_path, "fulltext.txt")
    if not os.path.exists(fulltext_path):
        print(f"⚠️ fulltext.txt missing in {doi_folder}")
        continue

    with open(fulltext_path, "r", encoding="utf-8") as f:
        text = f.read()

    filtered_sentences = filter_material_sentences(text)
    filtered_text = "\n".join(filtered_sentences)

    # Overwrite fulltext.txt with filtered sentences
    with open(fulltext_path, "w", encoding="utf-8") as f:
        f.write(filtered_text)

    # Save filtered sentences separately (optional)
    filtered_path = os.path.join(folder_path, "filtered_material_sentences.txt")
    with open(filtered_path, "w", encoding="utf-8") as f:
        f.write(filtered_text)

    # Count tokens using tiktoken
    tokens = encoding.encode(filtered_text)
    token_count = len(tokens)

    # Save token count
    token_count_path = os.path.join(folder_path, "token_count.txt")
    with open(token_count_path, "w", encoding="utf-8") as f:
        f.write(str(token_count))

    print(f"✅ {doi_folder}: {len(filtered_sentences)} sentences, {token_count} tokens saved and fulltext.txt updated.")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✅ 10.1007s12205-017-0945-7: 4 sentences, 114 tokens saved and fulltext.txt updated.
✅ 10.1016j.actamat.2019.07.031: 173 sentences, 6605 tokens saved and fulltext.txt updated.
✅ 10.1016j.aiepr.2018.04.001: 77 sentences, 2695 tokens saved and fulltext.txt updated.
✅ 10.1016j.apenergy.2015.03.120: 170 sentences, 6531 tokens saved and fulltext.txt updated.
✅ 10.1016j.apenergy.2017.11.004: 187 sentences, 4684 tokens saved and fulltext.txt updated.
✅ 10.1016j.apenergy.2018.09.087: 147 sentences, 4049 tokens saved and fulltext.txt updated.
✅ 10.1016j.apm.2015.09.044: 274 sentences, 10586 tokens saved and fulltext.txt updated.
✅ 10.1016j.apm.2020.08.024: 115 sentences, 5276 tokens saved and fulltext.txt updated.
✅ 10.1016j.apmt.2020.100587: 103 sentences, 5044 tokens saved and fulltext.txt updated.
✅ 10.1016j.apsusc.2020.147513: 111 sentences, 3852 tokens saved and fulltext.txt updated.
✅ 10.1016j.arabjc.2018.01.021: 114 sentences, 4247 tokens saved and fulltext.txt updated.
✅ 10.1016j.bsecv.2

In [1]:
import os
import json
import unicodedata
import re
from lxml import etree
import csv

# === Constants ===
INPUT_XML_DIR = "elsevier_xml_outputs_01"
OUTPUT_ROOT_DIR = "_03"

ns = {
    'ce': "http://www.elsevier.com/xml/common/elssce",
    'xocs': "http://www.elsevier.com/xml/xocs/dtd",
    'dc': "http://purl.org/dc/elements/1.1/",
    'prism': "http://prismstandard.org/namespaces/basic/2.0/"
}

def clean(text):
    if not text:
        return ""
    text = unicodedata.normalize("NFKD", text.strip())
    return re.sub(r"\s+", " ", text)

def extract_texts(elem):
    return clean(' '.join(t.strip() for t in elem.itertext() if t.strip()))

def clean_caption_by_removing_row_text(caption, rows):
    caption_cleaned = caption
    for row in rows:
        row_text = " ".join(row)
        if row_text in caption_cleaned:
            caption_cleaned = caption_cleaned.replace(row_text, "")
    return clean(caption_cleaned)

def extract_elsevier_article(xml_file_path):
    try:
        tree = etree.parse(xml_file_path)
        root = tree.getroot()
    except etree.XMLSyntaxError as e:
        print(f"Error parsing XML file {xml_file_path}: {e}")
        return None

    doi_element = root.find(".//prism:doi", namespaces=ns)
    doi = clean(doi_element.text) if doi_element is not None else "N/A"

    title_element = root.find(".//dc:title", namespaces=ns)
    title = clean(title_element.text) if title_element is not None else "N/A"

    abstract_element = root.find(".//dc:description", namespaces=ns)
    abstract = clean(abstract_element.text) if abstract_element is not None else "N/A"

    sections = {}
    current_section = "Introduction"
    sections[current_section] = []

    for elem in root.iter():
        tag = etree.QName(elem.tag).localname

        if tag == "section-title":
            section_title_text = extract_texts(elem)
            if section_title_text:
                current_section = section_title_text
                if current_section not in sections:
                    sections[current_section] = []

        elif tag == "para":
            para_text = extract_texts(elem)
            if para_text:
                sections.setdefault(current_section, []).append(para_text)
        elif tag == "abstract":
            if abstract == "N/A":
                abstract = extract_texts(elem)

    return {
        "doi": doi,
        "title": title,
        "abstract": abstract,
        "sections": sections
    }

def extract_elsevier_tables_from_xml(xml_path, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    try:
        tree = etree.parse(xml_path)
        root = tree.getroot()
    except etree.XMLSyntaxError as e:
        print(f"Error parsing XML file {xml_path}: {e}")
        return []

    caption_pattern = re.compile(r"\b(?:table|able)\s*\d+(?:[\.:])?", re.IGNORECASE)
    captions, tables_data, tables_xml = [], [], []
    current_caption = None
    current_table_rows = []
    skip_elem_id = None
    current_table_elem = None

    for elem in root.iter():
        text = extract_texts(elem)
        if caption_pattern.match(text) and len(text.split()) > 3:
            if current_caption and current_table_rows:
                captions.append(current_caption)
                tables_data.append(current_table_rows)
                tables_xml.append(current_table_elem)
            current_caption = text
            current_table_rows = []
            current_table_elem = None
            skip_elem_id = id(elem)
            continue
        if current_caption is not None and id(elem) != skip_elem_id:
            row_values = []
            for child in elem:
                tag = etree.QName(child.tag).localname.lower()
                if tag in {"entry", "td", "cell", "data"}:
                    value = extract_texts(child)
                    if value:
                        row_values.append(value)
            if row_values:
                if not any(re.search(r'\d', val) for val in row_values):
                    continue
                current_table_rows.append(row_values)
                if current_table_elem is None:
                    current_table_elem = elem # Save first parent element as table fragment

    if current_caption and current_table_rows:
        captions.append(current_caption)
        tables_data.append(current_table_rows)
        tables_xml.append(current_table_elem)

    saved_files_info = []
    for i, (caption, rows, table_xml) in enumerate(zip(captions, tables_data, tables_xml), 1):
        cleaned_caption = clean_caption_by_removing_row_text(caption, rows)
        match_id = re.search(r'\d+', caption)
        table_id = match_id.group(0) if match_id else str(i)
        tag = f"Table_{table_id}"
        tag = re.sub(r'[^\w\-.]', '', tag)

        caption_file = os.path.join(output_dir, f"{tag}_caption.txt")
        try:
            with open(caption_file, "w", encoding="utf-8") as f:
                f.write(cleaned_caption)
            print(f"   Saved caption: {caption_file}")
        except IOError as e:
            print(f"Error saving caption file {caption_file}: {e}")
            continue

        csv_file = os.path.join(output_dir, f"{tag}.csv")
        try:
            with open(csv_file, "w", encoding="utf-8", newline="") as f:
                writer = csv.writer(f)
                if rows:
                    writer.writerows(rows)
            print(f"   Saved table: {csv_file}")
        except IOError as e:
            print(f"Error saving CSV file {csv_file}: {e}")
            continue

        xml_file = os.path.join(output_dir, f"{tag}.xml")
        if table_xml is not None:
            try:
                # Write table fragment as XML
                table_tree = etree.ElementTree(table_xml)
                table_tree.write(xml_file, encoding='utf-8', pretty_print=True, xml_declaration=True)
                print(f"   Saved table XML fragment: {xml_file}")
            except Exception as e:
                print(f"Error saving XML file {xml_file}: {e}")
        else:
            print(f"   No XML fragment found for table {tag}")

        saved_files_info.append((caption_file, csv_file, xml_file))

    if not saved_files_info:
        print(f"   No tables successfully extracted from: {xml_path}")
    return saved_files_info

if __name__ == "__main__":
    os.makedirs(INPUT_XML_DIR, exist_ok=True)
    os.makedirs(OUTPUT_ROOT_DIR, exist_ok=True)
    xml_files = [f for f in os.listdir(INPUT_XML_DIR) if f.endswith(".xml")]
    if not xml_files:
        print(f"No XML files found in '{INPUT_XML_DIR}'. Please place your XML files there.")
    else:
        print(f"Found {len(xml_files)} XML files in '{INPUT_XML_DIR}'. Starting processing...")
        for xml_filename in xml_files:
            xml_file_path = os.path.join(INPUT_XML_DIR, xml_filename)
            print(f"\nProcessing '{xml_filename}'...")
            article_data = extract_elsevier_article(xml_file_path)
            if article_data and article_data["doi"] != "N/A":
                doi_folder_name = re.sub(r'[^\w\-.]', '', article_data["doi"])
                article_output_dir = os.path.join(OUTPUT_ROOT_DIR, doi_folder_name)
                os.makedirs(article_output_dir, exist_ok=True)
                fulltext_path = os.path.join(article_output_dir, "fulltext.txt")
                try:
                    with open(fulltext_path, "w", encoding="utf-8") as f:
                        f.write(f"Title: {article_data['title']}\n\n")
                        f.write(f"DOI: {article_data['doi']}\n\n")
                        f.write(f"Abstract:\n{article_data['abstract']}\n\n")
                        for section, paras in article_data["sections"].items():
                            f.write(f"\n=== {section} ===\n")
                            for para in paras:
                                f.write(f"{para}\n\n")
                    print(f"✅ Full text saved to: {fulltext_path}")
                except IOError as e:
                    print(f"Error saving full text: {e}")
                table_info = extract_elsevier_tables_from_xml(xml_file_path, article_output_dir)
                for i, (caption_path, csv_path, xml_path) in enumerate(table_info, 1):
                    table_csv_final = os.path.join(article_output_dir, f"table{i}.csv")
                    table_caption_final = os.path.join(article_output_dir, f"table{i}_caption.txt")
                    table_xml_final = os.path.join(article_output_dir, f"table{i}.xml")
                    try:
                        os.rename(csv_path, table_csv_final)
                        os.rename(caption_path, table_caption_final)
                        if os.path.exists(xml_path):
                            os.rename(xml_path, table_xml_final)
                        print(f"✅ Table {i} saved as {table_csv_final}, {table_caption_final}, {table_xml_final}")
                    except OSError as e:
                        print(f"❌ Failed to rename table files: {e}")
            else:
                print(f"❌ Failed to extract article data or DOI from '{xml_filename}'. Skipping processing.")
        print("\n🎉 Processing complete.")

Found 168 XML files in 'elsevier_xml_outputs_01'. Starting processing...

Processing '10.1007_s12205-017-0945-7.xml'...
✅ Full text saved to: _03\10.1007s12205-017-0945-7\fulltext.txt
   No tables successfully extracted from: elsevier_xml_outputs_01\10.1007_s12205-017-0945-7.xml

Processing '10.1016_j.actamat.2019.07.031.xml'...
✅ Full text saved to: _03\10.1016j.actamat.2019.07.031\fulltext.txt
   Saved caption: _03\10.1016j.actamat.2019.07.031\Table_1_caption.txt
   Saved table: _03\10.1016j.actamat.2019.07.031\Table_1.csv
   Saved table XML fragment: _03\10.1016j.actamat.2019.07.031\Table_1.xml
✅ Table 1 saved as _03\10.1016j.actamat.2019.07.031\table1.csv, _03\10.1016j.actamat.2019.07.031\table1_caption.txt, _03\10.1016j.actamat.2019.07.031\table1.xml

Processing '10.1016_j.aiepr.2018.04.001.xml'...
✅ Full text saved to: _03\10.1016j.aiepr.2018.04.001\fulltext.txt
   No tables successfully extracted from: elsevier_xml_outputs_01\10.1016_j.aiepr.2018.04.001.xml

Processing '10.1016_

## Springer


In [11]:
import os

def truncate_fulltext_after_references(root_folder):
    for dirpath, _, filenames in os.walk(root_folder):
        if "fulltext.txt" in filenames:
            file_path = os.path.join(dirpath, "fulltext.txt")

            with open(file_path, "r", encoding="utf-8") as f:
                lines = f.readlines()

            # Find index of the references marker (case-insensitive)
            cutoff_index = None
            for i, line in enumerate(lines):
                if line.strip().lower().startswith("===== references"):
                    cutoff_index = i
                    break

            # If found, truncate and overwrite the file
            if cutoff_index is not None:
                new_content = lines[:cutoff_index]
                with open(file_path, "w", encoding="utf-8") as f:
                    f.writelines(new_content)
                print(f"✂️ Truncated: {file_path}")
            else:
                print(f"✅ No references section found: {file_path}")

# Set your actual folder name here
target_folder = "filtered_abstracts"

# Run the function
truncate_fulltext_after_references(target_folder)
print("\n✅ Done processing all fulltext.txt files.")


✂️ Truncated: filtered_abstracts\10_1007_s00033_020_01311_x\fulltext.txt
✂️ Truncated: filtered_abstracts\10_1007_s00033_020_1275_z\fulltext.txt
✂️ Truncated: filtered_abstracts\10_1007_s00033_023_02155_x\fulltext.txt
✂️ Truncated: filtered_abstracts\10_1007_s00158_022_03392_w\fulltext.txt
✂️ Truncated: filtered_abstracts\10_1007_s00158_024_03897_6\fulltext.txt
✂️ Truncated: filtered_abstracts\10_1007_s00161_022_01170_z\fulltext.txt
✂️ Truncated: filtered_abstracts\10_1007_s00170_018_2240_2\fulltext.txt
✂️ Truncated: filtered_abstracts\10_1007_s00170_022_10497_5\fulltext.txt
✂️ Truncated: filtered_abstracts\10_1007_s00170_023_11263_x\fulltext.txt
✂️ Truncated: filtered_abstracts\10_1007_s00170_023_12007_7\fulltext.txt
✂️ Truncated: filtered_abstracts\10_1007_s00170_024_14042_4\fulltext.txt
✂️ Truncated: filtered_abstracts\10_1007_s00231_018_2481_5\fulltext.txt
✂️ Truncated: filtered_abstracts\10_1007_s00231_022_03280_5\fulltext.txt
✂️ Truncated: filtered_abstracts\10_1007_s00289_022_04

In [14]:
import os
import re
import nltk
from nltk.tokenize import sent_tokenize
import tiktoken
nltk.download('punkt')
encoding = tiktoken.get_encoding("cl100k_base")
OUTPUT_ROOT_DIR ="filtered_abstracts" # Change if needed
# Define patterns to retain (add your patterns here)
# === Patterns to keep ===
retain_patterns = [
    # Material name or formula
    r"\bmaterial[s]?\b", r"\bsample[s]?\b", r"\bcompound[s]?\b",
    r"\bceramic\b", r"\bbulk\b", r"\bpolycrystalline\b", r"\bsingle crystal\b", r"\bnanoparticle[s]?\b",
    r"\bthin film\b", r"\bpellet\b", r"\bingot[s]?\b", r"\bpowder[s]?\b",
    r"\bgrain boundary\b", r"\bgrain size\b", r"\bnanostructure\b",



    # Thermoelectric keywords
    r"\bZT\b", r"\bdimensionless figure of merit\b", r"\bfigure of merit\s*[:=]?\s*\d+(\.\d+)?\b",
    r"\bSeebeck coefficient\b", r"\bthermopower\b", r"\bS\s*=\s*[-+]?\d+(\.\d+)?\s*(μV/K|uV/K|V/K)\b",
    r"\belectrical conductivity\b", r"\bσ\s*=\s*[-+]?\d+(\.\d+)?\s*(S/m|Ω⁻¹m⁻¹)\b",
    r"\belectrical resistivity\b", r"\bρ\s*=\s*[-+]?\d+(\.\d+)?\s*(μΩ·cm|Ω·m|Ω·cm)\b",
    r"\bpower factor\b", r"\bPF\s*=\s*[-+]?\d+(\.\d+)?\s*(μW/cm·K²|mW/m·K²|W/m·K²)\b",
    r"\bthermal conductivity\b", r"\bκ\s*=\s*[-+]?\d+(\.\d+)?\s*(W/mK|W/m·K)\b",
    r"\blattice thermal conductivity\b", r"\belectronic thermal conductivity\b",
    r"\bHall coefficient\b", r"\bcarrier mobility\b", r"\bcarrier concentration\b",

    
    # Temperatures + units
    r"\b\d{2,4}\s*(K|°C|kelvin|degrees Celsius|Celsius)\b", r"\bat room temperature\b",
    r"\btemperature range\b", r"\bmeasured from .* to .* K\b", r"\bT\s*=\s*[0-9.]+K\b",
    r"\bincreasing temperature\b", r"\bhigh temperature region\b", r"\blow temperature behavior\b",r"\bRoom temperature\b",r"\bRT",

    
    # Structural descriptors
    r"\bspace group\b", r"\bSG\s*[:=]?\s*\w+\b", r"\bsymmetry\b",
    r"\bcrystal structure\b", r"\blattice structure\b", r"\bunit cell\b",
    r"\blattice constant[s]?\b", r"\blattice parameter[s]?\b",
    r"\ba\s*=\s*\d+(\.\d+)?\s*(Å|angstrom|nm)\b", r"\bb\s*=\s*\d+(\.\d+)?\s*(Å|angstrom|nm)\b", r"\bc\s*=\s*\d+(\.\d+)?\s*(Å|angstrom|nm)\b",
    r"\bangstrom\b", r"\bÅ\b",
    r"\bperovskite\b", r"\bskutterudite\b", r"\bzinc blende\b", r"\brhombohedral\b", r"\borthorhombic\b",
    r"\btetragonal\b", r"\bcubic\b", r"\bhexagonal\b", r"\btriclinic\b", r"\bmonoclinic\b", r"\blayered structure\b", r"\bquasi-one-dimensional\b", r"\bquasi-two-dimensional\b", r"\bquasi-three-dimensional\b",


    # Doping / composition
    r"\bdoping\b", r"\bdopant[s]?\b", r"\bsubstitution\b", r"\bsubstituted\b",
    r"\bdoped with\b", r"\bdop(ed|ing) sample\b", r"\bnominal composition\b",
    r"\bcarrier type\b", r"\bp-type\b", r"\bn-type\b", r"\bdegenerate\b", r"\bintrinsic\b",
    r"\bchemical formula\b", r"\bcomposition\b", r"\bstoichiometry\b",
    r"x\s*=\s*[0-9.]+", r"y\s*=\s*[0-9.]+", r"\bsolid solution\b", r"\balloy\b",
    #ELEMENT NAMES / COMMON DOPANTS
    r"\bAg\b", r"\bSb\b", r"\bBi\b", r"\bTe\b", r"\bSe\b", r"\bPb\b", r"\bNi\b", r"\bCo\b",
    r"\bDy\b", r"\bYb\b", r"\bRe\b", r"\bLa\b", r"\bPr\b", r"\bSm\b", r"\bEu\b", r"\bHo\b",
    r"\bCu\b", r"\bSn\b", r"\bMg\b", r"\bZn\b", r"\bMn\b", r"\bAl\b", r"\bFe\b", r"\bSi\b", r"\bGe\b", r"\bIn\b", r"\bGa\b", r"\bCd\b", r"\bHg\b", r"\bTl\b", r"\bBi\b", r"\bTe\b",
    r"\bSe\b", r"\bPb\b", r"\bNi\b",
    #EXPERIMENTAL METHODS (STRUCTURAL & THERMO)
    r"\bXRD\b", r"\bX-ray diffraction\b", r"\bdiffraction pattern\b", r"\bRietveld\b",
    r"\bSEM\b", r"\bscanning electron microscopy\b", r"\bFESEM\b",
    r"\bEDS\b", r"\bEDX\b", r"\bTEM\b", r"\btransmission electron microscopy\b",
    r"\bHall effect\b", r"\btransport measurement[s]?\b", r"\bthermal transport\b",
    r"\blaser flash\b", r"\b4-probe\b", r"\bspark plasma\b", r"\bmelt spinning\b", r"\barc melting\b", r"\bsintering\b",
    r"\bthermal analysis\b", r"\bDSC\b", r"\bTGA\b", r"\bDTA\b",
    #PHYSICAL MEASUREMENTS & PHENOMENA
    r"\bphonon scattering\b", r"\bgrain boundary scattering\b", r"\bbipolar conduction\b",
    r"\bdegenerate semiconductor\b", r"\bsemiconducting behavior\b", r"\bband gap\b", r"\bFermi level\b"

]
retain_re = [re.compile(pat, re.IGNORECASE) for pat in retain_patterns]

def is_material_related(sentence):
    return any(pat.search(sentence) for pat in retain_re)

def filter_material_sentences(text):
    sentences = sent_tokenize(text)
    return [s for s in sentences if is_material_related(s)]

# Process each DOI folder in 'elsevier_processed_articles'
for doi_folder in os.listdir(OUTPUT_ROOT_DIR):
    folder_path = os.path.join(OUTPUT_ROOT_DIR, doi_folder)
    if not os.path.isdir(folder_path):
        continue

    fulltext_path = os.path.join(folder_path, "fulltext.txt")
    if not os.path.exists(fulltext_path):
        print(f"⚠️ fulltext.txt missing in {doi_folder}")
        continue

    with open(fulltext_path, "r", encoding="utf-8") as f:
        text = f.read()

    filtered_sentences = filter_material_sentences(text)
    filtered_text = "\n".join(filtered_sentences)

    # Overwrite fulltext.txt with filtered sentences
    with open(fulltext_path, "w", encoding="utf-8") as f:
        f.write(filtered_text)

    # Save filtered sentences separately (optional)
    filtered_path = os.path.join(folder_path, "filtered_material_sentences.txt")
    with open(filtered_path, "w", encoding="utf-8") as f:
        f.write(filtered_text)

    # Count tokens using tiktoken
    tokens = encoding.encode(filtered_text)
    token_count = len(tokens)

    # Save token count
    token_count_path = os.path.join(folder_path, "token_count.txt")
    with open(token_count_path, "w", encoding="utf-8") as f:
        f.write(str(token_count))

    print(f"✅ {doi_folder}: {len(filtered_sentences)} sentences, {token_count} tokens saved and fulltext.txt updated.")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✅ 10_1007_s00033_020_01311_x: 105 sentences, 4749 tokens saved and fulltext.txt updated.
✅ 10_1007_s00033_020_1275_z: 88 sentences, 4424 tokens saved and fulltext.txt updated.
✅ 10_1007_s00033_023_02155_x: 82 sentences, 3434 tokens saved and fulltext.txt updated.
✅ 10_1007_s00158_022_03392_w: 150 sentences, 4897 tokens saved and fulltext.txt updated.
✅ 10_1007_s00158_024_03897_6: 210 sentences, 7417 tokens saved and fulltext.txt updated.
✅ 10_1007_s00161_022_01170_z: 151 sentences, 6390 tokens saved and fulltext.txt updated.
✅ 10_1007_s00170_018_2240_2: 4 sentences, 142 tokens saved and fulltext.txt updated.
✅ 10_1007_s00170_022_10497_5: 143 sentences, 3758 tokens saved and fulltext.txt updated.
✅ 10_1007_s00170_023_11263_x: 188 sentences, 5065 tokens saved and fulltext.txt updated.
✅ 10_1007_s00170_023_12007_7: 379 sentences, 11141 tokens saved and fulltext.txt updated.
✅ 10_1007_s00170_024_14042_4: 161 sentences, 5707 tokens saved and fulltext.txt updated.
✅ 10_1007_s00231_018_2481_5

In [5]:
import os
import tiktoken

# Use OpenAI's tiktoken encoding for GPT-3.5/4
encoding = tiktoken.get_encoding("cl100k_base")

def count_tokens(text):
    return len(encoding.encode(text))

OUTPUT_ROOT_DIR = "Nature_articles_Thermoelectric"

total_token_count = 0

for doi_folder in os.listdir(OUTPUT_ROOT_DIR):
    folder_path = os.path.join(OUTPUT_ROOT_DIR, doi_folder)
    if not os.path.isdir(folder_path):
        continue

    fulltext_path = os.path.join(folder_path, "filtered_material_sentences.txt")
    if not os.path.exists(fulltext_path):
        continue

    with open(fulltext_path, "r", encoding="utf-8") as f:
        text = f.read()
        total_token_count += count_tokens(text)

print(f"Total token count across all folders: {total_token_count / 1_000_000:.2f} million")

Total token count across all folders: 4.12 million


In [15]:
import os
import pandas as pd
import re

def convert_html_tables_to_csv(root_folder):
    for dirpath, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if re.match(r'table_\d+\.html$', filename):
                html_path = os.path.join(dirpath, filename)
                table_num = re.findall(r'\d+', filename)[0]
                csv_filename = f"table_{table_num}.csv"
                csv_path = os.path.join(dirpath, csv_filename)

                try:
                    # Read all tables from the HTML file (usually only one per file)
                    tables = pd.read_html(html_path)
                    if tables:
                        tables[0].to_csv(csv_path, index=False)
                        print(f"✅ Converted {html_path} → {csv_path}")
                    else:
                        print(f"⚠️ No tables found in: {html_path}")
                except Exception as e:
                    print(f"❌ Error reading {html_path}: {e}")

# Set the folder path
target_folder = "filtered_abstracts"

# Run the function
convert_html_tables_to_csv(target_folder)
print("\n✅ All tables processed.")


✅ Converted filtered_abstracts\10_1007_s00033_020_1275_z\table_1.html → filtered_abstracts\10_1007_s00033_020_1275_z\table_1.csv
✅ Converted filtered_abstracts\10_1007_s00158_024_03897_6\table_1.html → filtered_abstracts\10_1007_s00158_024_03897_6\table_1.csv
✅ Converted filtered_abstracts\10_1007_s00158_024_03897_6\table_2.html → filtered_abstracts\10_1007_s00158_024_03897_6\table_2.csv
✅ Converted filtered_abstracts\10_1007_s00158_024_03897_6\table_3.html → filtered_abstracts\10_1007_s00158_024_03897_6\table_3.csv
✅ Converted filtered_abstracts\10_1007_s00161_022_01170_z\table_1.html → filtered_abstracts\10_1007_s00161_022_01170_z\table_1.csv
✅ Converted filtered_abstracts\10_1007_s00161_022_01170_z\table_2.html → filtered_abstracts\10_1007_s00161_022_01170_z\table_2.csv
✅ Converted filtered_abstracts\10_1007_s00161_022_01170_z\table_3.html → filtered_abstracts\10_1007_s00161_022_01170_z\table_3.csv
✅ Converted filtered_abstracts\10_1007_s00161_022_01170_z\table_4.html → filtered_abs

In [16]:
import os
import pandas as pd
import re

def convert_html_tables_to_csv(root_folder):
    for dirpath, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if re.match(r'table_\d+\.html$', filename):
                html_path = os.path.join(dirpath, filename)
                table_num = re.findall(r'\d+', filename)[0]
                csv_filename = f"table_{table_num}.csv"
                csv_path = os.path.join(dirpath, csv_filename)

                try:
                    # Read all tables from the HTML file (usually only one per file)
                    tables = pd.read_html(html_path)
                    if tables:
                        tables[0].to_csv(csv_path, index=False)
                        print(f"✅ Converted {html_path} → {csv_path}")
                    else:
                        print(f"⚠️ No tables found in: {html_path}")
                except Exception as e:
                    print(f"❌ Error reading {html_path}: {e}")

# Set the folder path
target_folder = "filtered_abstracts"

# Run the function
convert_html_tables_to_csv(target_folder)
print("\n✅ All tables processed.")


✅ Converted filtered_abstracts\10_1007_s00033_020_1275_z\table_1.html → filtered_abstracts\10_1007_s00033_020_1275_z\table_1.csv
✅ Converted filtered_abstracts\10_1007_s00158_024_03897_6\table_1.html → filtered_abstracts\10_1007_s00158_024_03897_6\table_1.csv
✅ Converted filtered_abstracts\10_1007_s00158_024_03897_6\table_2.html → filtered_abstracts\10_1007_s00158_024_03897_6\table_2.csv
✅ Converted filtered_abstracts\10_1007_s00158_024_03897_6\table_3.html → filtered_abstracts\10_1007_s00158_024_03897_6\table_3.csv
✅ Converted filtered_abstracts\10_1007_s00161_022_01170_z\table_1.html → filtered_abstracts\10_1007_s00161_022_01170_z\table_1.csv
✅ Converted filtered_abstracts\10_1007_s00161_022_01170_z\table_2.html → filtered_abstracts\10_1007_s00161_022_01170_z\table_2.csv
✅ Converted filtered_abstracts\10_1007_s00161_022_01170_z\table_3.html → filtered_abstracts\10_1007_s00161_022_01170_z\table_3.csv
✅ Converted filtered_abstracts\10_1007_s00161_022_01170_z\table_4.html → filtered_abs

## RSC Preprocessing

In [9]:
import pandas as pd
import os
from pathlib import Path

# Set your root directory (change this if needed)
root_dir = Path("RSC_thermoelectric_materials")

# Loop through each subfolder
for folder in root_dir.iterdir():
    if folder.is_dir():
        print(f"📂 Processing folder: {folder.name}")
        
        # Loop through all CSVs in the folder
        for csv_file in folder.glob("*.csv"):
            print(f"  📄 Reading: {csv_file.name}")
            
            try:
                df = pd.read_csv(csv_file)
                
                # Drop columns with <=2 non-null entries
                filtered_df = df.dropna(axis=1, thresh=3)

                # Overwrite the original file
                filtered_df.to_csv(csv_file, index=False)
                print(f"  ✅ Cleaned and saved: {csv_file.name}")

            except Exception as e:
                print(f"  ❌ Failed to process {csv_file.name}: {e}")


📂 Processing folder: 10_1039_a602506d
📂 Processing folder: 10_1039_a904413b
📂 Processing folder: 10_1039_a908670f
  📄 Reading: table1.csv
  ✅ Cleaned and saved: table1.csv
  📄 Reading: table2.csv
  ✅ Cleaned and saved: table2.csv
  📄 Reading: table3.csv
  ✅ Cleaned and saved: table3.csv
  📄 Reading: table4.csv
  ✅ Cleaned and saved: table4.csv
  📄 Reading: table5.csv
  ✅ Cleaned and saved: table5.csv
  📄 Reading: table6.csv
  ✅ Cleaned and saved: table6.csv
  📄 Reading: table7.csv
  ✅ Cleaned and saved: table7.csv
📂 Processing folder: 10_1039_b000045k
  📄 Reading: table1.csv
  ✅ Cleaned and saved: table1.csv
  📄 Reading: table2.csv
  ✅ Cleaned and saved: table2.csv
  📄 Reading: table3.csv
  ✅ Cleaned and saved: table3.csv
📂 Processing folder: 10_1039_b002696o
📂 Processing folder: 10_1039_b102987h
📂 Processing folder: 10_1039_b106812a
  📄 Reading: table1.csv
  ✅ Cleaned and saved: table1.csv
  📄 Reading: table2.csv
  ✅ Cleaned and saved: table2.csv
  📄 Reading: table3.csv
  ✅ Cleaned an

In [10]:
import os
import re
import nltk
from nltk.tokenize import sent_tokenize
import tiktoken
nltk.download('punkt')
encoding = tiktoken.get_encoding("cl100k_base")
OUTPUT_ROOT_DIR ="RSC_thermoelectric_materials" # Change if needed
# Define patterns to retain (add your patterns here)
# === Patterns to keep ===
retain_patterns = [
    # Material name or formula
    r"\bmaterial[s]?\b", r"\bsample[s]?\b", r"\bcompound[s]?\b",
    r"\bceramic\b", r"\bbulk\b", r"\bpolycrystalline\b", r"\bsingle crystal\b", r"\bnanoparticle[s]?\b",
    r"\bthin film\b", r"\bpellet\b", r"\bingot[s]?\b", r"\bpowder[s]?\b",
    r"\bgrain boundary\b", r"\bgrain size\b", r"\bnanostructure\b",



    # Thermoelectric keywords
    r"\bZT\b", r"\bdimensionless figure of merit\b", r"\bfigure of merit\s*[:=]?\s*\d+(\.\d+)?\b",
    r"\bSeebeck coefficient\b", r"\bthermopower\b", r"\bS\s*=\s*[-+]?\d+(\.\d+)?\s*(μV/K|uV/K|V/K)\b",
    r"\belectrical conductivity\b", r"\bσ\s*=\s*[-+]?\d+(\.\d+)?\s*(S/m|Ω⁻¹m⁻¹)\b",
    r"\belectrical resistivity\b", r"\bρ\s*=\s*[-+]?\d+(\.\d+)?\s*(μΩ·cm|Ω·m|Ω·cm)\b",
    r"\bpower factor\b", r"\bPF\s*=\s*[-+]?\d+(\.\d+)?\s*(μW/cm·K²|mW/m·K²|W/m·K²)\b",
    r"\bthermal conductivity\b", r"\bκ\s*=\s*[-+]?\d+(\.\d+)?\s*(W/mK|W/m·K)\b",
    r"\blattice thermal conductivity\b", r"\belectronic thermal conductivity\b",
    r"\bHall coefficient\b", r"\bcarrier mobility\b", r"\bcarrier concentration\b",

    
    # Temperatures + units
    r"\b\d{2,4}\s*(K|°C|kelvin|degrees Celsius|Celsius)\b", r"\bat room temperature\b",
    r"\btemperature range\b", r"\bmeasured from .* to .* K\b", r"\bT\s*=\s*[0-9.]+K\b",
    r"\bincreasing temperature\b", r"\bhigh temperature region\b", r"\blow temperature behavior\b",r"\bRoom temperature\b",r"\bRT",

    
    # Structural descriptors
    r"\bspace group\b", r"\bSG\s*[:=]?\s*\w+\b", r"\bsymmetry\b",
    r"\bcrystal structure\b", r"\blattice structure\b", r"\bunit cell\b",
    r"\blattice constant[s]?\b", r"\blattice parameter[s]?\b",
    r"\ba\s*=\s*\d+(\.\d+)?\s*(Å|angstrom|nm)\b", r"\bb\s*=\s*\d+(\.\d+)?\s*(Å|angstrom|nm)\b", r"\bc\s*=\s*\d+(\.\d+)?\s*(Å|angstrom|nm)\b",
    r"\bangstrom\b", r"\bÅ\b",
    r"\bperovskite\b", r"\bskutterudite\b", r"\bzinc blende\b", r"\brhombohedral\b", r"\borthorhombic\b",
    r"\btetragonal\b", r"\bcubic\b", r"\bhexagonal\b", r"\btriclinic\b", r"\bmonoclinic\b", r"\blayered structure\b", r"\bquasi-one-dimensional\b", r"\bquasi-two-dimensional\b", r"\bquasi-three-dimensional\b",


    # Doping / composition
    r"\bdoping\b", r"\bdopant[s]?\b", r"\bsubstitution\b", r"\bsubstituted\b",
    r"\bdoped with\b", r"\bdop(ed|ing) sample\b", r"\bnominal composition\b",
    r"\bcarrier type\b", r"\bp-type\b", r"\bn-type\b", r"\bdegenerate\b", r"\bintrinsic\b",
    r"\bchemical formula\b", r"\bcomposition\b", r"\bstoichiometry\b",
    r"x\s*=\s*[0-9.]+", r"y\s*=\s*[0-9.]+", r"\bsolid solution\b", r"\balloy\b",
    #ELEMENT NAMES / COMMON DOPANTS
    r"\bAg\b", r"\bSb\b", r"\bBi\b", r"\bTe\b", r"\bSe\b", r"\bPb\b", r"\bNi\b", r"\bCo\b",
    r"\bDy\b", r"\bYb\b", r"\bRe\b", r"\bLa\b", r"\bPr\b", r"\bSm\b", r"\bEu\b", r"\bHo\b",
    r"\bCu\b", r"\bSn\b", r"\bMg\b", r"\bZn\b", r"\bMn\b", r"\bAl\b", r"\bFe\b", r"\bSi\b", r"\bGe\b", r"\bIn\b", r"\bGa\b", r"\bCd\b", r"\bHg\b", r"\bTl\b", r"\bBi\b", r"\bTe\b",
    r"\bSe\b", r"\bPb\b", r"\bNi\b",
    #EXPERIMENTAL METHODS (STRUCTURAL & THERMO)
    r"\bXRD\b", r"\bX-ray diffraction\b", r"\bdiffraction pattern\b", r"\bRietveld\b",
    r"\bSEM\b", r"\bscanning electron microscopy\b", r"\bFESEM\b",
    r"\bEDS\b", r"\bEDX\b", r"\bTEM\b", r"\btransmission electron microscopy\b",
    r"\bHall effect\b", r"\btransport measurement[s]?\b", r"\bthermal transport\b",
    r"\blaser flash\b", r"\b4-probe\b", r"\bspark plasma\b", r"\bmelt spinning\b", r"\barc melting\b", r"\bsintering\b",
    r"\bthermal analysis\b", r"\bDSC\b", r"\bTGA\b", r"\bDTA\b",
    #PHYSICAL MEASUREMENTS & PHENOMENA
    r"\bphonon scattering\b", r"\bgrain boundary scattering\b", r"\bbipolar conduction\b",
    r"\bdegenerate semiconductor\b", r"\bsemiconducting behavior\b", r"\bband gap\b", r"\bFermi level\b"

]
retain_re = [re.compile(pat, re.IGNORECASE) for pat in retain_patterns]

def is_material_related(sentence):
    return any(pat.search(sentence) for pat in retain_re)

def filter_material_sentences(text):
    sentences = sent_tokenize(text)
    return [s for s in sentences if is_material_related(s)]

# Process each DOI folder in 'elsevier_processed_articles'
for doi_folder in os.listdir(OUTPUT_ROOT_DIR):
    folder_path = os.path.join(OUTPUT_ROOT_DIR, doi_folder)
    if not os.path.isdir(folder_path):
        continue

    fulltext_path = os.path.join(folder_path, "fulltext.txt")
    if not os.path.exists(fulltext_path):
        print(f"⚠️ fulltext.txt missing in {doi_folder}")
        continue

    with open(fulltext_path, "r", encoding="utf-8") as f:
        text = f.read()

    filtered_sentences = filter_material_sentences(text)
    filtered_text = "\n".join(filtered_sentences)

    # Overwrite fulltext.txt with filtered sentences
    with open(fulltext_path, "w", encoding="utf-8") as f:
        f.write(filtered_text)

    # Save filtered sentences separately (optional)
    filtered_path = os.path.join(folder_path, "filtered_material_sentences.txt")
    with open(filtered_path, "w", encoding="utf-8") as f:
        f.write(filtered_text)

    # Count tokens using tiktoken
    tokens = encoding.encode(filtered_text)
    token_count = len(tokens)

    # Save token count
    token_count_path = os.path.join(folder_path, "token_count.txt")
    with open(token_count_path, "w", encoding="utf-8") as f:
        f.write(str(token_count))

    print(f"✅ {doi_folder}: {len(filtered_sentences)} sentences, {token_count} tokens saved and fulltext.txt updated.")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✅ 10_1039_a602506d: 12 sentences, 498 tokens saved and fulltext.txt updated.
✅ 10_1039_a904413b: 1 sentences, 159 tokens saved and fulltext.txt updated.
✅ 10_1039_a908670f: 311 sentences, 14928 tokens saved and fulltext.txt updated.
✅ 10_1039_b000045k: 101 sentences, 4456 tokens saved and fulltext.txt updated.
✅ 10_1039_b002696o: 537 sentences, 20855 tokens saved and fulltext.txt updated.
✅ 10_1039_b102987h: 463 sentences, 17663 tokens saved and fulltext.txt updated.
✅ 10_1039_b106812a: 93 sentences, 3709 tokens saved and fulltext.txt updated.
✅ 10_1039_b107468g: 128 sentences, 5547 tokens saved and fulltext.txt updated.
✅ 10_1039_b108377p: 154 sentences, 6549 tokens saved and fulltext.txt updated.
✅ 10_1039_b201162j: 254 sentences, 9682 tokens saved and fulltext.txt updated.
✅ 10_1039_b202043b: 11 sentences, 1013 tokens saved and fulltext.txt updated.
✅ 10_1039_b301758n: 104 sentences, 4563 tokens saved and fulltext.txt updated.
✅ 10_1039_b304295b: 1 sentences, 67 tokens saved and ful

## See word cloud

In [1]:
import os
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def collect_all_fulltexts(root_dirs):
    all_text = ""
    for root_dir in root_dirs:
        for dirpath, _, filenames in os.walk(root_dir):
            if "filtered_material_sentences.txt" in filenames:
                file_path = os.path.join(dirpath, "filtered_material_sentences.txt")
                try:
                    with open(file_path, "r", encoding="utf-8") as f:
                        content = f.read()
                        all_text += content + "\n"
                except Exception as e:
                    print(f"❌ Failed to read {file_path}: {e}")
    return all_text

# Directories to scan
source_dirs = ["elsevier_processed_articles", "Nature_articles_Thermoelectric"]

def clean_text(text, custom_stopwords):
    # Lowercase and remove non-alphabetic characters
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    # Remove custom stopwords
    words = text.split()
    cleaned_words = [word for word in words if word not in custom_stopwords]
    return " ".join(cleaned_words)

# Step 1: Collect fulltext from folders
source_dirs = ["elsevier_processed_articles", "Nature_articles_Thermoelectric"]
raw_text = collect_all_fulltexts(source_dirs)

# Step 2: Define words to remove (case-insensitive, lowercase only)
custom_stopwords = set([
    "et", "al", "fig", "figure", "table", "data", "copyright","introduction", "results"
])

# Step 3: Clean the text
cleaned_text = clean_text(raw_text, custom_stopwords)

# Step 4: Generate WordCloud
wordcloud = WordCloud(
    width=800, height=400,
    background_color='white',
    colormap='tab10',
    max_words=100
).generate(cleaned_text)

# Step 5: Plot WordCloud
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
#plt.title("TF-IDF Word Cloud from Thermoelectric Papers", fontsize=14)
plt.show()

KeyboardInterrupt: 

## Data Redundency After preprocessing