# Préparation

In [1]:
# Installations, imports

!pip install langchain --quiet
!pip install langchain_community --quiet
!pip install transformers --quiet
!pip install accelerate --quiet
!pip install nltk --quiet


# Required Libraries
import torch
import transformers
from transformers import AutoTokenizer, pipeline
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import json
from tqdm import tqdm
from pathlib import Path
import os
import time

# Find the best available device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') # Use GPU if available, otherwise use CPU

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Import and preprocessing of the Paleosaurus data

import pandas as pd

df = pd.read_csv('Paleosaurus_ISTEX.csv')

# Missing titles/abstracts are read as NaN; replace them with empty strings so
# the literal text "nan" is never written into the text sent to the models.
titles = df['title'].fillna('').astype(str)
abstracts = df['abstract'].fillna('').astype(str)

# One "Title: ...\nAbstract: ...\n" string per row, built column-wise instead
# of looping over rows with iterrows().
text_contents = ("Title: " + titles + "\nAbstract: " + abstracts + "\n").tolist()

# Single-column frame with a fresh 0..n-1 index, as used by the run cells
htfl = pd.DataFrame(text_contents, columns=['text'])

# Show only the first rows rather than dumping the whole frame
htfl.head()


# SmolLM

In [3]:
# Load the SmolLM2 instruct checkpoint into a transformers pipeline placed on
# the device selected in the setup cell.
model_identifier = 'HuggingFaceTB/SmolLM2-1.7B-Instruct'
llm = pipeline(
    model=model_identifier,
    device=device,
)

Device set to use cuda


In [4]:
# Prompt definition for SmolLM
def SmolLM_chat_def(doc):
    """Build the few-shot chat used to extract Paleoclimatology terms from `doc`.

    Args:
        doc: Text of the document to analyse; it becomes the final user turn.

    Returns:
        A list of ``{'role': ..., 'content': ...}`` dicts: one system message,
        three user/assistant example pairs, and the final user message.
    """
    # The system message keeps its original continuation-line indentation
    # (24 spaces, plus one line holding only 4 spaces) so the text is unchanged.
    pad = ' ' * 24
    system_lines = [
        'As an excellent automatic term extraction (ATE) system, extract the terms in the Paleoclimatology domain given the following text delimited. Named entities are not considered as terms.',
        pad + 'Separate terms with commas. Ensure each term is from the Paleoclimatology domain, each term represents a main topic from the document, and provide no additional information.',
        pad + 'Make sure you only return the terms and say nothing else. For example, dont say: "Sure, Id be happy to help! Based on the information provided in the document".',
        '    ',
        pad + 'Output Format: [list of terms present]',
        pad + 'If no terms are presented, keep it empty list: [].',
    ]

    # (user text, expected assistant answer) pairs; the last pair demonstrates
    # the empty-list answer.
    examples = [
        (
            'The Hydroclimate and Environmental Response to Middle Miocene Warming in the Southwestern USA: Stable Isotope Evidence .',
            '[Hydroclimate, Environmental Response, Middle Miocene, Warming, Stable Isotope Evidence]',
        ),
        (
            'Late Pleistocene Sediment Provenance and Paleoenvironmental Changes in the East Siberian Shelf Margin: Insights From Mineralogical and Nd Isotope Analysis .',
            '[Late Pleistocene, Sediment Provenance, Paleoenvironmental Changes, East Siberian Shelf Margin, Mineralogical Analysis, Nd Isotope Analysis]',
        ),
        (
            'Moreover , there is yet to be established a common consensus being used in current assays .',
            '[]',
        ),
    ]

    messages = [{'role': 'system', 'content': '\n'.join(system_lines)}]
    for question, answer in examples:
        messages.append({'role': 'user', 'content': question})
        messages.append({'role': 'assistant', 'content': answer})
    messages.append({'role': 'user', 'content': doc})
    return messages

In [5]:
%%time

chat = SmolLM_chat_def(htfl["text"][1])

prompt = llm.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    
# Generate prediction
generation = llm(prompt, max_new_tokens = 2000)

new_text = generation[0]['generated_text'][len(prompt):]
print(new_text)

[K-Ar dating, Age, King George Island, West Antarctica, Stratigraphy, Early Eocene, Arctowski Interglacial, Paleocene, Oligocene-Miocene Boundary, Late Cretaceous, Paleocene, Late Eocene-Early Oligocene, Early Miocene, Wawel Interglacial, Legru Glaciation, Polonez Glaciation, Wesele Interglacial, Early Miocene glaciation, Late Cretaceous, Cretaceous]
CPU times: user 4 s, sys: 124 ms, total: 4.13 s
Wall time: 4.14 s


# Llama

In [None]:
# Model loading and preparation
from getpass import getpass

from huggingface_hub import login

# Never store the access token in the notebook: read it from the HF_TOKEN
# environment variable and fall back to an interactive prompt.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

login(token=token)

# Define model and tokenizer
model = "meta-llama/Llama-2-7b-chat-hf"
# `token` replaces the deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(model, token=token)

# Named `llama_pipeline` so that it does not overwrite the `pipeline` factory
# imported from transformers, which the SmolLM cell needs when it is re-run.
llama_pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    # trust_remote_code is left at its default: this checkpoint is loaded with
    # the Llama classes shipped in transformers, so no repository code is needed.
    device_map="auto",
    max_new_tokens=256,
    # Greedy decoding. The original asked for `temperature: 0` through
    # `model_kwargs` while sampling with top_k=10; greedy decoding is the
    # deterministic behaviour that setting was aiming for.
    # NOTE(review): confirm that deterministic output is what is wanted here.
    do_sample=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

# LangChain wrapper around the already-built pipeline; generation settings are
# the ones given to the pipeline above.
llm2 = HuggingFacePipeline(pipeline=llama_pipeline)

# Result extractor definition
def extract_after_last_inst(text):
    """Return the part of `text` that follows the last "[/INST]" marker.

    Args:
        text: Raw text returned by the Llama chain.

    Returns:
        The text after the last "[/INST]", with every "</s>" removed and
        surrounding whitespace stripped. When the marker is absent, the whole
        text is returned with only its surrounding whitespace stripped.
    """
    marker = "[/INST]"
    if marker not in text:
        return text.strip()
    # rpartition keeps everything after the final occurrence of the marker
    _, _, tail = text.rpartition(marker)
    return tail.strip().replace("</s>", "").strip()

Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.23s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
Device set to use cuda:0


In [7]:
# Few-shot prompt for the Llama chain. It is written with Llama-2 chat markers
# (<s>, [INST], <<SYS>>, [/INST], </s>): a system instruction, three worked
# examples (the last one showing the empty-list answer) and a final turn whose
# sentence is filled in through the {text} placeholder.
# NOTE(review): the leading whitespace on every line and the literal <s>/</s>
# markers are part of the string sent to the model — confirm that this matches
# the chat format the model expects.
PROMPT = """
                <s>
                [INST]
                <<SYS>>
                As an excellent automatic term extraction (ATE) system, extract the terms in the Paleoclimatology domain given the following text delimited. Named entities are not considered as terms.
                    Separate terms with commas. Ensure each term is from the Paleoclimatology domain, each term represents a main topic from the document, and provide no additional information.
                    Make sure you only return the terms and say nothing else. For example, dont say: "Sure, Id be happy to help! Based on the information provided in the document".

                    Output Format: [list of terms present]
                    If no terms are presented, keep it empty list: [].


                Examples of the output format:
                <</SYS>>
                Sentence: ```The Hydroclimate and Environmental Response to Middle Miocene Warming in the Southwestern USA: Stable Isotope Evidence .```
                Domain: Paleoclimatology
                [/INST]
                Output: "[Hydroclimate, Environmental Response, Middle Miocene, Warming, Stable Isotope Evidence]"
                </s>

                <s>
                [INST]
                Sentence: ```Late Pleistocene Sediment Provenance and Paleoenvironmental Changes in the East Siberian Shelf Margin: Insights From Mineralogical and Nd Isotope Analysis  .```
                Domain: Paleoclimatology
                [/INST]
                Output: "[Late Pleistocene, Sediment Provenance, Paleoenvironmental Changes, East Siberian Shelf Margin, Mineralogical Analysis, Nd Isotope Analysis]"
                </s>

                <s>
                [INST]
                Sentence: ```Moreover , there is yet to be established a common consensus being used in current assays .```
                Domain: Paleoclimatology
                [/INST]
                Output: "[]"
                </s>

                <s>
                [INST]
                Sentence: ```{text}```
                Domain: Paleoclimatology
                [/INST]
                """

# Create the LangChain prompt and chain
# `template` is only an alias for PROMPT; {text} is its single input variable.
template = PROMPT
# NOTE: this rebinds `prompt`, which the SmolLM run cell also assigns (as a string).
prompt = PromptTemplate(template=template, input_variables=["text"])
# The chain fills the template and passes the result to the Llama wrapper `llm2`.
llm_chain = LLMChain(prompt=prompt, llm=llm2)

In [8]:
%%time

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Execution

output_raw = llm_chain.run({"text": htfl["text"][1]})
output_clean = extract_after_last_inst(output_raw)

print(output_clean)

Output: ["Late Cretaceous", "early Tertary", "King George Island", "West Antarctica", "stratigraphic distribution", "palaeoclimatic significance", "K-Ar dating", "Late Cretaceous", "Paleocene", "Eocene", "Kraków Glaciation", "Arctowski Interglacial", "Oligocene", "Miocene", "Wawel Interglacial", "Polonez Glaciation", "Wesele Interglacial", "Legru Glaciation", "Melville Glaciation"]
CPU times: user 3min 46s, sys: 3.38 s, total: 3min 49s
Wall time: 3min 49s
