# Outlines

https://github.com/outlines-dev/outlines \
https://outlines-dev.github.io/outlines/

# content

* model is mistral-7b-instruct-v0.2.Q6_K.gguf
* model_mistral is mistralai/Mistral-7B-v0.1 

NOTE: for correct generation with mistral-7b-instruct-v0.2.Q6_K.gguf need:
* in generate.json(whitespace_pattern="") - to help smaller LLMs
* in generator(max_tokens=1000) - so json is not intrupted before fully generated

# conclusions
---
---


In [1]:
from outlines import models, generate
from llama_cpp import Llama

In [2]:
llm = Llama("/home/dorota/models/mistral-7b-instruct-v0.2.Q6_K.gguf", n_gpu_layers=10, n_ctx=0, verbose=False)
model = models.LlamaCpp(llm) 

ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 4070 Ti, compute capability 8.9, VMM: yes


In [8]:
model_mistral = models.transformers("mistralai/Mistral-7B-v0.1") # device='cuda'

#TODO: how to run on GPU???????

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.62s/it]


In [3]:


from pydantic import BaseModel, Field
class User(BaseModel):
    first_name: str
    last_name: str
    id: int

generator = generate.json(model, User, whitespace_pattern="")

result = generator(
    """Based on user information create a user profile with the fields first_name, last_name.
    User information is: Jane Doe 123""", max_tokens=1000
)

print(result)

  from .autonotebook import tqdm as notebook_tqdm


first_name='`Jane`' last_name='`Doe`' id=123


In [None]:
from pydantic import BaseModel
from outlines import models, generate


class User(BaseModel):
    name: str
    last_name: str
    id: int

generator = generate.json(model_mistral, User)
result = generator(
    "Create a user profile with the fields name, last_name and id.  User information is: Jane Doe 123"
)
print(dict(result))

In [None]:
dict(result)

with Pydantic and own text

In [12]:
from pypdf import PdfReader 
  
reader = PdfReader('/home/dorota/LLM-diploma-project/00_concept_tests/data/40001_2023_Article_1364.pdf') 
num_pages = len(reader.pages)
TEXT = ""
for page_num in range(num_pages): #change to range(num_pages) for whole document
    page = reader.pages[page_num]  
    TEXT += page.extract_text()

In [13]:
from pydantic import BaseModel, Field
from typing import List

class Metadata(BaseModel):
    title: str = Field(..., description="extract title from article")
    authors: str = Field(..., description="extract authors from article")
    pub_year: int = Field(..., description="extract publication year")
    key_words: str = Field(..., description="generate 5 new key words based on content in Abstract")
    summary: str = Field(..., description="generate summary in 3 sentences")
    research_area: str = Field(..., description="generate 1 main research area described in article")
    quality: str = Field(..., description="select one value from provided examples to define quality of article", examples=['GOOD', 'BAD', 'EXCELLENT', 'CAN NOT SET QUALITY'])
    quality_reason: str = Field(..., description="describe reason for chosen quality_score in 1 sentece")

generator = generate.json(model, Metadata, whitespace_pattern="")
prompt = """
                Article is delimited by (start) and (stop):
                (start)
                {{TEXT}}
                (stop)
                Generate output based on the Article corresponding to the Metadata class
                """
                
result = generator(prompt, max_tokens=5000)

dict(result)

#---------------------------------------------------------------------------------
# with model_mistral
# NOTE output has correc headings but content is not based on article provided
# {'title': 'Document title',
 #'authors': 'John J. Nixon, John W. Inplace,David R. None, and Stephen M. S. oninc',
 #'pub_year': 2016,
 #'key_words': 'text',
 #'summary': 'This is a rjohnsntroducttory document how to use and develop OpenCadflow environment',
 #'research_area': 'science.agriculture',
 #'quality': 'submitted_document',
 #'quality_reason': 'Copy from other publications without changes'}

 #--------------------------------------------------------------------------------
 # with model
 #{'title': '',
 #'authors': '',
 #'pub_year': 2000,
 #'key_words': '',
 #'summary': '',
 #'research_area': '',
 #'quality': '',
 #'quality_reason': ''}
 

{'title': '',
 'authors': '',
 'pub_year': 2015,
 'key_words': '',
 'summary': '',
 'research_area': '',
 'quality': 'A',
 'quality_reason': 'Well-written, logically organized, and the data is supportable and presented adequately. The article also includes a substantial new contribution to the field. '}

with prompt where field name and description are manually inserted into the prompt from Pydantic class

In [14]:
import outlines
import outlines.generate
import outlines.generate.json
from pydantic import BaseModel, Field
from typing import List

class Metadata(BaseModel):
    title: str = Field(..., description="extract title from article")
    authors: str = Field(..., description="extract authors from article")
    pub_year: int = Field(..., description="extract publication year")
    key_words: str = Field(..., description="generate 5 new key words based on content in Abstract")
    summary: str = Field(..., description="generate summary in 3 sentences")
    research_area: str = Field(..., description="generate 1 main research area described in article")
    quality: str = Field(..., description="select one value from provided examples to define quality of article", examples=['GOOD', 'BAD', 'EXCELLENT', 'CAN NOT SET QUALITY'])
    quality_reason: str = Field(..., description="describe reason for chosen quality_score in 1 sentece")

@outlines.prompt
def get_metadata(text):
    """Article is delimited by (start) and (stop):
    (start)
    {{text}}
    (stop)
    Based on the article content, generate fields listed below according to their descriptions. 
    title: description="extract title from article"
    authors: description="extract authors from article"
    pub_year: description="extract publication year"
    key_words: description="generate 5 new key words based on content in Abstract"
    summary: description="generate summary in 3 sentences"
    research_area: description="generate 1 main research area described in article"
    quality: description="select one value from ['GOOD', 'BAD', 'EXCELLENT', 'CAN NOT SET QUALITY'] to define quality of article"
    quality_reason: description="describe reason for chosen quality_score in 1 sentece"
    """
prompt = get_metadata(TEXT)
generator = outlines.generate.json(model, Metadata, whitespace_pattern="")
result = generator(prompt, max_tokens=5000)

result.model_dump()

#------------------------------------------------------------------------------------------------------------------------------
# NOTE with model_mistral good quality output (just 1st page used)
#{'title': 'Visualization of breast cancer-related protein synthesis from  the perspective   of bibliometric analysis ',
# 'authors': 'Jiawei Xu, Chengdong Yu, Xiaoqiang Zeng, Weifeng Tang, Siyi Xu, Lei Tang, Yanxiao Huang, Zhengkui Sun , Tenghua Yu*',
# 'pub_year': 2023,
# 'key_words': 'breast cancer, cancer, protein, expression, translation',
# 'summary': 'The Our analysis of the relationship between protein expression in breast cancer and the  development and  treatment of tumors.',
# 'research_area': 'oncology, hematology',
# 'quality': 'EXCELLENT',
# 'quality_reason': 'Presentation of content is good. '}

#----------------------------------------------------------------------------------------------------------------------------
# NOTE with model partially satisfactory output (10s, just 1st page used)
#{'title': ': ',
# 'authors': ': [{',
# 'pub_year': 2023,
# 'key_words': ': [',
# 'summary': ': This article presents a bibliometric analysis of the literature on breast cancer and protein synthesis, uncovering trends and key topics in the field. The research reveals the burgeoning interest in understanding the relationship between protein expression and breast cancer development and treatment. Key findings include the increase in publications since 2003 and the dominance of research in oncology and biology journals.',
# 'research_area': ': Breast cancer and protein synthesis bibliometrics',
# 'quality': ': GOOD',
# 'quality_reason': ': Detailed analysis and well-written'}

#----------------------------------------------------------------------------------------------------------------------------
# NOTE with model partially satisfactory output (2 min for whole article)
#{'title': ': ',
# 'authors': ': [',
# 'pub_year': 2023,
# 'key_words': ', breast cancer, bibliometric analysis, protein synthesis, breast cancer-related protein synthesis',
# 'summary': ': This study conducted a bibliometric analysis of breast cancer-related protein synthesis literature from 2003 to 2022 using data obtained from the Web of Science Core Collection. The analysis revealed an increasing trend in the number of publications over time, with the majority being published in oncology or biology-related journals. The most influential journals included the Journal of Biological Chemistry, Cancer Research, and Proceedings of the National Academy of Sciences of the United States of America. Significant authors and collaborative networks were identified, and keywords identified the main research areas of breast cancer, protein synthesis, and breast cancer-related protein synthesis.',
# 'research_area': ': Breast cancer-related protein synthesis',
# 'quality': ': EXCELLENT',
# 'quality_reason': ': The article provides a clear and detailed analysis of breast cancer-related protein synthesis literature using bibliometric tools and techniques, resulting in valuable insights for researchers and clinical practitioners in the field.'}

{'title': ': ',
 'authors': ': [',
 'pub_year': 2023,
 'key_words': ', breast cancer, bibliometric analysis, protein synthesis, breast cancer-related protein synthesis',
 'summary': ': This study conducted a bibliometric analysis of breast cancer-related protein synthesis literature from 2003 to 2022 using data obtained from the Web of Science Core Collection. The analysis revealed an increasing trend in the number of publications over time, with the majority being published in oncology or biology-related journals. The most influential journals included the Journal of Biological Chemistry, Cancer Research, and Proceedings of the National Academy of Sciences of the United States of America. Significant authors and collaborative networks were identified, and keywords identified the main research areas of breast cancer, protein synthesis, and breast cancer-related protein synthesis.',
 'research_area': ': Breast cancer-related protein synthesis',
 'quality': ': EXCELLENT',
 'quality_reaso

with prompt with Pydantc class schema

In [11]:
# https://outlines-dev.github.io/outlines/reference/prompting/

import outlines
import outlines.generate
import outlines.generate.json
from pydantic import BaseModel, Field
from typing import List

class Metadata(BaseModel):
    title: str = Field(..., description="extract title from article")
    authors: str = Field(..., description="extract authors from article")
    pub_year: int = Field(..., description="extract publication year")
    key_words: str = Field(..., description="generate 5 new key words based on content in Abstract")
    summary: str = Field(..., description="generate summary in 3 sentences")
    research_area: str = Field(..., description="generate 1 main research area described in article")
    quality: str = Field(..., description="select one value from examples to define quality of article", examples=['GOOD', 'BAD', 'EXCELLENT', 'CAN NOT SET QUALITY']) # NOTE examples not included in propt
    quality_reason: str = Field(..., description="describe reason for chosen quality_score in 1 sentece")

@outlines.prompt
def get_metadata(text, pydantic_model):
    """
    Article is delimited by (start) and (stop):
    (start)
    {{text}}
    (stop)
    Based on the article content, generate fields listed below according to their descriptions. 
    {{ pydantic_model | schema }}
    """

prompt = get_metadata(TEXT, Metadata)
generator = outlines.generate.json(model, Metadata, whitespace_pattern="")
result = generator(prompt, max_tokens=5000)

result.model_dump()

#---------------------------------------------------------------------------------------------
# TODO NOTE: with model_mistral the output is different with  {{ pydantic_model | schema }} and manually inserted fields from example above even though printed prompt looks the same
#{'title': 'Breast cancer and protein synthesis from the perspective of bibliometric analysis', # NOTE-> not correct
# 'authors': 'Xu, X., Yu, C., Zeng, X., Tang, W., Xu, S., Tang, L., Huang, Y., Sun, Z. and Yu, T.',  # NOTE -> first name intitials only
# 'pub_year': 2023,
# 'key_words': 'Breast cancer; gene expression; c-myc and bcl-xL; chart; Eastern Blot; Oncogene; Molecular Cell Biology; hypoxia; Cell Division; Cancer research', # NOTE -> strange and not 5 words
# 'summary': "This article presents a bibliometric analysis of research studies related to breast cancer and protein synthesis, using the Web of Science database for the period of 2003 to 2022. Over 2900 articles were retrieved containing the keywords 'breast cancer' and 'protein synthesis' in the title, abstract, and keywords. The number of publications related to breast cancer and protein synthesis has increased steadily, and the most prominent research topics include the relationship between protein expression and tumor development and treatment, the regulation of protein synthesis by hypoxia, the mechanisms and advances related to various oncogenes, c-myc, and bcl-xl, the interactions between cells and cancer cells, and the functions of various proteins in cancer-affected pathways. Notable journals associated with breast cancer and protein synthesis include the Journal of Biological Chemistry, Cancer Research, the Proceedings of the National Academy of Sciences of the United States of America, and Oncogene.",
# 'research_area': 'Oncology',
# 'quality': 'good',
# 'quality_reason': 'The article uses a well-established database and is well-written and referenced.'}

#--------------------------------------------------------------------------------------------------
# NOTE: with model partially satisfactory output
#{'title': '',
# 'authors': '',
# 'pub_year': 2023,
# 'key_words': '[{\\',
# 'summary': '',
# 'research_area': 'Breast cancer research',
# 'quality': 'High',
# 'quality_reason': 'Meets all the standards for a high-quality article, including having a clear and concise abstract, using appropriate and relevant research keywords, and reporting accurate and reliable research results'}

# TODO: detta example med få med examples

{'title': '',
 'authors': '',
 'pub_year': 2023,
 'key_words': '[{\\',
 'summary': '',
 'research_area': 'Breast cancer research',
 'quality': 'High',
 'quality_reason': 'Meets all the standards for a high-quality article, including having a clear and concise abstract, using appropriate and relevant research keywords, and reporting accurate and reliable research results'}

In [12]:
# TODO: result -> pydantic

---
---

In [None]:
Metadata.model_fields['quality'].examples

INSPIRATION

In [None]:
generator = generate.choice(model, ["skirt", "dress", "pen", "jacket"])
generator = generate.text(model) # does not work with Pydantic
generator = generate.regex(model, r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",)
generator = generate.format(model, int)

result = generator(prompt, max_tokens=10)

#-----------------------------------------------------------------------------------------------
def add(a: int, b: int):
    return a + b

model = outlines.models.transformers("WizardLM/WizardMath-7B-V1.1")
generator = outlines.generate.json(model, add)
result = generator("Return json with two integers named a and b respectively. a is odd and b even.")

print(add(**result))
# 3

# -----------------------------------------------------------------------------------------------
import outlines

examples = [
    ("The food was disgusting", "Negative"),
    ("We had a fantastic night", "Positive"),
    ("Recommended", "Positive"),
    ("The waiter was rude", "Negative")
]

@outlines.prompt
def labelling(to_label, examples):
    """You are a sentiment-labelling assistant.

    {% for example in examples %}
    {{ example[0] }} // {{ example[1] }}
    {% endfor %}
    {{ to_label }} //
    """

model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.2")
prompt = labelling("Just awesome", examples)
answer = outlines.generate.text(model)(prompt, max_tokens=100)