# Outlines

https://github.com/outlines-dev/outlines \
https://outlines-dev.github.io/outlines/

### CONTENT:
Investigation of Outlines for extraction of structured output from text such as scientific articles.

* model_mistral_Q6 is mistral-7b-instruct-v0.2.Q6_K.gguf
* model_mistral is mistralai/Mistral-7B-v0.1
* model_phi3 is Phi-3-mini-128k-instruct

### RESULTS/COMMENTS:
* description for each field is needed in prompt. Just a general instruction to the model results in halucinated results.
* compatible with Pydantic
* unstable output even for low temperature, seldomly perfect output
* output ~25 s basen on front page of article and 1min - 2 min 30 s basen on full article for model_mistral_Q6
* simple query syntax
* well maintained
* for correct generation with mistral-7b-instruct-v0.2.Q6_K.gguf need:
    * in generate.json(whitespace_pattern="") - to help smaller LLMs
    * in generator(max_tokens=1000) - so json is not intrupted before fully generated; llama_cpp i class Llama create_completion(max_token=16, temperature = 0.8, echo=False, ...) https://github.com/abetlen/llama-cpp-python/blob/2138561fab5e60672c63b6c446b62a8bc26e17c4/llama_cpp/llama.py#L1431
* default temperature is 0.8 with llama-cpp-python unless other is stated

---
---


In [1]:
from outlines import models, generate
from llama_cpp import Llama

## Models

In [None]:
llm = Llama("/home/dorota/models/mistral-7b-instruct-v0.2.Q6_K.gguf", n_gpu_layers=10, n_ctx=0, verbose=False)
model_mistral_Q6 = models.LlamaCpp(llm) 

In [2]:
model_mistral = models.transformers("mistralai/Mistral-7B-v0.1")#, device='auto')

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

max_memory_mapping = {0: "8GB"}

llm = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-128k-instruct", device_map='auto', trust_remote_code=True, max_memory = max_memory_mapping)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
model_phi3 = models.Transformers(llm, tokenizer)

## Tutorial examples

In [None]:
from pydantic import BaseModel, Field
from outlines import models, generate

class User(BaseModel):
    first_name: str
    last_name: str
    id: int

generator = generate.json(model_mistral_Q6, User, whitespace_pattern="")

result = generator(
    """Based on user information create a user profile with the fields first_name, last_name.
    User information is: Jane Doe 123""", max_tokens=1000
)

print(result)

## Pydantic and article text as input

In [7]:
from pypdf import PdfReader 
  
reader = PdfReader('/home/dorota/LLM-diploma-project/00_concept_tests/data/40001_2023_Article_1364.pdf') 
num_pages = len(reader.pages)
TEXT = ""
for page_num in range(1): #change to range(num_pages) for whole document
    page = reader.pages[page_num]  
    TEXT += page.extract_text()

with own text and a general instruction in prompt

In [None]:
from pydantic import BaseModel, Field
from typing import List

class Metadata(BaseModel):
    title: str = Field(..., description="extract title from article")
    authors: str = Field(..., description="extract authors from article")
    pub_year: int = Field(..., description="extract publication year")
    key_words: str = Field(..., description="generate 5 new key words based on content in Abstract")
    summary: str = Field(..., description="generate summary in 3 sentences")
    research_area: str = Field(..., description="generate 1 main research area described in article")
    quality: str = Field(..., description="select one value from provided examples to define quality of article", examples=['GOOD', 'BAD', 'EXCELLENT', 'CAN NOT SET QUALITY'])
    quality_reason: str = Field(..., description="describe reason for chosen quality_score in 1 sentece")

generator = generate.json(model_mistral_Q6, Metadata, whitespace_pattern="")
prompt = """
                Article is delimited by (start) and (stop):
                (start)
                {{TEXT}}
                (stop)
                Generate output based on the Article corresponding to the Metadata class
                """
                
result = generator(prompt, max_tokens=5000)

dict(result)

#---------------------------------------------------------------------------------
# with model_mistral
# NOTE output has correc headings but content is not based on article provided
# {'title': 'Document title',
 #'authors': 'John J. Nixon, John W. Inplace,David R. None, and Stephen M. S. oninc',
 #'pub_year': 2016,
 #'key_words': 'text',
 #'summary': 'This is a rjohnsntroducttory document how to use and develop OpenCadflow environment',
 #'research_area': 'science.agriculture',
 #'quality': 'submitted_document',
 #'quality_reason': 'Copy from other publications without changes'}

 #--------------------------------------------------------------------------------
 # with model_mistral_Q6
#{'title': '',
# 'authors': '',
# 'pub_year': 2015,
# 'key_words': '',
# 'summary': '',
# 'research_area': '',
# 'quality': 'A',
# 'quality_reason': 'Well-written, logically organized, and the data is supportable and presented adequately. The article also includes a substantial new contribution to the field. '}
 

with prompt where field name and description are manually inserted into the prompt from Pydantic class

In [None]:
import outlines
import outlines.generate
import outlines.generate.json
from pydantic import BaseModel, Field
from typing import List

class Metadata(BaseModel):
    title: str = Field(..., description="extract title from article")
    authors: str = Field(..., description="extract authors from article")
    pub_year: int = Field(..., description="extract publication year")
    key_words: str = Field(..., description="generate 5 new key words based on content in Abstract")
    summary: str = Field(..., description="generate summary in 3 sentences")
    research_area: str = Field(..., description="generate 1 main research area described in article")
    quality: str = Field(..., description="select one value from provided examples to define quality of article", examples=['GOOD', 'BAD', 'EXCELLENT', 'CAN NOT SET QUALITY'])
    quality_reason: str = Field(..., description="describe reason for chosen quality_score in 1 sentece")

@outlines.prompt
def get_metadata(text):
    """Article is delimited by (start) and (stop):
    (start)
    {{text}}
    (stop)
    Based on the article content, generate fields listed below according to their descriptions. 
    title: description="extract title from article"
    authors: description="extract authors from article"
    pub_year: description="extract publication year"
    key_words: description="generate 5 new key words based on content in Abstract"
    summary: description="generate summary in 3 sentences"
    research_area: description="generate 1 main research area described in article"
    quality: description="select one value from ['GOOD', 'BAD', 'EXCELLENT', 'CAN NOT SET QUALITY'] to define quality of article"
    quality_reason: description="describe reason for chosen quality_score in 1 sentece"
    """
prompt = get_metadata(TEXT)
generator = outlines.generate.json(model_mistral_Q6, Metadata, whitespace_pattern="")
result = generator(prompt, max_tokens=5000)

result.model_dump()

#------------------------------------------------------------------------------------------------------------------------------
# NOTE with model_mistral good quality output (just 1st page used)
#{'title': 'Visualization of breast cancer-related protein synthesis from  the perspective   of bibliometric analysis ',
# 'authors': 'Jiawei Xu, Chengdong Yu, Xiaoqiang Zeng, Weifeng Tang, Siyi Xu, Lei Tang, Yanxiao Huang, Zhengkui Sun , Tenghua Yu*',
# 'pub_year': 2023,
# 'key_words': 'breast cancer, cancer, protein, expression, translation',
# 'summary': 'The Our analysis of the relationship between protein expression in breast cancer and the  development and  treatment of tumors.',
# 'research_area': 'oncology, hematology',
# 'quality': 'EXCELLENT',
# 'quality_reason': 'Presentation of content is good. '}

#----------------------------------------------------------------------------------------------------------------------------
# NOTE with model_mistral_Q6 partially satisfactory output (10s, just 1st page used)
#{'title': ': ',
# 'authors': ': [{',
# 'pub_year': 2023,
# 'key_words': ': [', 
# 'summary': ': This article presents a bibliometric analysis of the literature on breast cancer and protein synthesis, uncovering trends and key topics in the field. The research reveals the burgeoning interest in understanding the relationship between protein expression and breast cancer development and treatment. Key findings include the increase in publications since 2003 and the dominance of research in oncology and biology journals.',
# 'research_area': ': Breast cancer and protein synthesis bibliometrics',
# 'quality': ': GOOD',
# 'quality_reason': ': Detailed analysis and well-written'}

#----------------------------------------------------------------------------------------------------------------------------
# NOTE with model_mistral_Q6 partially satisfactory output (2 min for whole article)
#{'title': ': ',
# 'authors': ': [',
# 'pub_year': 2023,
# 'key_words': ', breast cancer, bibliometric analysis, protein synthesis, breast cancer-related protein synthesis',
# 'summary': ': This study conducted a bibliometric analysis of breast cancer-related protein synthesis literature from 2003 to 2022 using data obtained from the Web of Science Core Collection. The analysis revealed an increasing trend in the number of publications over time, with the majority being published in oncology or biology-related journals. The most influential journals included the Journal of Biological Chemistry, Cancer Research, and Proceedings of the National Academy of Sciences of the United States of America. Significant authors and collaborative networks were identified, and keywords identified the main research areas of breast cancer, protein synthesis, and breast cancer-related protein synthesis.',
# 'research_area': ': Breast cancer-related protein synthesis',
# 'quality': ': EXCELLENT',
# 'quality_reason': ': The article provides a clear and detailed analysis of breast cancer-related protein synthesis literature using bibliometric tools and techniques, resulting in valuable insights for researchers and clinical practitioners in the field.'}

with prompt with Pydantc class schema

In [None]:
# https://outlines-dev.github.io/outlines/reference/prompting/

import outlines
import outlines.generate
import outlines.generate.json
from pydantic import BaseModel, Field
from typing import List

class Metadata(BaseModel):
    title: str = Field(..., description="extract title from article")
    authors: List[str] = Field(..., description="extract authors from article")
    pub_year: int = Field(..., description="extract publication year")
    key_words: List[str] = Field(..., description="generate 5 new key words based on content in Abstract", min_length=5, max_length=5)
    summary: str = Field(..., description="generate summary in 3 sentences")
    research_area: str = Field(..., description="generate 1 main research area described in article")
    # quality: str = Field(..., description="select one value from examples to define quality of article", examples=['GOOD', 'BAD', 'EXCELLENT', 'CAN NOT SET QUALITY']) # NOTE examples not included in propt
    quality: str = Field(..., description="select one value from examples ['GOOD', 'BAD', 'EXCELLENT', 'CAN NOT SET QUALITY'] to define quality of article")
    quality_reason: str = Field(..., description="describe reason for chosen quality_score in 1 sentece")

@outlines.prompt
def get_metadata(text, pydantic_model):
    """
    Article is delimited by (start) and (stop):
    (start)
    {{text}}
    (stop)
    Based on the article content, generate fields listed below according to their descriptions. 
    {{ pydantic_model | schema }}
    """

prompt = get_metadata(TEXT, Metadata)
generator = outlines.generate.json(model_mistral_Q6, Metadata, whitespace_pattern="")
result = generator(prompt, max_tokens=5000, temperature = 0.1)

result.model_dump()

#---------------------------------------------------------------------------------------------
# NOTE: with model_mistral the output is different with  {{ pydantic_model | schema }} and manually inserted fields from example above even though printed prompt looks the same (just 1st page used)
#{'title': 'Breast cancer and protein synthesis from the perspective of bibliometric analysis', # NOTE-> not correct
# 'authors': 'Xu, X., Yu, C., Zeng, X., Tang, W., Xu, S., Tang, L., Huang, Y., Sun, Z. and Yu, T.',  # NOTE -> first name intitials only
# 'pub_year': 2023,
# 'key_words': 'Breast cancer; gene expression; c-myc and bcl-xL; chart; Eastern Blot; Oncogene; Molecular Cell Biology; hypoxia; Cell Division; Cancer research', # NOTE -> strange and not 5 words
# 'summary': "This article presents a bibliometric analysis of research studies related to breast cancer and protein synthesis, using the Web of Science database for the period of 2003 to 2022. Over 2900 articles were retrieved containing the keywords 'breast cancer' and 'protein synthesis' in the title, abstract, and keywords. The number of publications related to breast cancer and protein synthesis has increased steadily, and the most prominent research topics include the relationship between protein expression and tumor development and treatment, the regulation of protein synthesis by hypoxia, the mechanisms and advances related to various oncogenes, c-myc, and bcl-xl, the interactions between cells and cancer cells, and the functions of various proteins in cancer-affected pathways. Notable journals associated with breast cancer and protein synthesis include the Journal of Biological Chemistry, Cancer Research, the Proceedings of the National Academy of Sciences of the United States of America, and Oncogene.",
# 'research_area': 'Oncology',
# 'quality': 'good',
# 'quality_reason': 'The article uses a well-established database and is well-written and referenced.'}

#--------------------------------------------------------------------------------------------------
# NOTE: with model_mistral_Q6 partially satisfactory output (just 1st page used)
#{'title': '',
# 'authors': '',
# 'pub_year': 2023,
# 'key_words': '[{\\',
# 'summary': '',
# 'research_area': 'Breast cancer research',
# 'quality': 'High',
# 'quality_reason': 'Meets all the standards for a high-quality article, including having a clear and concise abstract, using appropriate and relevant research keywords, and reporting accurate and reliable research results'}

#--------------------------------------------------------------------------------------------------
# NOTE: model_mistral_Q6, just 1st page used, temperature set to 0.1 -> perfect output (~25 s), but can vary
#{'title': ': Visualization of breast cancer -related protein synthesis from the perspective of bibliometric analysis',
# 'authors': ': Jiawei Xu, Chengdong Yu, Xiaoqiang Zeng, Weifeng Tang, Siyi Xu, Lei Tang, Yanxiao Huang, Zhengkui Sun, Tenghua Yu',
# 'pub_year': 2023,
# 'key_words': ': breast cancer, bibliometric analysis, protein synthesis, expression, translation',
# 'summary': ": This article provides insights into the research on breast cancer and protein synthesis through bibliometric analysis. The number of publications in this area has steadily increased, with most articles published in oncology or biology-related journals. Keyword analysis revealed that 'breast cancer,' 'expression,' 'cancer,' 'protein,' and 'translation' were the most commonly researched topics. The focus of the research is on the relationship between protein expression in breast cancer and tumor development and treatment.",
# 'research_area': ': Oncology and Biology',
# 'quality': ': High',
# 'quality_reason': ': The article is based on a comprehensive bibliometric analysis of a large dataset of articles, providing valuable insights into the current state of research on breast cancer and protein synthesis.'}

#-----------------------------------------------------------------------------------------------------
# NOTE: model_mistral_Q6, just 1st page used, temperature = 0.3, quality: examples included in prompt string -> mostly correct output
#{'title': 'Visualization of breast cancer -related protein synthesis from the perspective of bibliometric analysis',
# 'authors': 'Jiawei Xu, Chengdong Yu, Xiaoqiang Zeng, Weifeng Tang, Siyi Xu, Lei Tang, Yanxiao Huang, Zhengkui Sun, Tenghua Yu',
# 'pub_year': 2023,
# 'key_words': 'breast cancer, bibliometric analysis, protein synthesis, expression, cancer, translation, diagnosis, treatment, biology, research, therapy',
# 'summary': "This article provides insights into the research on breast cancer and protein synthesis by analyzing articles published between 2003 and 2022 in the Web of Science database. The number of publications in this area has been steadily increasing, with most articles published in oncology or biology-related journals. Keyword analysis revealed that 'breast cancer', 'expression', 'cancer', 'protein', and 'translation' were the most commonly researched topics. The research primarily focuses on the relationship between protein expression in breast cancer and tumor development and treatment, yielding essential insights into the biology of breast cancer and the genesis of cutting-edge therapies.",
# 'research_area': 'Breast Cancer Research',
# 'quality': 'GOOD',
# 'quality_reason': 'The article is well-written, provides a clear research question, and presents relevant and accurate data through bibliometric analysis.'}

#--------------------------------------------------------------------------------------------------
# NOTE: model_mistral_Q6, full article, temperature set to 0.1 or 0.5, quality: examples included in prompt string-> partially satisfactory output (120 s)
#{'title': ': ',
# 'authors': ': [',
# 'pub_year': 2023,
# 'key_words': '][',
# 'summary': ': This study aimed to investigate the current status of research on breast cancer-related protein synthesis by conducting a bibliometric analysis of the literature. The analysis revealed a steady increase in the number of publications related to breast cancer and protein synthesis, with a significant surge observed after 2003. The majority of the articles were published in oncology or biology-related journals, with the most publications in Journal of Biological Chemistry, Cancer Research, Proceedings of the National Academy of Sciences of the United States of America, and Oncogene. Keyword analysis revealed that “breast cancer,” “expression,” “cancer,” “protein,” and “translation” were the most commonly researched topics. The study underscores the burgeoning interest in this research and its importance in the diagnosis and treatment of breast cancer.',
# 'research_area': ': breast cancer research',
# 'quality': ': GOOD',
# 'quality_reason': ': The article provides a comprehensive analysis of the current state of research on breast cancer-related protein synthesis, using bibliometric analysis to identify trends and key areas of focus.'}

#--------------------------------------------------------------------------------------------------
# NOTE: model_mistral_Q6, full article, temperature set to 0.1, quality: examples included in prompt string, List[str] for authors and key_words -> all correct output except for title (2 min 47 s)
#{'title': ': ',
# 'authors': ['Jiawei Xu',  'Chengdong Yu',  'Xiaoqiang Zeng',  'Weifeng Tang',  'Si-yi Xu',  'Lei Tang',  'Yanxiao Huang',  'Zhengkui Sun',  'Tenghua Yu'],
# 'pub_year': 2023,
# 'key_words': ['Breast cancer',  'Bibliometric analysis',  'Protein synthesis',  'Breast cancer-related protein synthesis',  'Breast cancer research'],
# 'summary': ': This study aimed to identify the importance of breast cancer-related protein synthesis and address research questions related to publication output, influential journals, top institutions and countries, prominent authors, and key areas of focus. The authors conducted a bibliometric analysis of breast cancer-related protein synthesis research using the Web of Science Core Collection database from 2003 to 2022. They found a steady increase in the number of publications, with a significant surge after 2003. The most influential journals were the Journal of Biological Chemistry, Cancer Research, Proceedings of the National Academy of Sciences of the United States of America, and Oncogene. The top institutions were McGill University, Harvard University, and the National Cancer Institute, and the top countries were the United States, China, and Canada. The most prominent authors were Schneider, Robert J, Sonenberg Nahum, and Ramon y Cajal Santiago. The key areas of focus were synthesis and regulatory mechanisms of breast cancer-related proteins, protein synthesis and cellular processes in breast cancer progression and metastasis, and apoptotic regulation and related cancer research in breast cancer cells.',
# 'research_area': ': Medical Research, specifically in the field of breast cancer and protein synthesis.',
# 'quality': ': EXCELLENT',
# 'quality_reason': ': The article provides a comprehensive analysis of breast cancer-related protein synthesis research using bibliometric analysis. The study covers various aspects of the research, including publication output, influential journals, top institutions and countries, prominent authors, and key areas of focus. The findings are presented in a clear and concise manner, making it an excellent resource for researchers and scholars in the field.'}


with prompt using model_json_schema to include "examples" via QualityEnum

In [None]:
import outlines
import outlines.generate
import outlines.generate.json
from pydantic import BaseModel, Field
from enum import Enum
from typing import List
import json

class QualityEnum(str, Enum):
    good = 'GOOD'
    bad = 'BAD'
    excellent = 'EXCELLENT'
    can_not_set_quality = 'CAN NOT SET QUALITY'

class Metadata(BaseModel):
    title: str = Field(..., description="extract title from article")
    authors: List[str] = Field(..., description="extract authors from article")
    # authors: str = Field(..., description="extract authors from article")
    pub_year: int = Field(..., description="extract publication year")
    key_words: List[str] = Field(..., description="generate 5 new key words based on content in Abstract", min_length=5, max_length=5)
    # key_words: str = Field(..., description="generate 5 new key words based on content in Abstract")
    summary: str = Field(..., description="generate summary in 3 sentences")
    research_area: str = Field(..., description="generate 1 main research area described in article")
    quality: QualityEnum
    quality_reason: str = Field(..., description="describe reason for chosen quality_score in 1 sentece")

@outlines.prompt
def get_metadata(text):
    """
    Article is delimited by (start) and (stop):
    (start)
    {{text}}
    (stop)
    Based on the article content, generate fields listed below according to their descriptions. 
    """

prompt = get_metadata(TEXT)
generator = outlines.generate.json(model_mistral, json.dumps(Metadata.model_json_schema()), whitespace_pattern="")
result = generator(prompt, max_tokens=5000)# , temperature = 0.1) # type result is dict

result_pydantic = Metadata(**result)
result_pydantic.model_dump()

#-----------------------------------------------------------------------------------------------------------------
# NOTE: model_mistral_Q6, TEXT is only front page, temperature = 0.1 -> unstable output (1 min)
# exemple 1:
#{'title': ': ',
# 'authors': ['Jiawei Xu', 'Chengdong Yu', 'Xiaoqiang Zeng', 'Weifeng Tang', 'Siyi Xu', 'Lei Tang', 'Yanxiao Huang', 'Zhengkui Sun', 'Tenghua Yu'],
# 'pub_year': 2023,
# 'key_words': ['Breast cancer', 'Bibliometric analysis',  'Protein synthesis',  'Introduction',  'Abstract'],
# 'summary': ": The article provides an overview of the research on breast cancer and protein synthesis using bibliometric analysis. The authors searched the Web of Science database for articles containing the keywords 'breast cancer' and 'protein synthesis' between 2003 and 2022. They analyzed the publication output, citation counts, co-citation analysis, and keyword analysis of the articles. The results showed that the number of publications in this area has been increasing, with most articles published in oncology or biology-related journals. The most common topics of research were 'breast cancer,' 'expression,' 'cancer,' 'protein,' and 'translation.' The authors concluded that the research on breast cancer and protein synthesis is of great importance for understanding the development and treatment of breast cancer.",
# 'research_area': ': The research area is breast cancer and protein synthesis.',
# 'quality': 'BAD',
# 'quality_reason': ": The article is a research article and the authors have provided a clear objective, used appropriate research methods, and presented the results in a logical and coherent manner. However, the article lacks a clear hypothesis or theoretical framework, and the discussion could be more comprehensive. Therefore, the quality is assessed as 'Below Average'."}
#
# example 2:
#{'title': 'Visualization of breast cancer -related protein synthesis from the perspective of bibliometric analysis',
# 'authors': ['Jiawei Xu', 'Chengdong Yu','Xiaoqiang Zeng','Weifeng Tang', 'Siyi Xu', 'Lei Tang', 'Yanxiao Huang', 'Zhengkui Sun', 'Tenghua Yu'],
# 'pub_year': 2023,
# 'key_words': ['Breast cancer, Bibliometric analysis, Protein synthesis', 'Introduction:'
#  'Breast cancer is the most common cancer in women worldwide and the number of patients increased year by year. It is a complex disease that can be caused by a variety of factors, including genetic mutations, hormonal imbalances, and lifestyle choices. One of the key factors in the development and progression of breast cancer is the overexpression of certain proteins. In this article, we undertake a bibliometric analysis of the literature on breast cancer and protein synthesis, aiming to provide crucial insights into this esoteric realm of investigation. Our approach was to scour the Web of Science database, between 2003 and 2022, for articles containing the keywords “breast cancer” and “protein synthesis” in their title, abstract, or keywords. We deployed bibliometric analysis software, exploring a range of measures such as publication output, citation counts, co-citation analysis, and keyword analysis. Our search yielded 2998 articles that met our inclusion criteria. The number of publications in this area has steadily increased, with a significant rise observed after 2003. Most of the articles were published in oncology or biology-related journals, with the most publications in Journal of Biological Chemistry, Cancer Research, Proceedings of the National Academy of Sciences of the United States of America, and Oncogene. Keyword analysis revealed that “breast cancer,” “expression,” “cancer,” “protein,” and “translation” were the most commonly researched topics. In conclusion, our bibliometric analysis of breast cancer and related protein synthesis literature underscores the burgeoning interest in this research. The focus of the research is primarily on the relationship between protein expression in breast cancer and the development and treatment of tumors. These studies have been instrumental in the diagnosis and treatment of breast cancer. Sustained research in this area will yield essential insights into the biology of breast cancer and the genesis of cutting-edge therapies.',
#  'abstract_text_shortened_for_summary_purposes_only_if_needed_else_leave_blank_or_null_value_for_this_field_:',
#  'The article provides a bibliometric analysis of the literature on breast cancer and protein synthesis, aiming to provide insights into this research area. The authors searched the Web of Science database for articles containing the keywords “breast cancer” and “protein synthesis” between 2003 and 2022. They found 2998 articles that met their inclusion criteria and analyzed the publication output, citation counts, co-citation analysis, and keyword analysis. The number of publications in this area has steadily increased, with a significant rise observed after 2003. The most common topics researched were “breast cancer,” “expression,” “cancer,” “protein,” and “translation.” The focus of the research is on the relationship between protein expression in breast cancer and tumor development and treatment. The findings of this research have been instrumental in the diagnosis and treatment of breast cancer, and sustained research in this area will yield essential insights into the biology of breast cancer and the development of cutting-edge therapies. '],
# 'summary': '',
# 'research_area': '',
# 'quality': 'BAD', OR 'quality': <QualityEnum.bad: 'BAD'>,
# 'quality_reason': ''}

#---------------------------------------------------------------------------------------------------------------------
# NOTE: model_mistral_Q6, TEXT is only front page, temperature = 0.1 str instead of List[str] for authors and key_words -> partially good output (25s)
#{'title': 'Visualization of breast cancer -related protein synthesis from the perspective of bibliometric analysis',
# 'authors': 'Jiawei Xu, Chengdong Yu, Xiaoqiang Zeng, Weifeng Tang, Siyi Xu, Lei Tang, Yanxiao Huang, Zhengkui Sun, Tenghua Yu',
# 'pub_year': 2023,
# 'key_words': 'Breast cancer, Bibliometric analysis, Protein synthesis',
# 'summary': '',
# 'research_area': '',
# 'quality': 'BAD',
# 'quality_reason': 'The article lacks a clear research question or hypothesis, and the methods section is not detailed enough to replicate the study. Additionally, the article does not provide any new insights or findings beyond what is already known in the field. The bibliometric analysis is a descriptive analysis and does not provide any causal relationships or explanations for the trends observed. The article could benefit from a more focused research question, a more detailed methods section, and a discussion of the implications of the findings for the field of breast cancer research.'}

#---------------------------------------------------------------------------------------------------------------------
# NOTE: model_mistral_Q6, TEXT is full article, temperature = 0.1 str instead of List[str] for authors and key_words -> correct output, but missing output (90 s)
#{'title': 'Visualization of breast cancer -related protein synthesis from the perspective of bibliometric analysis',
# 'authors': 'Jiawei Xu, Chengdong Yu, Xiaoqiang Zeng, Weifeng Tang, Siyi Xu, Lei Tang, Yanxiao Huang, Zhengkui Sun, Tenghua Yu',
# 'pub_year': 2023,
# 'key_words': 'Breast cancer, Bibliometric analysis, Protein synthesis', <- NOTE should be 5 keywords
# 'summary': '',
# 'research_area': '',
# 'quality': <QualityEnum.bad: 'BAD'>,
# 'quality_reason': ''}

#---------------------------------------------------------------------------------------------------------------------
# NOTE: model_mistral_Q6, TEXT is full article, temperature = 0.1 List[str] for authors and key_words -> correct output, but missing output (2 min)
#{'title': '',
# 'authors': ['Jiawei Xu', 'Chengdong Yu', 'Xiaoqiang Zeng', 'Weifeng Tang', 'Siyi Xu', 'Lei Tang', 'Yanxiao Huang', 'Zhengkui Sun', 'Tenghua Yu'],
# 'pub_year': 2023,
# 'key_words': ['Breast cancer', 'Bibliometric analysis', 'Protein synthesis', 'Breast cancer-related protein synthesis', 'Publication output trend'],
# 'summary': '',
# 'research_area': '',
# 'quality': <QualityEnum.bad: 'BAD'>,
# 'quality_reason': 'The article provides a comprehensive bibliometric analysis of breast cancer-related protein synthesis research, including publication output trend, influential journals, top institutions and countries, prominent authors, and key research areas. The study sheds light on current focus areas and areas of interest in breast cancer-related protein synthesis research, which can further contribute to the understanding, diagnosis, and treatment of breast cancer. However, the article lacks a clear research question and a more detailed discussion of the findings and their implications. Additionally, the article could benefit from a more rigorous literature review and a more nuanced analysis of the data. Overall, the article is of average quality.'}


# -----------------------------------------------------------------------------------------------------------------
# NOTE: model_mistral, TEXT is only front page -> CUDA out of memory (halucinating?) OK on CPU
# {'title': 'Xu\xa0et\xa0al.  European  Journal of Medical Research ',
#  'authors': ['Xiaoyan  Gao','Hongyan Wang','Lining Gao','Junping Yang','Chunguang  Wu'],
#  'pub_year': 2023,
#  'key_words': ['European  Journal of Medical Research','bibliometric analysis',  'medicine','bibliometrics',' federative  literature databases'],
#  'summary': 'Article is delimited by  \\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nBreast cancer,  as a daunting global health threat,\\[more] ',
#  'research_area': '',
#  'quality': <QualityEnum.bad: 'BAD'>,
#  'quality_reason': 'Passive voice count - 34\\nI'}

#------------------------------------------------------------------------------------------------------------------
# NOTE: model_phi3, TEXT is only front page -> CUDA out of memory (halucinating?)



---
---

In [None]:
Metadata.model_fields['quality'].examples

INSPIRATION

In [None]:
generator = generate.choice(model, ["skirt", "dress", "pen", "jacket"])
generator = generate.text(model) # does not work with Pydantic
generator = generate.regex(model, r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",)
generator = generate.format(model, int)

result = generator(prompt, max_tokens=10)

#-----------------------------------------------------------------------------------------------
def add(a: int, b: int):
    return a + b

model = outlines.models.transformers("WizardLM/WizardMath-7B-V1.1")
generator = outlines.generate.json(model, add)
result = generator("Return json with two integers named a and b respectively. a is odd and b even.")

print(add(**result))
# 3

# -----------------------------------------------------------------------------------------------
import outlines

examples = [
    ("The food was disgusting", "Negative"),
    ("We had a fantastic night", "Positive"),
    ("Recommended", "Positive"),
    ("The waiter was rude", "Negative")
]

@outlines.prompt
def labelling(to_label, examples):
    """You are a sentiment-labelling assistant.

    {% for example in examples %}
    {{ example[0] }} // {{ example[1] }}
    {% endfor %}
    {{ to_label }} //
    """

model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.2")
prompt = labelling("Just awesome", examples)
answer = outlines.generate.text(model)(prompt, max_tokens=100)