### MARKDOWN


In [9]:
# 테이블 설정 로드
from omegaconf import OmegaConf
config = OmegaConf.load("config/credentials.yml")

In [2]:
#
from transformers import pipeline
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')

labels = ['spam', 'not spam']
hypothesis_template = 'This email is {}.'
email = 'click this link for sale!'
results = classifier(
        email, labels, hypothesis_template=hypothesis_template
    )

In [9]:
results['labels'][0]

'not spam'

In [26]:
from openai import OpenAI
import pandas as pd

# OpenAI API 키 설정
api_key = config['openai_key']
client = OpenAI(api_key=api_key)


# 데이터
data = {
    "PRODUCT ID": ["B001E4KFG0", "B00813GRG4"],
    "USER ID": ["A3SGXH7AUHU8GW", "A1D87F6ZCVE5NK"],
    "SCORE": [5, 1],
    "SUMMARY": ["Good Quality Dog Food", "Not as Advertised"],
    "TEXT": [
        "I have bought several of the Vitality canned...",
        "Product arrived labeled as Jumbo Salted Peanut..."
    ]
}

# DataFrame 생성
df = pd.DataFrame(data)
df["combined"] = (
    "Title: " + df.SUMMARY.str.strip() + "; Content: " + df.TEXT.str.strip()
)


def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding


df['ada_embedding'] = df.combined.apply(lambda x: get_embedding(x, model='text-embedding-3-small'))


# 여러 텍스트에 대한 임베딩 얻기
embeddings = [get_embedding(text) for text in texts]

# for text, embedding in zip(texts, embeddings):
#     print(f"Text: {text}")
#     print(f"Embedding: {embedding}")


### 사전 학습모델로 임베딩하기

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    'sentence-transformers/multi-qa-mpnet-base-cos-v1'
)

docs =[
    "Around 9 million people live in London",
    "London is known for its financial district"
]

doc_emb = model.encode(
    docs,
    batch_size=32,
    show_progress_bar=True
)

In [53]:
import PyPDF2
from tqdm import tqdm

with open('./data/pds2.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)

    principles_of_ds = ''

    for page in tqdm(reader.pages):
        text = page.extract_text()

        principles_of_ds += '\n\n' + text[text.find(' ]')+2:]

principles_of_ds = principles_of_ds.strip()

100%|██████████| 428/428 [13:01<00:00,  1.83s/it]


### 중첩을 포함하는 또는 포함하지 않는 교과서 분할하기

In [54]:
def overlapping_chunks(text, max_tokens = 500, overlapping_factor = 5):
    '''
    max_tokens: 각 조강에 들어갈 최대 토큰 수
    overlapping_factor: 각 조각이 시작할 때 이전 청크와 중첩되는 문장의 숫자
    '''

    # Split the text using punctuation
    sentences = re.split(r' *[\.\?!][\'"\)\]]* *', text)

    #BERT로 token화 -> 사전 학습된 언어 모델의 어휘(vocabulary)에 매핑,  각 숫자는 텍스트의 특정 단어 또는 하위 단어(subword)에 해당
    # 각 문장의 토큰 수 얻기
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]

    chunks, tokens_so_far, chunk = [], 0, []

    # 튜플로 결합된 문장과 토큰을 반복해서 처리하기
    for sentence, token in zip(sentences, n_tokens):
        if tokens_so_far + token > max_tokens:
            #max_token수를 넘기면 chunk들을 paste 후 chunks 목록에 추가하기
            #chunk 리셋
            chunks.append(". ".join(chunk) + ".")
            if overlapping_factor > 0:
                chunk = chunk[-overlapping_factor:]
                tokens_so_far = sum([len(tokenizer.encode(c)) for c in chunk])
            else:
                chunk = []
                tokens_so_far = 0
        
        if token > max_tokens:
            continue

        chunk.append(sentence)
        tokens_so_far += token + 1

    if chunk:
        chunks.append(". ".join(chunk) + ".")
    
    return chunks


In [56]:
split = overlapping_chunks(principles_of_ds, overlapping_factor=0)
avg_length =sum([len(tokenizer.encode(t)) for t in split]) / len(split)

In [57]:
print(f'non-overlapping chunking approach has {len(split)} documents with average length {avg_length:.1f} tokens')

non-overlapping chunking approach has 299 documents with average length 443.0 tokens


In [59]:
split = overlapping_chunks(principles_of_ds, overlapping_factor=5)
avg_length =sum([len(tokenizer.encode(t)) for t in split]) / len(split)

In [60]:
print(f'overlapping chunking approach has {len(split)} documents with average length {avg_length:.1f} tokens')

overlapping chunking approach has 420 documents with average length 451.4 tokens
