In [1]:
import pandas as pd
from datasets import load_dataset, Dataset

In [2]:
dataset = load_dataset("Seongill/nq_squad", split="train")
df = pd.DataFrame(dataset)
df = df.drop_duplicates(subset=['question'], keep='first')
dataset = Dataset.from_pandas(df)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'masked_query', 'query_embedding', '__index_level_0__'],
    num_rows: 97888
})

In [3]:
dataset = dataset.remove_columns(["__index_level_0__"])
dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'masked_query', 'query_embedding'],
    num_rows: 97888
})

In [4]:
dataset.push_to_hub("SQuAD_unique_questions")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

In [55]:
api = wandb.Api()
run_path = "athjk3/rag/yx8t14jr"
run = api.run(run_path)
table_name = run_path.split("/")[-1]
table = run.logged_artifacts()[1]
table_dir = table.download()
table_path = f"artifacts/run-{table_name}-rawdata:v0/raw-data.table.json"
with open(table_path) as file:
    json_dict = json.load(file)
df = pd.DataFrame(json_dict["data"], columns=json_dict["columns"])
cal_metric(df)

[34m[1mwandb[0m:   1 of 1 files downloaded.  


EM: 1.636
EM-Un: 0.116
Acc: 38.551
Acc-Un: 2.091


In [31]:
def cal_metric(data: pd.DataFrame):
    em_ans = data[data.answers != "unanswerable"].is_exact_match.mean().round(5)*100
    em_unans = data[data.answers == "unanswerable"].is_exact_match.mean().round(5)*100
    acc_ans = data[data.answers != "unanswerable"].is_accurate.mean().round(5)*100
    acc_unans = data[data.answers == "unanswerable"].is_accurate.mean().round(5)*100
    print("EM:",em_ans)
    print("EM-Un:", em_unans)
    print("Acc:", acc_ans)
    print("Acc-Un:", acc_unans)

In [5]:
import joblib
data = joblib.load("test.joblib")

In [8]:
data['who got the first nobel prize in physics']['conflict_case']

[{'question': 'who was a professor of the University of Washington?',
  'original_context': 'The late Turrell V. Wylie, a former professor of the University of Washington, and Li Tieh-tseng argue that the reliability of the heavily censored History of Ming as a credible source on Sino-Tibetan relations is questionable, in the light of modern scholarship. Other historians also assert that these Ming titles were nominal and did not actually confer the authority that the earlier Yuan titles had. Van Praag writes that the "numerous economically motivated Tibetan missions to the Ming Court are referred to as \'tributary missions\' in the Ming Shih." Van Praag writes that these "tributary missions" were simply prompted by China\'s need for horses from Tibet, since a viable horse market in Mongol lands was closed as a result of incessant conflict. Morris Rossabi also writes that "Tibet, which had extensive contacts with China during the Yuan, scarcely had diplomatic relations with the Ming."'

In [1]:

from openai import OpenAI
import os
from eval import reduce_prompts

client = OpenAI(api_key=os.getenv("OPENAI_APIKEY"))
response = client.completions.create(
model="gpt-3.5-turbo-instruct",
prompt=["How many people live in Berlin?", "The population of the Berlin is"],
max_tokens=10,
seed=42,
temperature=0,
logprobs=5
)



In [2]:
response.choices

[CompletionChoice(finish_reason='length', index=0, logprobs=Logprobs(text_offset=[31, 33, 35, 38, 39, 42, 43, 44, 48, 58], token_logprobs=[-0.11162185, -0.11001924, -0.0012351145, -0.09309951, -0.006339453, -0.07861819, -4.429897e-05, -0.19639036, -0.05182585, -4.2272506e-05], tokens=['\n\n', 'As', ' of', ' ', '202', '1', ',', ' the', ' estimated', ' population'], top_logprobs=[{'\n\n': -0.11162185, '\n': -2.656376, 'As': -4.4116535, ' \n\n': -4.9855194, 'There': -5.7700443}, {'As': -0.11001924, 'According': -2.357822, 'The': -5.6916766, 'In': -6.3065066, 'There': -6.8061194}, {' of': -0.0012351145, ' a': -7.2832594, ' per': -8.5903425, ' an': -8.662898, ' the': -9.948432}, {' ': -0.09309951, ' October': -3.8838139, ' December': -4.2973747, ' January': -4.414011, ' September': -4.5788994}, {'202': -0.006339453, '201': -5.0706573, '2': -11.505273, '1': -11.710988, '<|endoftext|>': -11.966802}, {'1': -0.07861819, '0': -2.6655304, '2': -5.1097364, '<|endoftext|>': -12.523534, '3': -14.067

In [5]:
response.choices[0].text

'\n\nAs of 2021, the estimated population'

In [6]:
response.choices[1].text

' 3.7 million people as of 202'

In [6]:
response.choices[1].text

' 3.7 million people as of 202'

In [9]:
reduce_prompts(response.choices, [1.0, 1.0])

' 3.7 million people, of 202'

In [11]:
head, tail = response.choices

In [24]:
head.logprobs.top_logprobs

[{'\n\n': -0.11162185,
  '\n': -2.656376,
  'As': -4.4116535,
  ' \n\n': -4.9855194,
  'There': -5.7700443},
 {'As': -0.11001924,
  'According': -2.357822,
  'The': -5.6916766,
  'In': -6.3065066,
  'There': -6.8061194},
 {' of': -0.0012351145,
  ' a': -7.2832594,
  ' per': -8.5903425,
  ' an': -8.662898,
  ' the': -9.948432},
 {' ': -0.09309951,
  ' October': -3.8838139,
  ' December': -4.2973747,
  ' January': -4.414011,
  ' September': -4.5788994},
 {'202': -0.006339453,
  '201': -5.0706573,
  '2': -11.505273,
  '1': -11.710988,
  '<|endoftext|>': -11.966802},
 {'1': -0.07861819,
  '0': -2.6655304,
  '2': -5.1097364,
  '<|endoftext|>': -12.523534,
  '3': -14.067002},
 {',': -4.429897e-05,
  ' estimates': -11.0083275,
  ' the': -12.142431,
  ',the': -12.327835,
  ' ,': -12.710148},
 {' the': -0.19639036,
  ' approximately': -2.1362822,
  ' it': -2.873669,
  ' Berlin': -6.67231,
  ' an': -6.9121857},
 {' estimated': -0.05182585,
  ' population': -3.022754,
  ' current': -6.709619,
  '

In [17]:
head.

AttributeError: 'CompletionChoice' object has no attribute 'tokens'

In [1]:
import transformers
model_path = 'microsoft/Orca-2-13b'

In [2]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_path,
    model_max_length=4096,
    padding_side="right",
    use_fast=False,
    add_special_tokens=False,
)

In [5]:
tokenizer(["How many people live in Berlin?", "The population of the Berlin is"], return_tensors="pt", padding="longest")

{'input_ids': tensor([[    1,  1128,  1784,  2305,  5735,   297,  5115, 29973],
        [    1,   450,  4665,   310,   278,  5115,   338, 32000]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0]])}

In [6]:
import torch
import transformers

if torch.cuda.is_available():
    torch.set_default_device("cuda")
else:
    torch.set_default_device("cpu")
    
model = transformers.AutoModelForCausalLM.from_pretrained("microsoft/Orca-2-13b", device_map='auto')

# https://github.com/huggingface/transformers/issues/27132
# please use the slow tokenizer since fast and slow tokenizer produces different tokens
tokenizer = transformers.AutoTokenizer.from_pretrained(
        "microsoft/Orca-2-13b",
        use_fast=False,
    )

system_message = "You are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."
user_message = "How can you determine if a restaurant is popular among locals or mainly attracts tourists, and why might this information be useful?"

prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [25]:
prompts = [prompt, prompt+" I think It's go--od"]

In [30]:

prompt ='<|im_start|>system\nYou are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.<|im_end|>\n<|im_start|>user\nSolve the following math problem. Please separate into two categories labeled with ”Solution:” and ”Final answer(in numbers):”\n\nProblem: Charles is moving from Springfield, which has 482,653 people, to Greenville, which has 119,666 fewer people. What is the total population of Springfield and Greenville?<|im_end|>\n<|im_start|>assistant', '<|im_start|>system\nYou are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.<|im_end|>\n<|im_start|>user\nSolve the following math problem. Please separate into two categories labeled with ”Solution:” and ”Final answer(in numbers):”\n\nProblem: Libby has 160 quarters in her piggy bank. She has to pay $35 to replace her sister’s dress that she borrowed and ruined. After replacing the dress, how many quarters will Libby have left?<|im_end|>\n<|im_start|>assistant', '<|im_start|>system\nYou are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.<|im_end|>\n<|im_start|>user\nSolve the following math problem. Please separate into two categories labeled with ”Solution:” and ”Final answer(in numbers):”\n\nProblem: Michael has a chicken farm. His chickens increase in number by 150 chickens annually. If the number of chickens on his farm now is 550, how many will he have after 9 years?<|im_end|>\n<|im_start|>assistant', '<|im_start|>system\nYou are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.<|im_end|>\n<|im_start|>user\nSolve the following math problem. Please separate into two categories labeled with ”Solution:” and ”Final answer(in numbers):”\n\nProblem: Megan is an actress. She was the lead actress in 80% of her work. In total, Megan participated in 100 plays. How many times Megan was not the lead actress?<|im_end|>\n<|im_start|>assistant'
inputs = tokenizer(prompt, return_tensors='pt', padding="longest")

In [31]:
outputs = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.6, top_p=0.9)

In [33]:
out = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

In [35]:
out.split("<|im_start|> assistant")[-1].strip()

'solution: To find the total population of Springfield and Greenville, we need to add the number of people in each city. We can use the following formula:\n\ntotal population = population of Springfield + population of Greenville\n\nWe know that the population of Springfield is 482,653 and the population of Greenville is 119,666 fewer than Springfield. We can write this as:\n\npopulation of Greenville = population of Springfield - 119,666\n\nWe can substitute this expression into the formula to get:\n\ntotal population = 482,653 + (482,653 - 119,666)\n\nWe can simplify this by using the order of operations. We start by performing the subtraction inside the parentheses:\n\ntotal population = 482,653 + 362,987\n\nThen we perform the addition of the two numbers:\n\ntotal population = 845,640\n\nFinal answer: The total population of Springfield and Greenville is 845,640 people.'

In [1]:
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
wv_from_text = KeyedVectors.load_word2vec_format(datapath('/data/seongil/wiki/enwiki_20180420_100d.txt'), binary=False)

In [5]:
word_list = list(wv_from_text.key_to_index)
two_gram = [word for word in word_list if len(word.split()) > 1]
two_gram

['ENTITY/World_War\xa0II',
 'ENTITY/World_War\xa0I',
 'ENTITY/8\xa0Flora',
 'ENTITY/Filmfare_Award_for_Best_Actor\xa0–_Tamil',
 'ENTITY/U.S._Route\xa09W',
 'ENTITY/15\xa0Eunomia',
 'ENTITY/Golden_Globe_Award_for_Best_Actor\xa0–_Motion_Picture_Drama',
 'ENTITY/Golden_Globe_Award_for_Best_Actor\xa0–_Motion_Picture_Musical_or_Comedy',
 "ENTITY/Gymnastics_at_the_2016_Summer_Olympics\xa0–_Women's_rhythmic_individual_all-around",
 'ENTITY/IUCN_Red_List\xa0of_Endangered_Species',
 'ENTITY/Golden_Globe_Award_for_Best_Motion_Picture\xa0–_Musical_or_Comedy',
 'ENTITY/Highway\xa0401',
 'ENTITY/25\xa0Phocaea',
 "ENTITY/Cycling_at_the_2012_Summer_Olympics\xa0–_Men's_road_race",
 'ENTITY/Filmfare_Award_for_Best_Director\xa0–_Tamil',
 'ENTITY/George_H.\xa0W._Bush',
 'ENTITY/U.S._Route\xa0104',
 'ENTITY/Associação_Académica_de_Coimbra\xa0–_O.A.F.',
 'ENTITY/Filmfare_Award_for_Best_Film\xa0–_Tamil',
 'ENTITY/Malcolm\xa0X',
 'ENTITY/Filmfare_Award_for_Best_Actress\xa0–_Tamil',
 'ENTITY/Elizabeth\xa0II',

In [1]:
#closest_word = find_closest_word("Moreau_Seminary", word_list)
#print(closest_word)
result = wv_from_text.similar_by_word("ENTITY/Super Bowl", topn=10)
# most_similar_key, similarity = result[0]
# print(f"{most_similar_key}: {similarity:.4f}")

NameError: name 'wv_from_text' is not defined

In [3]:
import spacy
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_lg")
doc = nlp("Moreau Seminary")
doc.vector

array([-2.51940012e+00, -1.13528502e+00,  3.30950022e-01,  2.04909992e+00,
        2.04286003e+00,  1.18479997e-01,  3.91735005e+00, -1.68214488e+00,
        1.37219000e+00,  3.20243508e-01,  9.84349966e-01, -5.72250038e-02,
        1.43589997e+00, -1.90299988e-01, -1.27500498e+00,  8.81399810e-02,
        1.77349985e-01,  5.42749977e+00, -2.06948495e+00,  1.92012501e+00,
        1.02176499e+00,  7.51715004e-01, -8.89654994e-01,  6.90450072e-02,
        1.80575013e-01, -1.04799008e+00, -2.26014996e+00,  1.23315001e+00,
       -6.74650073e-02,  9.30034995e-01, -1.23273993e+00,  6.99105024e-01,
       -4.29194987e-01, -8.07099938e-01, -2.76644993e+00, -1.13724506e+00,
        1.20839000e+00, -2.98460007e+00, -6.38900042e-01, -3.98400009e-01,
       -9.55365002e-01, -1.17315996e+00, -1.72068942e+00,  3.08129978e+00,
       -1.39999390e-03, -1.93981004e+00,  1.28809500e+00,  1.07879996e-01,
        9.60620046e-01, -1.01398945e+00, -2.90540004e+00,  1.58119977e-01,
       -3.20090508e+00,  

In [5]:
type(doc.vector.get())

numpy.ndarray

In [None]:
import nltk

In [7]:
import cupy as cp
a = cp.array([doc.vector.get()]*5)
a.shape

(5, 300)

In [14]:
cp.linalg.norm(doc.vector / cp.linalg.norm(doc.vector))

array(1., dtype=float32)

In [18]:
import numpy as np

# 예시 dictionary
dict_of_arrays = {
    "key1": [np.array([1, 2, 3]), np.array([4, 5, 6])],
    "key2": [np.array([7, 8, 9]), np.array([10, 11, 12])]
}

# Dictionary의 각 요소를 순회하면서 정규화
for key in dict_of_arrays:
    # 리스트 컴프리헨션을 사용하여 각 배열을 정규화
    normalized_arrays = [arr / np.linalg.norm(arr) for arr in dict_of_arrays[key]]
    # 정규화된 배열들을 NumPy 배열로 변환
    dict_of_arrays[key] = np.array(normalized_arrays)

# 결과 확인
print(dict_of_arrays)


{'key1': array([[0.26726124, 0.53452248, 0.80178373],
       [0.45584231, 0.56980288, 0.68376346]]), 'key2': array([[0.50257071, 0.57436653, 0.64616234],
       [0.52342392, 0.57576631, 0.62810871]])}


In [31]:
!export GENSIM_DATA_DIR=/data/seongil/datasets/genism

In [33]:
model = api.load("glove-wiki-gigaword-300")



In [27]:
api.info()

{'corpora': {'semeval-2016-2017-task3-subtaskBC': {'num_records': -1,
   'record_format': 'dict',
   'file_size': 6344358,
   'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/semeval-2016-2017-task3-subtaskB-eng/__init__.py',
   'license': 'All files released for the task are free for general research use',
   'fields': {'2016-train': ['...'],
    '2016-dev': ['...'],
    '2017-test': ['...'],
    '2016-test': ['...']},
   'description': 'SemEval 2016 / 2017 Task 3 Subtask B and C datasets contain train+development (317 original questions, 3,169 related questions, and 31,690 comments), and test datasets in English. The description of the tasks and the collected data is given in sections 3 and 4.1 of the task paper http://alt.qcri.org/semeval2016/task3/data/uploads/semeval2016-task3-report.pdf linked in section “Papers” of https://github.com/RaRe-Technologies/gensim-data/issues/18.',
   'checksum': '701ea67acd82e75f95e1d8e62fb0ad29',
   'file_name': 'se

In [46]:
model.similarity('feeder', "dog")

0.10208548

In [37]:
from nltk.corpus import wordnet as wn
import random
def _sample_token_same_level_based_on_pos(token, pos):
    possible_candidates = []
    synsets = wn.synsets(token)
    if len(synsets) == 0:
        return possible_candidates
    same_synsets = []
    for synset in synsets:
        if synset.lemma_names()[0] == token and synset.lexname().split(".")[0] == pos:
            same_synsets.append(synset)
    possible_candidates = []
    for synset in same_synsets:
        hypernyms = synset.hypernyms()
        for hyper in hypernyms:
            hypos = hyper.hyponyms()
            hypos = [h.lemma_names()[0].replace("_", " ") for h in hypos if h.lemma_names()[0] != token]
            possible_candidates.extend(hypos)
    return possible_candidates

In [40]:
_sample_token_same_level_based_on_pos("dog", "noun")

['bitch',
 'fox',
 'hyena',
 'jackal',
 'wild dog',
 'wolf',
 'domestic cat',
 'feeder',
 'head',
 'stocker',
 'stray']

In [15]:
from nltk.corpus import wordnet as wn

# 찾고자 하는 단어를 정의
word = 'dog'

# 단어의 의미(synsets)를 가져옴
synsets = wn.synsets(word)

# 각 의미에 대한 hypernyms(상위어)를 찾고 출력
for syn in synsets:
    # synset의 상위어(hypernyms)를 가져옴
    hypernyms = syn.hypernyms()
    print(f"Synset: {syn.name()}")
    print("Hypernyms:")
    for hyper in hypernyms:
        print(f" - {hyper.name()}")
    print()


Synset: dog.n.01
Hypernyms:
 - canine.n.02
 - domestic_animal.n.01

Synset: frump.n.01
Hypernyms:
 - unpleasant_woman.n.01

Synset: dog.n.03
Hypernyms:
 - chap.n.01

Synset: cad.n.01
Hypernyms:
 - villain.n.01

Synset: frank.n.02
Hypernyms:
 - sausage.n.01

Synset: pawl.n.01
Hypernyms:
 - catch.n.06

Synset: andiron.n.01
Hypernyms:
 - support.n.10

Synset: chase.v.01
Hypernyms:
 - pursue.v.02

