In [50]:
import urllib.request
import os
from tqdm import tqdm
from time import sleep
import math
import pdfplumber
from nltk.tokenize import word_tokenize
from datetime import datetime
from utils.gpl_tsdae import GplTsdae
import numpy as np

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList

In [3]:
gpl = GplTsdae()

[2023-04-20 19:27:40] INFO [sentence_transformers.SentenceTransformer.__init__:66] Load pretrained SentenceTransformer: ./models/gpl/TSDAE/500000


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[2023-04-20 19:27:49] INFO [models.gpl.gpl.toolkit.sbert.load_sbert:68] Set max_seq_length=350
[2023-04-20 19:27:49] INFO [beir.retrieval.search.dense.faiss_search._load:39] Loading Faiss ID-mappings from path: ./models/gpl/embedding/TSDAE/my-index.flat.tsv
[2023-04-20 19:27:49] INFO [beir.retrieval.search.dense.faiss_search._load:46] Loading Faiss Index from path: ./models/gpl/embedding/TSDAE/my-index.flat.faiss


In [38]:
tokenizer = AutoTokenizer.from_pretrained("StabilityAI/stablelm-tuned-alpha-3b")
model = AutoModelForCausalLM.from_pretrained("StabilityAI/stablelm-tuned-alpha-3b")
model.half().cuda()

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50688, 4096)
    (layers): ModuleList(
      (0): GPTNeoXLayer(
        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attention): GPTNeoXAttention(
          (rotary_emb): RotaryEmbedding()
          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)
          (dense): Linear(in_features=4096, out_features=4096, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)
          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)
          (act): GELUActivation()
        )
      )
      (1): GPTNeoXLayer(
        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=Tr

In [40]:
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        stop_ids = [50278, 50279, 50277, 1, 0]
        for stop_id in stop_ids:
            if input_ids[0][-1] == stop_id:
                return True
        return False

In [104]:
def download_paper_and_save_as_txt(paper_id):
    if os.path.exists(f"papers_txt/{paper_id}.txt"):
        return
    for try_count in range(6):
        try:
            urllib.request.urlretrieve(f"https://export.arxiv.org/pdf/{paper_id}.pdf", f"./temp/{paper_id}.pdf")
            break
        except Exception as e:
            if try_count >= 5:
                raise e
            print(f"{try_count}-retry, get {paper_id} in 10 seconds.")
            sleep(10)
    pdfp = pdfplumber.open(f"./temp/{paper_id}.pdf")
    full_text = '\n'.join([page.extract_text() for page in pdfp.pages])
    tok = sent_tokenize(full_text)
    new_tok = []
    for s in tok:
        score = sum([ c not in "+-*/=^(){}[]0123456789!@ " and ord(c) < 128 for c in s ]) / (len(s))
        if score >= 0.8:
            new_tok.append(s)
    with open(f"papers_txt/{paper_id}.txt", "w") as f:
        f.write(' '.join(new_tok))
    os.remove(f"./temp/{paper_id}.pdf")

In [105]:
download_paper_and_save_as_txt('2010.15778')

In [81]:
!find papers_txt/13* | tail -10

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
papers_txt/1301.3834.txt
papers_txt/1301.7521.txt
papers_txt/1302.1727.txt
papers_txt/1302.3921.txt
papers_txt/1302.7145.txt
papers_txt/1303.2579.txt
papers_txt/1303.2580.txt
papers_txt/1303.5751.txt
papers_txt/1303.5768.txt
papers_txt/1304.1235.txt


In [91]:
query = "Is any part of p2p video streaming has same concept as bitcoin?"
paper_id = "2010.15778"
wc = 100

In [106]:
with open(f"papers_txt/{paper_id}.txt", "r") as f:
    paper_txt = f.read()

In [93]:
paper_ln = word_tokenize(paper_txt)
paper_ln = [' '.join(paper_ln[i:i+wc]) for i in range(0, len(paper_ln), wc)]
len(paper_ln)

286

In [94]:
paper_ln[0]

'A Review on P2P Video Streaming Sabu M. Thampi Indian Institute of Information Technology and Management – Kerala ( IIITM-K ) , India smthampi @ ieee.org The main objective of this article is to provide an overview of P2P based Video-on-Demand and live streaming services . The article starts with an introduction to media streaming and its simplified architecture . Various solutions offering video streaming in the context of widespread usage of Internet are discussed . This is followed by a short introduction to P2P networks and its applications . A broad discussion on various P2P streaming schemes and P2P'

In [95]:
corpus = gpl.sbert.encode_corpus([ {'title': '', 'text': t} for t in paper_ln ])
corpus.shape

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

(286, 768)

In [96]:
%%time
scores = np.sum(np.repeat(gpl.sbert.encode_queries([query]), corpus.shape[0], axis=0) * corpus, axis=1)
scores

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 30.2 ms, sys: 188 µs, total: 30.4 ms
Wall time: 28.2 ms


array([26.750092 , 24.725992 , 15.742205 , 17.166065 , 21.121431 ,
       18.542736 , 16.108501 , 17.961824 , 20.865734 , 21.299164 ,
       22.644932 , 18.591965 , 19.924898 , 21.53347  , 17.528465 ,
       20.219898 , 21.407192 , 19.296799 , 21.550507 , 18.321854 ,
       21.401644 , 18.272991 , 15.530848 , 19.892708 , 20.796988 ,
       20.068285 , 22.38942  , 16.38695  , 19.507298 , 20.65343  ,
       15.551199 , 14.809626 , 18.98814  , 13.134424 , 16.463787 ,
       15.876736 , 10.6173725, 18.560661 , 18.924587 , 16.15115  ,
       18.213997 , 17.67575  , 16.725698 , 15.376694 , 20.536823 ,
       18.304985 , 16.364555 , 20.491646 , 19.085903 , 18.00172  ,
       17.233421 , 18.313694 , 18.521141 , 19.368706 , 18.797323 ,
       14.689584 , 15.176687 , 15.652452 , 18.691277 , 15.748525 ,
       14.320398 , 18.210342 , 22.055038 , 20.297798 , 20.58822  ,
       18.847082 , 20.09621  , 22.848442 , 21.932081 , 21.023367 ,
       13.398948 , 18.844658 , 19.543375 , 16.631697 , 17.5059

In [97]:
%%time
raw_input = sorted(sorted(list(zip(scores, enumerate(paper_ln))), key=lambda x: -x[0])[:4], key=lambda x: x[1][0])
raw_input

CPU times: user 185 µs, sys: 0 ns, total: 185 µs
Wall time: 190 µs


[(31.044847,
  (95,
   ', P2P streaming focuses on the efficient delivery of audio and video content under stiff timing requirements . Stream data are instantaneously received , played , and passed to other associated peers . For example , the P2P file sharing application - BitTorrent permits peers to interchange any segment of the content being distributed since the order in which they arrive is not important . In contrast , such techniques are not viable in streaming applications [ 57 ] . Video files are directly played- out while they are being downloaded . Therefore , pieces , which are received after')),
 (29.283695,
  (102,
   'source or a peer . The tree-based systems typically distribute video by actively pushing data from a peer to its children peers [ 62 ] . A common approach to P2P streaming is to organize participating peers into a single tree-structured overlay over which the content is pushed from the source towards all peers e.g . This way organizing peers is called sing

In [98]:
real_input = [ t[1][1] for t in raw_input]
real_input

[', P2P streaming focuses on the efficient delivery of audio and video content under stiff timing requirements . Stream data are instantaneously received , played , and passed to other associated peers . For example , the P2P file sharing application - BitTorrent permits peers to interchange any segment of the content being distributed since the order in which they arrive is not important . In contrast , such techniques are not viable in streaming applications [ 57 ] . Video files are directly played- out while they are being downloaded . Therefore , pieces , which are received after',
 'source or a peer . The tree-based systems typically distribute video by actively pushing data from a peer to its children peers [ 62 ] . A common approach to P2P streaming is to organize participating peers into a single tree-structured overlay over which the content is pushed from the source towards all peers e.g . This way organizing peers is called single- tree streaming . In these systems , peers a

In [108]:

system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""

prompt = f"{system_prompt}<|USER|>{query}\n{' '.join([f'{i}) {t}' for i, t in enumerate(real_input)])} <ans> <|ASSISTANT|>"
# print(prompt)

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
tokens = model.generate(
  **inputs,
  max_new_tokens=256,
  temperature=0.7,
  do_sample=True,
  stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
  top_p = 0.95, top_k = 50, early_stopping = False
)

print(tokenizer.decode(tokens[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


# StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
Is any part of p2p video streaming has same concept as bitcoin?
0), P2P streaming focuses on the efficient delivery of audio and video content under stiff timing requirements. Stream data are instantaneously received, played, and passed to other associated peers. For example, the P2P file sharing application - BitTorrent permits peers to interchange any segment of the content being distributed since the order in which they arrive is not important. In contrast, such techniques are not viable in streaming applications [ 57 ]. Video files are dir

In [109]:
ans = tokenizer.decode(tokens[0], skip_special_tokens=True)

In [None]:
ans.split('<ans>')[-1]