In [1]:
!pip install retriv
!pip install rank_bm25

Collecting retriv
  Downloading retriv-0.2.3-py3-none-any.whl.metadata (10 kB)
Collecting numba>=0.54.1 (from retriv)
  Downloading numba-0.59.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.7 kB)
Collecting optuna (from retriv)
  Downloading optuna-3.5.0-py3-none-any.whl.metadata (17 kB)
Collecting krovetzstemmer (from retriv)
  Downloading KrovetzStemmer-0.8.tar.gz (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.9/112.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pystemmer==2.0.1 (from retriv)
  Downloading PyStemmer-2.0.1.tar.gz (559 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m559.3/559.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting unidecode (from retriv)
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Collecting ranx (from retriv)
  Downloading ranx-0.3.19-

In [1]:
from retriv import HybridRetriever

# Keyword arguments for the sparse half of the hybrid retriever
# (BM25 model plus the text-preprocessing switches).
sparse_params = dict(
    sr_model="bm25",
    min_df=1,
    tokenizer="whitespace",
    stemmer="english",
    stopwords="english",
    do_lowercasing=True,
    do_ampersand_normalization=True,
    do_special_chars_normalization=True,
    do_acronyms_normalization=True,
    do_punctuation_removal=True,
)

# Keyword arguments for the dense half (sentence-transformers encoder).
dense_params = dict(
    dr_model="sentence-transformers/all-MiniLM-L6-v2",
    normalize=True,
    max_length=128,
    use_ann=True,
)

# Build the hybrid retriever; `index_name` is shared by both halves.
hr = HybridRetriever(index_name="new-index", **sparse_params, **dense_params)


: 

In [2]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [3]:
import pickle as pkl

# Load the document splits stored in splitDocuments.pkl.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load a file that you produced yourself or otherwise trust.
with open('splitDocuments.pkl','rb') as f:
  all_splits = pkl.load(f)

In [8]:
def flatten_extend(matrix):
    """Return one flat list holding every element of every row of ``matrix``.

    Rows are visited in order and the elements inside each row keep their
    order. Only a single level of nesting is removed.
    """
    return [item for row in matrix for item in row]


# Collapse the nested splits into one flat list, then index it with BM25.
flat_splits = flatten_extend(all_splits)
bm25_retriever = BM25Retriever.from_documents(flat_splits)

In [35]:
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain.embeddings import HuggingFaceEmbeddings



# Embedding function backed by the all-MiniLM-L6-v2 sentence-transformers model.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# Open the Chroma store persisted in the "all-MiniLM-L6-v2DB" directory.
# NOTE(review): presumably this store was built earlier with the same embedding model — confirm.
vectorstore = Chroma(persist_directory="all-MiniLM-L6-v2DB", embedding_function=embedding_function)
# Dense retriever over the store; asks for 4 results per query (k=4).
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

In [36]:
# Combine the BM25 retriever and the Chroma-backed dense retriever, giving each an equal weight of 0.5.
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, retriever], weights=[0.5, 0.5])


In [40]:
# Try the ensemble retriever on a sample question; the cell displays the returned documents.
ensemble_retriever.get_relevant_documents("What number do all of the LTI classes start with")

[Document(page_content='(b) How do you interpret the term “promising”? (c) What do you think the research community prioritized when you started? (d) What was the scope of the work that your research group did? (e) What was your relationship to computing resources at the start of your career? (f) What did your software workflow look like when you first started doing research? (Prompt:\n\nWhat tools, frameworks, libraries did you use?)', metadata={'source': 'Web Scholar PDFs/1433b8d43d446fcc7f3e1370b22f744a4dd7c8e4.pdf'}),
 Document(page_content='eassignei=TBegin(6)ei+1=...=ej=TIntermediate(7)ThepredictionofBIOtagsismodeledasamulti-classclassificationproblemwiththeobjectiveasLEE=EDhXi−logpθ(ei|[c,v,ρ])i(8)DocumentClassificationWeusetheembeddingofthestarting[CLS]tokenfordocumentclassifica-tion.ThelogitsarepredictedwithanMLPheadontopofthe[CLS]embedding.Letlbethecorrectclass,theobjectiveisLDC=EDh−logpθ(l|[c,v,ρ])i(9)BAdditionalRelatedWorksB.1DatasetsSmallerdocumentdatasetsTheFormUnder-stan

In [41]:
# Query the Chroma vector store directly (bypassing the ensemble) with a sample question.
question = "What is Andrew Carnegie known for?"
vectorstore.similarity_search(question)


[Document(page_content='Andrew Carnegie\n\nA self-educated "working boy" who loved books, Andrew Carnegie emigrated from Scotland in 1848 and settled in Pittsburgh, Pa. Attending night school and borrowing books, Carnegie went from factory worker in a textile mill to successful entrepreneur and industrialist. He rose to prominence by founding what became the world\'s largest steel producing company by the end of the 19th century.', metadata={'source': 'Data/history_of_cmu/01.txt'}),
 Document(page_content='engineer students to take courses in humanities and social sciences in order to better understand the needs of society. Carnegie died in 1919, but his vision for an educated public lived on after him.', metadata={'source': 'Data/history_of_cmu/03.txt'}),
 Document(page_content='Carnegie Tech\n\nPost\n\nwar Years', metadata={'source': 'Data/history_of_cmu/04.txt'}),
 Document(page_content='At one point the richest man in the world, Carnegie believed that "to die rich is to die disgrac

In [45]:
# 1. test hybrid retriever
# 2. llm-embedder + bge reranker
# 3. filco context filtering


!pip install small-text



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting small-text
  Downloading small_text-1.3.3-py3-none-any.whl.metadata (12 kB)
Downloading small_text-1.3.3-py3-none-any.whl (205 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m205.7/205.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: small-text
Successfully installed small-text-1.3.3


In [None]:
from transformers import AutoTokenizer
from small_text import TransformersDataset

# Name of the pretrained checkpoint whose tokenizer is loaded below.
transformer_model_name = 'bert-base-uncased'

# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)

In [51]:
# Read the questions, one per line, with surrounding whitespace removed.
# The stripped list is now stored in `targets` itself; previously it was
# assigned to `target`, so `targets` kept its trailing newlines.
with open("FINALQUESTIONS.txt") as f:
    targets = [line.strip() for line in f]

# Alias kept so that anything referring to the old name `target` still works.
target = targets

# Read the category of each question, one per line. Line k of types.txt is
# used as the label for line k of FINALQUESTIONS.txt.
with open("types.txt") as f:
    labels = [line.strip() for line in f]

In [52]:
# Display the distinct category names read from types.txt.
set(labels)

{'Academic Calendar',
 'Buggy News',
 'Faculty',
 'History of CMU',
 'History of Drama',
 'History of SCS',
 'Kiltie Band',
 'Programs',
 'Research',
 'Tartan Facts'}

In [63]:
# Map each fine-grained category onto one of five coarse class ids.
LABEL_TO_CLASS = {
    "History of CMU": 0,
    "History of Drama": 0,
    "History of SCS": 0,
    "Programs": 1,
    "Faculty": 1,
    "Academic Calendar": 2,
    "Research": 3,
    "Tartan Facts": 4,
    "Buggy News": 4,
}

finalTargets = []
finalLabels = []

for count, label in enumerate(labels):
    question = targets[count]

    # Kiltie Band rows (by label or by mention in the question) are left out.
    if label == "Kiltie Band" or "kiltie" in question.lower():
        continue

    # Fail loudly on an unmapped category; silently skipping the label while
    # keeping the question would shift every later label by one position.
    if label not in LABEL_TO_CLASS:
        raise ValueError(f"Unmapped label {label!r} at row {count}")

    # Exactly one class id per kept question. The previous condition
    # `i == "Tartan Facts" or "Buggy News"` was always truthy, so every kept
    # row also received a spurious label 4.
    finalLabels.append(LABEL_TO_CLASS[label])
    finalTargets.append(question)

# Questions and labels must stay aligned one-to-one.
assert len(finalTargets) == len(finalLabels)

        


In [64]:
from small_text import TransformersDataset
import numpy as np

# target_labels = np.arange(num_classes)
from transformers import AutoTokenizer

# Name of the pretrained checkpoint used for the tokenizer (and, later, the classifier).
transformer_model_name = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(
    transformer_model_name
)

# Class ids are consecutive integers starting at 0, so the number of classes
# is the largest id plus one.
num_classes = int(max(finalLabels)) + 1

# `target_labels` must be the array of possible class ids. The previous value,
# len(set(finalTargets)), was a count of unique question strings.
train = TransformersDataset.from_arrays(finalTargets,
                                        finalLabels,
                                        tokenizer,
                                        max_length=60,
                                        target_labels=np.arange(num_classes))



In [65]:
from small_text import (
    PoolBasedActiveLearner,
    PredictionEntropy,
    TransformerBasedClassificationFactory,
    TransformerModelArguments,
    random_initialization_balanced
)


# simulates an initial labeling to warm-start the active learning process
def initialize_active_learner(active_learner, y_train, n_samples=20):
    """Warm-start ``active_learner`` with an initial set of labelled samples.

    Parameters
    ----------
    active_learner
        Object exposing ``initialize_data(indices, labels)``.
    y_train
        Labels of the full pool; must support indexing by the array of
        indices returned by ``random_initialization_balanced``.
    n_samples : int, default 20
        Number of initial samples to draw. The default keeps the previously
        hard-coded value.

    Returns
    -------
    The indices of the samples chosen for the initial labelling.
    """
    indices_initial = random_initialization_balanced(y_train, n_samples=n_samples)
    active_learner.initialize_data(indices_initial, y_train[indices_initial])

    return indices_initial


transformer_model = TransformerModelArguments(transformer_model_name)

# Number of classes for the classification head. Class ids are consecutive
# integers starting at 0, so it is the largest id plus one. The previous value,
# len(set(finalTargets)), counted unique question strings instead of classes.
num_classes = int(max(finalLabels)) + 1

clf_factory = TransformerBasedClassificationFactory(transformer_model,
                                                    num_classes,
                                                    kwargs={'device': 'cpu',
                                                            'mini_batch_size': 32,
                                                            'class_weight': 'balanced'})
# Query strategy used to pick the next samples to label.
query_strategy = PredictionEntropy()

# Pool-based learner over the `train` dataset, warm-started with an initial labelled set.
active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train)
indices_labeled = initialize_active_learner(active_learner, train.y)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [66]:
NUM_ITERATIONS = 10  # number of query/update rounds
QUERY_SIZE = 20      # samples requested from the learner per round

for iteration in range(NUM_ITERATIONS):
    # Ask the learner which pool samples it wants labelled next.
    indices_queried = active_learner.query(num_samples=QUERY_SIZE)

    # Simulated annotator: look the labels up in the training set.
    # Replace this with real user input for real-world usage.
    y = train.y[indices_queried]

    # Hand the labels for the current query back to the learner.
    active_learner.update(y)

    # Track every index labelled so far (newest first).
    indices_labeled = np.concatenate([indices_queried, indices_labeled])

    print('---------------')
    print(f'Iteration #{iteration} ({len(indices_labeled)} samples)')
    # TODO: evaluate after each round, e.g.
    # results.append(evaluate(active_learner, train[indices_labeled], test))

---------------
Iteration #0 (40 samples)


KeyboardInterrupt: 