In [2]:
import pandas as pd

# Load the tab-separated qrels. header=0 consumes the file's first line as a
# header, and `names` replaces whatever it contained with qid / docno / label.
qrels = pd.read_csv(
    "test.tsv",
    sep="\t",
    header=0,
    names=["qid", "docno", "label"],
)

In [3]:
# Show the loaded qrels frame as the cell output.
qrels

Unnamed: 0,qid,docno,label
0,1,31715818,1
1,3,14717500,1
2,5,13734012,1
3,13,1606628,1
4,36,5152028,1
...,...,...,...
334,1379,17450673,1
335,1382,17755060,1
336,1385,306006,1
337,1389,23895668,1


In [4]:
# First step of the TREC conversion: add an "iter" column whose value is 0
# on every row, then show the result.
qrels = qrels.assign(iter=0)
qrels

Unnamed: 0,qid,docno,label,iter
0,1,31715818,1,0
1,3,14717500,1,0
2,5,13734012,1,0
3,13,1606628,1,0
4,36,5152028,1,0
...,...,...,...,...
334,1379,17450673,1,0
335,1382,17755060,1,0
336,1385,306006,1,0
337,1389,23895668,1,0


In [None]:
# Arrange the columns as qid, iter, docno, label and order the rows by
# (qid, iter, docno).
qrels = (
    qrels
    .loc[:, ["qid", "iter", "docno", "label"]]
    .sort_values(by=["qid", "iter", "docno"])
)


Unnamed: 0,qid,iter,docno,label
0,1,0,31715818,1
1,3,0,14717500,1
2,5,0,13734012,1
3,13,0,1606628,1
4,36,0,5152028,1
...,...,...,...,...
332,1379,0,27123743,1
335,1382,0,17755060,1
336,1385,0,306006,1
337,1389,0,23895668,1


In [6]:
# Write the frame as space-delimited text, without the header row and
# without the index column.
qrels.to_csv(
    "test.trec",
    sep=" ",
    header=False,
    index=False,
)

In [1]:
def process_qrels(input_file, output_file):
    """Convert a tab-separated qrels file into a space-separated TREC qrels file.

    The first line of ``input_file`` is consumed as a header and its three
    columns are read as qid, docno and label. A constant ``iter`` column of 0
    is added, and the rows are written to ``output_file`` as
    ``qid iter docno label``, sorted by qid, iter and docno, with neither a
    header row nor an index column.

    Args:
        input_file: Path of the tab-separated qrels file to read.
        output_file: Path of the space-separated file to write.
    """
    import pandas as pd

    trec_columns = ["qid", "iter", "docno", "label"]
    sort_keys = trec_columns[:3]  # qid, iter, docno

    trec_qrels = (
        pd.read_csv(input_file, sep="\t", header=0, names=["qid", "docno", "label"])
        .assign(iter=0)
        .loc[:, trec_columns]
        .sort_values(by=sort_keys)
    )
    trec_qrels.to_csv(output_file, sep=" ", index=False, header=False)

In [2]:
# Convert the test qrels from TSV to TREC format.
# NOTE(review): this call reads from "nfcorpus-qrels/" while the dev call uses
# "nfcorpus/qrels/" - confirm that both directory layouts are intended.
process_qrels("nfcorpus-qrels/test.tsv", "nfcorpus-qrels/test.trec")

In [4]:
# Convert the dev qrels from TSV to TREC format.
process_qrels("nfcorpus/qrels/dev.tsv", "nfcorpus/qrels/dev.trec")

## Process SCIDOCS to smaller dataset

In [7]:
import json
import pandas as pd

# Both files are JSON-lines: every line is parsed as one JSON object.
with open("scidocs/corpus.jsonl", "r") as f:
    docs = [json.loads(line) for line in f]

with open("scidocs/queries.jsonl", "r") as f:
    queries = pd.DataFrame([json.loads(line) for line in f])

# Report the sizes and preview the first rows of each collection.
print(f"queries: {len(queries)}")
display(queries.head())

print(f"docs: {len(docs)}")
docs = pd.DataFrame(docs)
display(docs.head())

queries: 1000


Unnamed: 0,_id,text,metadata
0,78495383450e02c5fe817e408726134b3084905d,A Direct Search Method to solve Economic Dispa...,"{'authors': ['50306438', '15303316', '1976596'..."
1,7dcb308b9292a8bc87d6f7793d2ca5e0e19dfa40,Bearish-Bullish Sentiment Analysis on Financia...,"{'authors': ['2243444', '32946276', '3349721']..."
2,8c872ecd87945e71fcd9fa1b6cb1133cfe805bf2,Predicting defects in SAP Java code: An experi...,"{'authors': ['1816608', '2031097', '39496137',..."
3,3a63667284dc8b9687ed1620406030bfe39af3c9,Active-Metric Learning for Classification of R...,"{'authors': ['2447432', '2818592', '15760889']..."
4,071f47b7bc5830643e31dbed82e0375bf9b26559,Ad Hoc Retrieval Experiments Using WordNet and...,"{'authors': ['1921454', '37090109', '2911717',..."


docs: 25657


Unnamed: 0,_id,title,text,metadata
0,632589828c8b9fca2c3a59e97451fde8fa7d188d,A hybrid of genetic algorithm and particle swa...,An evolutionary recurrent network which automa...,"{'authors': ['1725986'], 'year': 2004, 'cited_..."
1,86e87db2dab958f1bd5877dc7d5b8105d6e31e46,A Hybrid EP and SQP for Dynamic Economic Dispa...,Dynamic economic dispatch (DED) is one of the ...,"{'authors': ['30728239', '49115828', '1857220'..."
2,2a047d8c4c2a4825e0f0305294e7da14f8de6fd3,Genetic Fuzzy Systems - Evolutionary Tuning an...,It's not surprisingly when entering this site ...,"{'authors': ['1685850', '1699069', '34695695',..."
3,506172b0e0dd4269bdcfe96dda9ea9d8602bbfb6,A modified particle swarm optimizer,"In this paper, we introduce a new parameter, c...","{'authors': ['8385459', '4298485'], 'year': 19..."
4,51317b6082322a96b4570818b7a5ec8b2e330f2f,Identification and control of dynamic systems ...,This paper proposes a recurrent fuzzy neural n...,"{'authors': ['34448377', '2062864'], 'year': 2..."


In [17]:
# Load the SCIDOCS test qrels; the file's header row is replaced by our column names.
scidocs_qrels = pd.read_csv("scidocs/qrels/test.tsv", sep="\t", header=0, names=["qid", "docno", "label"])

# Documents that carry label == 1 in at least one qrels row.
positive_doc_ids = set(scidocs_qrels[scidocs_qrels["label"] == 1]["docno"].values)
print(f"positive_doc_ids: {len(positive_doc_ids)}")

# Keep only the rows whose docno is in positive_doc_ids. A vectorised mask
# replaces the former row-by-row rebuild: it keeps the column dtypes and also
# works when no row matches.
# NOTE(review): rows with another label are kept too when their document is
# positive for some other query - confirm this is the intended cut.
scidocs_qrels = scidocs_qrels[scidocs_qrels["docno"].isin(positive_doc_ids)]

# Save the cut qrels as .tsv with header qid, docno, label.
scidocs_qrels = scidocs_qrels[["qid", "docno", "label"]].sort_values(by=["qid", "docno"])
scidocs_qrels.to_csv("scidocs/qrels/cut.tsv", sep="\t", index=False, header=True)

# Transform to TREC format: qid, iter (constant 0), docno, label.
# assign() builds a new frame, so no column is written onto a filtered slice.
scidocs_qrels = (
    scidocs_qrels
    .assign(iter=0)[["qid", "iter", "docno", "label"]]
    .sort_values(by=["qid", "iter", "docno"])
)
scidocs_qrels.to_csv("scidocs/qrels/cut.trec", sep=" ", index=False, header=False)

positive_doc_ids: 4020


In [43]:
def shrink_dataset(input_path, output_path):
    """
    Create a smaller version of a dataset by keeping only the queries and
    documents that are referenced in its cut qrels.

    Reads ``{input_path}/qrels/cut.tsv`` (tab-separated; the first line is a
    header, the columns are taken as qid, docno, label), then filters
    ``{input_path}/queries.jsonl`` and ``{input_path}/corpus.jsonl`` by their
    ``"_id"`` field and writes the survivors to
    ``{output_path}/queries_cut.jsonl`` and ``{output_path}/corpus_cut.jsonl``.
    The number of distinct qids and docnos is printed.

    Args:
        input_path (str): Path to the input dataset directory
        output_path (str): Path to the output directory for the smaller
            dataset; it is created if it does not exist yet
    """
    import os

    # Read qrels; ids are forced to str so they compare equal to the "_id"
    # strings of the JSON records (and keep any leading zeros).
    dtypes = {"qid": str, "docno": str, "label": int}
    cut_qrels = pd.read_csv(f"{input_path}/qrels/cut.tsv", sep="\t", dtype=dtypes, header=0, names=["qid", "docno", "label"])

    # Get unique qids and docnos
    qids = set(cut_qrels["qid"].values)
    docnos = set(cut_qrels["docno"].values)
    print(f"qids: {len(qids)}")
    print(f"docnos: {len(docnos)}")

    # Writing into a missing directory would otherwise raise FileNotFoundError.
    os.makedirs(output_path, exist_ok=True)

    # Filter and save queries, then documents
    _filter_jsonl(f"{input_path}/queries.jsonl", f"{output_path}/queries_cut.jsonl", qids)
    _filter_jsonl(f"{input_path}/corpus.jsonl", f"{output_path}/corpus_cut.jsonl", docnos)


def _filter_jsonl(src_path, dst_path, keep_ids):
    """Copy the JSON-lines records of src_path whose "_id" is in keep_ids to dst_path."""
    # Records are streamed one at a time, so the whole file is never held in memory.
    with open(src_path, "r", encoding="utf-8") as src, open(dst_path, "w", encoding="utf-8") as dst:
        for line in src:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            record = json.loads(line)
            if record["_id"] in keep_ids:
                dst.write(json.dumps(record) + "\n")


In [44]:
# Shrink FiQA in place: input and output directory are both "fiqa", so the
# *_cut.jsonl files are written next to the original queries/corpus files.
shrink_dataset("fiqa", "fiqa")

qids: 2089
docnos: 4000


## Process FiQA to smaller dataset

In [33]:
import json

corpus_path = "/home/guest/r12922050/GitHub/d2qplus/data/fiqa/corpus.jsonl"

# Parse the corpus: each line of the file is one JSON document.
with open(corpus_path, "r") as f:
    docs = list(map(json.loads, f))
print(f"length of total docs: {len(docs)}")

length of total docs: 57600


In [32]:
# Split the corpus into documents with text and ids of documents whose
# text is None or the empty string.
has_text_docs = [doc for doc in docs if not (doc['text'] is None or doc['text'] == "")]
empty_doc_ids = {doc['_id'] for doc in docs if doc['text'] is None or doc['text'] == ""}
print(f"length of empty docs: {len(empty_doc_ids)}")
print(f"length of docs with text: {len(has_text_docs)}")

# Overwrite fiqa/corpus.jsonl with only the documents that have text,
# one JSON object per line.
with open("/home/guest/r12922050/GitHub/d2qplus/data/fiqa/corpus.jsonl", "w") as f:
    f.writelines(json.dumps(doc) + "\n" for doc in has_text_docs)

length of empty docs: 38
length of docs with text: 57600


In [34]:
import pandas as pd

# The file's own header row supplies the column names
# (query-id, corpus-id, score).
test_qrels = pd.read_csv(
    "/home/guest/r12922050/GitHub/d2qplus/data/fiqa/qrels/test.tsv",
    sep="\t",
)

# Distinct query ids that appear in the test qrels.
test_qid_set = set(test_qrels["query-id"].to_numpy())
print(f"length of test_qid_set: {len(test_qid_set)}")
test_qrels

length of test_qid_set: 648


Unnamed: 0,query-id,corpus-id,score
0,8,566392,1
1,8,65404,1
2,15,325273,1
3,18,88124,1
4,26,285255,1
...,...,...,...
1700,11039,330058,1
1701,11039,91183,1
1702,11054,155053,1
1703,11054,321015,1


In [35]:
# Ids of documents with score == 1. pandas auto-detects corpus-id as
# integers, so the ids are cast to str before being collected into the set.
positive_doc_ids = set(
    test_qrels.loc[test_qrels["score"] == 1, "corpus-id"].astype(str).to_numpy()
)
len(positive_doc_ids)

1705

代表 648 個 testing queries 對應到 1705 個 documents

控制 small dataset 大小為 5000 documents (因此扣除 1705 positive documents, 要再 random sample 3295 個 negative documents)

In [36]:
# Index the corpus by document id.
docid2doc = {doc["_id"]: doc for doc in docs}

# Collect the positive documents that exist in the corpus and have text.
split_docs = []
# sorted(): iteration order of a set of strings can change between interpreter
# runs, which would make the order of the written corpus irreproducible.
for doc_id in sorted(positive_doc_ids):
    # Look the id up BEFORE touching the document: indexing docid2doc first
    # raised KeyError for ids missing from the corpus, so the membership
    # check that followed could never take effect.
    doc = docid2doc.get(doc_id)
    if doc is None:
        print(f"doc_id {doc_id} not found in corpus, skipping")
        continue
    if not doc["text"]:  # None or "" both count as empty
        print(f"doc_id {doc_id} has empty text, skipping")
        continue
    split_docs.append(doc)
print(f"length of split_docs: {len(split_docs)}")

length of split_docs: 1705


In [37]:
# start sampling
import random

GOAL_DOC_NUM = 5000 
random.seed(24)  # for reproducibility

docid_left = set(docid2doc.keys()) - positive_doc_ids
assert len(docid_left) == len(docid2doc) - len(positive_doc_ids)

sampled_docs = random.sample(docid_left, GOAL_DOC_NUM - len(positive_doc_ids))
for doc_id in sampled_docs:
    if docid2doc[doc_id]['text'] != "":
        split_docs.append(docid2doc[doc_id])
print(f"length of split_docs after sampling: {len(split_docs)}")

length of split_docs after sampling: 5000


since Python 3.9 and will be removed in a subsequent version.
  sampled_docs = random.sample(docid_left, GOAL_DOC_NUM - len(positive_doc_ids))


In [38]:
# Write the selected documents to the fiqa-{GOAL_DOC_NUM} corpus file,
# one JSON object per line.
with open(f"/home/guest/r12922050/GitHub/d2qplus/data/fiqa-{GOAL_DOC_NUM}/corpus.jsonl", "w") as f:
    f.writelines(json.dumps(doc) + "\n" for doc in split_docs)

test.tsv to test.trec

In [40]:
test_csv = pd.read_csv("/home/guest/r12922050/GitHub/d2qplus/data/fiqa-5000/qrels/test.tsv", sep="\t")

# One line per qrels row in the form "<query-id> 0 <corpus-id> <score>".
test_trec = [
    f"{row['query-id']} 0 {row['corpus-id']} {row['score']}"
    for _, row in test_csv.iterrows()
]

with open("/home/guest/r12922050/GitHub/d2qplus/data/fiqa-5000/qrels/test.trec", "w") as f:
    f.writelines(line + "\n" for line in test_trec)

## construct few-shot prompt for FiQA

In [4]:
import json

import pandas as pd

DATASET = "fiqa"

# Dev-split qrels; the column names come from the file's header row.
dev_qrels = pd.read_csv(f"/home/guest/r12922050/GitHub/d2qplus/data/{DATASET}/qrels/dev.tsv", sep="\t")
display(dev_qrels)

# Corpus: list of parsed documents plus a lookup from "_id" to document.
with open(f"/home/guest/r12922050/GitHub/d2qplus/data/{DATASET}/corpus.jsonl", "r") as f:
    docs = list(map(json.loads, f))
docid2doc = dict((doc["_id"], doc) for doc in docs)

# Queries: list of parsed queries plus a lookup from "_id" to query text.
with open(f"/home/guest/r12922050/GitHub/d2qplus/data/{DATASET}/queries.jsonl", "r") as f:
    queries = list(map(json.loads, f))
queryid2query = dict((query["_id"], query['text']) for query in queries)

print(f"length of docs: {len(docid2doc)}, Example doc: {docs[0]}")
print(f"length of queries: {len(queryid2query)}, Example query: {queries[0]}")

Unnamed: 0,query-id,corpus-id,score
0,1,14255,1
1,2,308938,1
2,3,296717,1
3,3,100764,1
4,3,314352,1
...,...,...,...
1233,11023,419298,1
1234,11023,73239,1
1235,11023,154236,1
1236,11023,532225,1


length of docs: 57600, Example doc: {'_id': '3', 'title': '', 'text': "I'm not saying I don't like the idea of on-the-job training too, but you can't expect the company to do that. Training workers is not their job - they're building software. Perhaps educational systems in the U.S. (or their students) should worry a little about getting marketable skills in exchange for their massive investment in education, rather than getting out with thousands in student debt and then complaining that they aren't qualified to do anything.", 'metadata': {}}
length of queries: 6648, Example query: {'_id': '0', 'text': 'What is considered a business expense on a business trip?', 'metadata': {}}


In [10]:
# random pick 8 queries from dev_qrels and corresponding corpus-ids
import random
random.seed(42)  # for reproducibility
sampled_qrels = dev_qrels.sample(n=8, random_state=42)
sampled_qrels = sampled_qrels[["query-id", "corpus-id"]]
sampled_qrels["query-text"] = sampled_qrels["query-id"].apply(lambda x: queryid2query[str(x)])
sampled_qrels["corpus-text"] = sampled_qrels["corpus-id"].apply(lambda x: docid2doc[str(x)]["text"])

# print the sampled_qrels
with open("/home/guest/r12922050/GitHub/d2qplus/prompts/promptagator/fiqa_few_shot_examples.jsonl", "w") as f:
    for index, row in sampled_qrels.iterrows():
        query_id = str(row["query-id"])
        query_text = row["query-text"]
        doc_id = str(row["corpus-id"])
        doc_text = row["corpus-text"]
        example = {
            "query_id": query_id,
            "query_text": query_text,
            "doc_id": doc_id,
            "doc_text": doc_text
        }
        f.write(json.dumps(example) + "\n")

Query ID: 4905, Query Text: what are the pros and cons of structured deposits?
Corpus ID: 316645, Corpus Text: "Say we are in 'normal times.' Passbook rates are 5% or so. Longer rates, 6-7%. I offer you a product with these terms, for $10,000 I will return a ""Guaranteed"" $10,000 in 6 years and based on the stock market, 1% for every 2% the S&P is up beyond 10% at maturity. As the seller of this product, I take $6666, and buy a fixed investment, 6 years at 7% in treasuries will return the $10000. Really. I then take the $3334 and buy out of the money calls on the S&P each year to capture the gains, if any, and to deliver on my promise.  This is one example of a structured deposit offering. They can have nearly any terms one can imagine. Tied to any product. S&P, Crude Oil, Gold. Whatever."

Query ID: 1747, Query Text: Is it bad etiquette to use a credit or debit card to pay for single figure amounts at the POS
Corpus ID: 560557, Corpus Text: Generally, I consider it bad etiquette to inconvenience others.   I would recommend cash for small purchases. Try to offer as close to the required amount as possible.  Don't pay with several dollars worth of change if you can avoid it. You shouldn't need to carry a lot of cash.  When you do don't make it obvious.

Query ID: 6786, Query Text: Should I invest in the pre-IPO company stock offered by my employer?
Corpus ID: 393164, Corpus Text: Should I invest money in the pre-IPO stocks soon to be offered by the   company that I work for? Is it wise to do this? What should I be thinking about? What are the   risks? The last time I was offered pre-IPO friends and family stock, I purchased half of my allotment, and had my parents purchase the other half. Since I had a 6-month blackout period, I had to hold my portion. My parents sold their portion one day after the IPO. The price went up dramatically for about a day and a half, then dived continuously. My portion ended up being worthless. My parents made a few bucks. Good for them. Not a huge deal either way, since my cost was relatively low. If I had a chance to do it again, I'd give it all to friends or family instead of splitting it, and have them sell quickly if they realized a profit. You might be luckier than I was.

Query ID: 7510, Query Text: Should I continue to invest in an S&P 500 index fund?
Corpus ID: 106128, Corpus Text: "You have a good thing going. One of the luxuries of being invested in an index fund for the long term is that you don't have to sweat the inevitable short term dips in the market. Instead, look at the opportunity that presents itself on market dips: now your monthly investment is getting in at a lower price. ""Buy low, sell high."" ""Don't lose money."" These are common mantras for long term investment mentality. 5-8 years is plenty of time -- I'd call it ""medium-term"". As you get closer to your goals (~2-3 years out) you should start slowly moving money out of your index fund and start dollar cost averaging out into cash or short-term bonds (but that's another question). Keep putting money in, wait, and sell high. If it's not high, wait another year or two to buy the house. A lot of people do the opposite for their entire lives: buying high, panic selling on the dips, then buying again when it goes up. That's bad! I recommend a search on ""dollar cost averaging"", which is exactly what you are doing right now with your monthly investments."

Query ID: 7824, Query Text: At what point is the contents of a trust considered to be the property of the beneficiary?
Corpus ID: 210713, Corpus Text: No, you will not have to pay taxes on the corpus (principal) of the trust distribution.  If the trust tax forms were filed correctly, you might have as much as a $9000 loss that will flow to you on the trust's termination.  Previously, the trust was supposed to file a return each year, and either claim the dividends or realized cap gains each year, and pay taxes at trust's rate, or distribute them to the beneficiaries via K-1 form. This is the best way to handle this as the trust has a steep tax table (relative high rates) vs the kiddie tax which would let you get nearly $1K/yr tax free each year as a minor.  During that time, losses net again gains, but can't be 'distributed' to the beneficiary. They are carried forward year to year. In the year the trust is terminated, that loss is not lost, but it's then passed on to the beneficiary, still via K-1. See Schedule K-1 instructions and Schedule K-1 itself.  On a lighter note, the trustee failed you. In the 16 years (Jan 2000-Dec 2015), the market (S&P) grew by 88%, with a compound 4.02%/yr return. Instead of any gain, you got a loss with a -2.75%/yr return. If this were a paid professional, you'd have a potential claim for a lawsuit. This is a reason why amateurs should not be assigned the role of trustee.  To clearly answer the mix of questions you asked - Note - it's always a good idea to seek professional advice. But, the nature of this board is that if any of my answer isn't accurate, a high ranked member (top 20 or so on this list) will likely set me straight within 24 hours.

Query ID: 5996, Query Text: How do I choose between buying a car or buying a plot of land in Pakistan?
Corpus ID: 379189, Corpus Text: “The plot of land definitely is going to give better results in long term.”  Will it? Land is not guaranteed to go up in value. And a car can provide more employment opportunities for you. You need to look at your specific situation—with specific numbers—rather than using rules of thumb as hard guidelines.

Query ID: 6821, Query Text: Is this the right formula to use implied volatility to gauge probability of a stock being within a certain range?
Corpus ID: 331598, Corpus Text: To get the probability of hitting a target price you need a little more math and an assumption about the expected return of your stock.  First let's examine the parts of this expression. IV is the implied volatility of the option.  That means it's the volatility of the underlying that is associated with the observed option price.  As a practical matter, volatility is the standard deviation of returns, expressed in annualized terms.  So if the monthly standard deviation is Y, then Y*SQRT(12) is the volatility. From the above you can see that IV*SQRT(DaysToExpire/356) de-annualizes the volatility to get back to a standard deviation.  So you get an estimate of the expected standard deviation of the return between now and expiration. If you multiply this by the stock price, then you get what you have called X, which is the standard deviation of the dollars gained or lost between now and expiration.  Denote the price change by A (so that the standard deviation of A is X). Note that we seek the expression for the probability of hitting a target level, Q, so mathematically we want 1 - Pr( A < Q - StockPrice)  We do 1 minus the probability of being below this threshold because cumulative distribution functions always find the probability of being BELOW a threshold, not above. If you are using excel and assuming a mean of zero for returns, the probability of hitting or exceeding Q at expiration, then, is That's your answer for the probability of exceeding Q. Accuracy is in the eye of the beholder.  You'd have to specify a criterion by which to judge it to know the answer.  I'm sure more sophisticated methods exist that are more unbiased and have less error, but I think it's a fine first approximation.

Query ID: 806, Query Text: Trying to understand Return on Capital (Joel Greenblatt's Magic Formula version)
Corpus ID: 557607, Corpus Text: Just to clarify things: The Net Working Capital is the funds, the capital that will finance the everyday, the short term, operations of a company like buying raw materials, paying wages erc. So, Net Working Capital doesn't have a negative impact. And you should not see the liabilities as beneficial per se. It's rather the fact that with smaller capital to finance the short term operations the company is able to make this EBIT. You can see it as the efficiency of the company, the smaller the net working capital the more efficient the company is (given the EBIT). I hope you find it helpful, it's my first amswer here. Edit: why do you say the net working capital has a negative impact?
