In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from rapidfuzz import process, fuzz


# Widen pandas' display limits so large metadata tables render without
# being cut off in the notebook output.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [39]:
import os

# Years of the TREC proceedings to collect (2000 through 2022 inclusive)
years_list = list(range(2000, 2023))

# Base URL for TREC conference proceedings on the DBLP website
url = "https://dblp.org/db/conf/trec/trec"

# Counter for the TREC directory names: the first year (2000) maps to "trec9"
trec_count = 9

# Base path where the PDF files are located
base_path = "../../../data/PDF_data/TREC_NIST_Proceedings"

# One DBLP page URL per year
url_list = [url + str(pub_year) + ".html" for pub_year in years_list]

paths = []        # File paths of the PDFs that were found locally
df_rows = []      # Data rows for creating a DataFrame later
trec_titles = []  # Not populated in this cell; kept so the name stays defined

# File names that differ between the DBLP link and the local copy
filename_corrections = {
    "BacchinM05.html": "upadova.geo.pdf",
    "WebberSWZOFPDB10.html": "univ.melbourne.LEGAL.pdf",
    "Trec10NotebookPrager.pdf": "Trec10Prager.pdf",
    "pro-TJU_CS_IR_cs.pdf": "pro-TJU_CS_IR _cs.pdf",
}


def _find_local_pdf(search_root, directory, filename):
    """Return the path of `filename` inside a `directory` folder below `search_root`.

    Every folder named `directory` found while walking `search_root` is
    searched recursively; the first existing `<subfolder>/<filename>` wins.
    Returns None when no such file exists.
    """
    for root, dirs, _files in os.walk(search_root):
        if directory not in dirs:
            continue
        for root_sub, _dirs_sub, _files_sub in os.walk(os.path.join(root, directory)):
            candidate = os.path.join(root_sub, filename)
            if os.path.exists(candidate):
                return candidate
    return None


# Loop through each URL for processing
for page_url in url_list:

    # Extract the year from the URL (list of 4-digit matches; first one is the year)
    year = re.findall(r'\d{4}', page_url)

    # Fetch the page. The timeout keeps the cell from hanging forever, and
    # raise_for_status() stops an error page from being parsed as an empty year.
    resp = requests.get(page_url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, features="html.parser")

    # Directory name for this year's proceedings, e.g. "trec9"
    directory = f"trec{trec_count}"

    # Each 'li' with class 'entry inproceedings' describes one paper
    li_elements = soup.find_all('li', class_='entry inproceedings')

    for li in li_elements:
        # The first 'a' element carries the paper's URL. Without it no file
        # name can be derived, so skip the entry instead of silently reusing
        # the link of the previous paper.
        a_element = li.find('a')
        if a_element is None or not a_element.get('href'):
            print(f"Skipping an entry without a link on {page_url}")
            continue
        href_value = a_element['href']

        # Author names
        span_elements = li.find_all('span', attrs={"itemprop": "author", "itemtype": "http://schema.org/Person"})
        authors_list = [span.get_text() for span in span_elements]

        # Paper title; None (rather than the previous paper's title) when absent
        title_span = li.find('span', class_='title')
        title_text = title_span.get_text() if title_span is not None else None

        # Nearest preceding <header class="h2"> names the section of the page
        header_element = None
        for elem in li.previous_elements:
            if elem.name == 'header' and elem.get('class') == ['h2']:
                header_element = elem
                break
        header_text = header_element.get_text() if header_element is not None else "Uncategorized"

        # File name = last path segment of the link
        match = re.search(r'/([^/]+)$', href_value)
        if match is None:
            print(f"Skipping an entry whose link has no file name: {href_value}")
            continue
        filename = match.group(1)

        # Skip certain files that don't need to be processed
        if filename == "index.htm":
            continue

        # Handle specific filename corrections
        filename = filename_corrections.get(filename, filename)
        if int(year[0]) == 2001 and filename == "filtering_track.pdf":
            filename = "filtering2_track.pdf"

        # Look the file up in the local copy of the proceedings
        file_path = _find_local_pdf(base_path, directory, filename)
        if file_path is None:
            print(f"File '{filename}' not found in '{directory}' or its subdirectories.")
            continue

        paths.append(file_path)
        df_rows.append([year[0], href_value, authors_list, title_text, header_text, filename, file_path])

    # Increment the TREC directory counter for the next year
    trec_count += 1

In [40]:
# Turn the rows collected from DBLP into the (still incomplete) TREC metadata table
trec_columns = ["PubYear", "url", "Authors", "Title", "Section", "filename", "filepath"]
df_trec_incomplete = pd.DataFrame(df_rows, columns=trec_columns)

In [41]:
df_trec_incomplete

Unnamed: 0,PubYear,url,Authors,Title,Section,filename,filepath
0,2000,http://trec.nist.gov/pubs/trec9/papers/overvie...,"[Ellen M. Voorhees, Donna Harman]",Overview of the Ninth Text REtrieval Conferenc...,Uncategorized,overview_9.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...
1,2000,http://trec.nist.gov/pubs/trec9/papers/trec9-c...,"[Fredric C. Gey, Aitao Chen]",TREC-9 Cross-Language Information Retrieval (E...,Uncategorized,trec9-clir-overview.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...
2,2000,http://trec.nist.gov/pubs/trec9/papers/filteri...,"[Stephen E. Robertson, David A. Hull]",The TREC-9 Filtering Track Final Report.,Uncategorized,filtering_new.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...
3,2000,http://trec.nist.gov/pubs/trec9/papers/t9irep.pdf,"[William R. Hersh, Paul Over]",The TREC-9 Interactive Track Report.,Uncategorized,t9irep.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...
4,2000,http://trec.nist.gov/pubs/trec9/papers/liggett...,"[Walter Liggett, Chris Buckley]",Query Expansion Seen Through Return Order of R...,Uncategorized,liggett.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...
...,...,...,...,...,...,...,...
1937,2022,https://trec.nist.gov/pubs/trec31/papers/udel_...,"[Fumian Chen, Hui Fang]",An Exploration of Learning-to-re-rank Using a ...,Participant Papers,udel_fang.F.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...
1938,2022,https://trec.nist.gov/pubs/trec31/papers/udel_...,"[Dayu Yang, Yue Zhang, Hui Fang]",An Exploration Study of Mixed-initiative Query...,Participant Papers,udel_fang.C.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...
1939,2022,https://trec.nist.gov/pubs/trec31/papers/umcp....,"[Suraj Nair, Douglas W. Oard]",Probabilistic Structured Queries: The Universi...,Participant Papers,umcp.N.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...
1940,2022,https://trec.nist.gov/pubs/trec31/papers/umcp....,"[Nathaniel W. Rollings, Peter A. Rankel, Dougl...",Multi-Faceted Question Fusion in the TREC 2022...,Participant Papers,umcp.R.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...


In [46]:
# Temporary IDs "trec_1", "trec_2", ... numbered by row position (starting at 1)
temporary_id = [f"trec_{position}" for position in range(1, len(df_trec_incomplete) + 1)]

# Store them in the new column "ID temp"
df_trec_incomplete["ID temp"] = temporary_id

In [47]:
df_trec_incomplete

Unnamed: 0,PubYear,url,Authors,Title,Section,filename,filepath,ID temp
0,2000,http://trec.nist.gov/pubs/trec9/papers/overvie...,"[Ellen M. Voorhees, Donna Harman]",Overview of the Ninth Text REtrieval Conferenc...,Uncategorized,overview_9.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_1
1,2000,http://trec.nist.gov/pubs/trec9/papers/trec9-c...,"[Fredric C. Gey, Aitao Chen]",TREC-9 Cross-Language Information Retrieval (E...,Uncategorized,trec9-clir-overview.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_2
2,2000,http://trec.nist.gov/pubs/trec9/papers/filteri...,"[Stephen E. Robertson, David A. Hull]",The TREC-9 Filtering Track Final Report.,Uncategorized,filtering_new.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_3
3,2000,http://trec.nist.gov/pubs/trec9/papers/t9irep.pdf,"[William R. Hersh, Paul Over]",The TREC-9 Interactive Track Report.,Uncategorized,t9irep.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_4
4,2000,http://trec.nist.gov/pubs/trec9/papers/liggett...,"[Walter Liggett, Chris Buckley]",Query Expansion Seen Through Return Order of R...,Uncategorized,liggett.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_5
...,...,...,...,...,...,...,...,...
1937,2022,https://trec.nist.gov/pubs/trec31/papers/udel_...,"[Fumian Chen, Hui Fang]",An Exploration of Learning-to-re-rank Using a ...,Participant Papers,udel_fang.F.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_1938
1938,2022,https://trec.nist.gov/pubs/trec31/papers/udel_...,"[Dayu Yang, Yue Zhang, Hui Fang]",An Exploration Study of Mixed-initiative Query...,Participant Papers,udel_fang.C.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_1939
1939,2022,https://trec.nist.gov/pubs/trec31/papers/umcp....,"[Suraj Nair, Douglas W. Oard]",Probabilistic Structured Queries: The Universi...,Participant Papers,umcp.N.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_1940
1940,2022,https://trec.nist.gov/pubs/trec31/papers/umcp....,"[Nathaniel W. Rollings, Peter A. Rankel, Dougl...",Multi-Faceted Question Fusion in the TREC 2022...,Participant Papers,umcp.R.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_1941


In [55]:
import os

# Root folder of the locally stored proceedings
base_dir = '../../../data/PDF_data/TREC_NIST_Proceedings'

# Collect the full path of every file below base_dir (recursive walk)
file_paths = [
    os.path.join(folder, entry)
    for folder, _subfolders, entries in os.walk(base_dir)
    for entry in entries
]

# Show every path that was found, one per line
for found_path in file_paths:
    print(found_path)

D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec10\Alicante__TREC-10_Paper.pdf
D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec10\articleUdeM.pdf
D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec10\BBNTREC2001.pdf
D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec10\berkeley_trec10.pdf
D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec10\CASICT.pdf
D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec10\CLARIT_TREC-2001_Filtering_Final.pdf
D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec10\clips-imag-bin.pdf
D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec10\clirtrack.pdf
D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec10\clresearch-t10.pdf
D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec10\cmu-dir-le

In [57]:
dblp_matched = set(paths)       # files matched through a DBLP entry
local_files = set(file_paths)   # files present in the local directory scan

# Paths that came from the DBLP matching but are missing from the local scan
only_found_on_dblp = list(dblp_matched - local_files)

# Local files (downloaded from the TREC initiative website) that no DBLP entry matched
only_found_on_locally = list(local_files - dblp_matched)

print(f"Strings, die nur in list1 vorkommen: {only_found_on_dblp}")
print(f"Strings, die nur in list2 vorkommen: {only_found_on_locally}")

Strings, die nur in list1 vorkommen: []
Strings, die nur in list2 vorkommen: ['D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\\trec25\\Participant\\AKSW-QA.pdf', 'D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\\trec11\\tsinghuau.web2.pdf', 'D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\\trec28\\Overview\\OVERVIEW.D.pdf', 'D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\\trec21\\Participant\\uog_tw.microblog.final.pdf', 'D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\\trec10\\web2001.ps.gz', 'D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\\trec25\\Participant\\CLIP-QA-RT.pdf', 'D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\\trec27\\Participant\\anserini-CTR-CC-N.pdf', 'D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\\trec19\\Participant\\york.univ.chem.pdf', 'D:/Studium/Mast

In [64]:
extra_rows = []

# Ask the user for the metadata of every local file that had no DBLP entry
for i in only_found_on_locally:
    # Print the file path for review
    print(i, "\n")

    # Locate the "trec<number>" directory in the path; one or two digits are
    # accepted so that "trec9" is handled as well as "trec10".."trec31"
    match = re.search(r'(trec(\d{1,2}))', i)
    if match is None:
        print(f"Skipping '{i}': no 'trec<number>' directory in the path")
        continue

    # Two-digit year offset (e.g. trec25 -> 16); a later cell adds 2000
    number = int(match.group(2))
    year = number - 9

    # File name = last path component, whichever separator the path uses
    filename = re.split(r'[\\/]', i)[-1]

    # Construct the URL based on the extracted TREC directory and file name
    url = f"https://trec.nist.gov/pubs/{match.group(1)}/papers/{filename}"
    print(url)

    # Labelled prompts make the expected order of the three answers explicit.
    # "+" (or an empty answer) keeps the generated URL.
    url_new = input("URL ('+' keeps the generated URL): ").strip()
    if url_new not in ("", "+"):
        url = url_new

    # Comma-separated author names -> list, ignoring empty pieces
    authors_input = input("Authors (comma-separated): ")
    authors = [name.strip() for name in authors_input.split(',') if name.strip()]

    # Title of the document
    title = input("Title: ").strip()

    # Determine the section based on the presence of certain keywords in the file path
    if "Participant" in i:
        section = "Participant"
    elif "Overview" in i:
        section = "Overview"
    else:
        section = "Uncategorized"

    # Append the collected data as a new row to the extra_rows list
    extra_rows.append([year, url, authors, title, section, filename, i])

D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec25\Participant\AKSW-QA.pdf 

https://trec.nist.gov/pubs/trec25/papers/AKSW-QA.pdf


 +
 Edgard Marx, Sandro Coelho
 Answering Live Questions from Heterogeneous Data Sources SMART in Live QA at TREC 2016


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec11\tsinghuau.web2.pdf 

https://trec.nist.gov/pubs/trec11/papers/tsinghuau.web2.pdf


 +
 Min Zhang, Ruihua Song, Chuan Lin, Shaoping Ma, Zhe Jiang, Yijiang Jin, Yiqun Liu, Le Zhao
 THU TREC2002 Web Track Experiments*


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec28\Overview\OVERVIEW.D.pdf 

https://trec.nist.gov/pubs/trec28/papers/OVERVIEW.D.pdf


 +
 Mustafa Abualsaud, Christina Lioma, Maria Maistro, Mark D. Smucker, Guido Zuccon
 Overview of the TREC 2019 Decision Track


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec21\Participant\uog_tw.microblog.final.pdf 

https://trec.nist.gov/pubs/trec21/papers/uog_tw.microblog.final.pdf


 +
 Jesus A. Rodriguez Perez, Andrew J. McMinn, Joemon M. Jose
 University of Glasgow (uog_tw) at TREC Microblog 2012


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec10\web2001.ps.gz 

https://trec.nist.gov/pubs/trec10/papers/web2001.ps.gz


 +
 David Hawking, Nick Craswell
 Overview of the TREC-2001 Web Track


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec25\Participant\CLIP-QA-RT.pdf 

https://trec.nist.gov/pubs/trec25/papers/CLIP-QA-RT.pdf


 +
 Mossaab Bagdouri, Douglas W. Oard
 CLIP at TREC 2016: LiveQA and RTS


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec27\Participant\anserini-CTR-CC-N.pdf 

https://trec.nist.gov/pubs/trec27/papers/anserini-CTR-CC-N.pdf


 +
 Peilin Yang, Jimmy Lin
 Anserini at TREC 2018: CENTRE, Common Core, and News Tracks


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec19\Participant\york.univ.chem.pdf 

https://trec.nist.gov/pubs/trec19/papers/york.univ.chem.pdf


 +
 Jiashu Zhao, Xiangji Huang, Zheng Ye
 York University at TREC 2010: Chemical Track


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec31\Participant\NLE.D.pdf 

https://trec.nist.gov/pubs/trec31/papers/NLE.D.pdf


 +
 Carlos Lassance, Stephane Clinchant
 Naver Labs Europe (SPLADE) @ TREC Deep Learning 2022


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec28\Participant\h2oloo.DL.pdf 

https://trec.nist.gov/pubs/trec28/papers/h2oloo.DL.pdf


 +
 Zeynep Akkalyoncu Yilmaz, Shengjin Wang, Jimmy Lin
 H2oloo at TREC 2019: Combining Sentence and Document Evidence in the Deep Learning Track


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec27\Overview\Overview-CENTRE.pdf 

https://trec.nist.gov/pubs/trec27/papers/Overview-CENTRE.pdf


 +
 Ian Soboroff, Nicola Ferro, Maria Maistro, Tetsuya Sakai
 Overview of the TREC 2018 CENTRE Track


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec28\Overview\OVERVIEW.FR.pdf 

https://trec.nist.gov/pubs/trec28/papers/OVERVIEW.FR.pdf


 +
 Asia J. Biega, Fernando Diaz, Michael D. Ekstrand, Sebastian Kohlmeier
 Overview of the TREC 2019 Fair Ranking Track∗


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec27\Participant\BiTeM-PM.pdf 

https://trec.nist.gov/pubs/trec27/papers/BiTeM-PM.pdf


 +
 Emilie Pasche, Paul van Rijen, Julien Gobeill, Anaïs Mottaz, Luc Mottin, Patrick Ruch
 SIB Text Mining at TREC 2018 Precision Medicine Track


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec27\Participant\DLR_DW_BWS-IS.pdf 

https://trec.nist.gov/pubs/trec27/papers/DLR_DW_BWS-IS.pdf


 +
 Anna Kruspe, Jens Kersten, Matti Wiegmann, Benno Stein, Friederike Klan
 Classification of Incident-related Tweets: Tackling Imbalanced Training Data using Hybrid CNNs and Translation-based Data Augmentation


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec28\Overview\OVERVIEW.N.pdf 

https://trec.nist.gov/pubs/trec28/papers/OVERVIEW.N.pdf


 +
 Ian Soboroff, Shudong Huang, Donna Harman
 TREC 2019 News Track Overview


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec25\Participant\IRIT-RT.pdf 

https://trec.nist.gov/pubs/trec25/papers/IRIT-RT.pdf


 +
 Bilel Moulahi, Lamjed Ben Jabeur, Abdelhamid Chellal, Thomas Palmer, Lynda Tamine, Mohand Boughanem, Karen Pinel-Sauvagnat, Gilles Hubert
 IRIT at TREC Real Time Summarization 2016


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec26\Participant\IRLAB_DA-IICT-RT.pdf 

https://trec.nist.gov/pubs/trec26/papers/IRLAB_DA-IICT-RT.pdf


 +
 Sandip Modha, Chintak Mandalia, Shyamal Shahshah, Sahil Kewlani, Bhavya Shah, Deep Doshi, Prasenjit Majumder
 DAIICT-LDRP at TREC RTS 2017: Real Time Push Notification and Post Summarization


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec28\Participant\CLAC_NEWS.News.pdf 

https://trec.nist.gov/pubs/trec28/papers/CLAC_NEWS.News.pdf


 +
 Pavel Khloponin, Leila Kosseim
 The CLaC System at the TREC 2019 News Track


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec21\Participant\UoE.session.final.pdf 

https://trec.nist.gov/pubs/trec21/papers/UoE.session.final.pdf


 +
 M-Dyaa Albakour, Udo Kruschwitz
 University of Essex at the TREC 2012 Session Track


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec28\Participant\UAGPLSI.IS.pdf 

https://trec.nist.gov/pubs/trec28/papers/UAGPLSI.IS.pdf


 +
 Javi Fernández, Fernando Llopis, Patricio Martínez-Barco, José M. Gómez
 GPLSI at TREC 2019 Incident Streams Track


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec28\Overview\OVERVIEW.CAsT.pdf 

https://trec.nist.gov/pubs/trec28/papers/OVERVIEW.CAsT.pdf


 +
 Jeffrey Dalton, Chenyan Xiong, Jamie Callan
 CAsT 2019: The Conversational Assistance Track Overview


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec25\Participant\SCIAICLTeam-CS.pdf 

https://trec.nist.gov/pubs/trec25/papers/SCIAICLTeam-CS.pdf


 Tristan Canova, Daniel Carpenter, Kevin Danaher, Neil Devine, Darren Lim
 Siena College’s Institute of Artificial Intelligence TREC 2016 Contextual Suggestion Track
 -


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec28\Overview\OVERVIEW.DL.pdf 

https://trec.nist.gov/pubs/trec28/papers/OVERVIEW.DL.pdf


 +
 Nick Craswell, Bhaskar Mitra, Emine Yilmaz, Daniel Campos, Ellen M. Voorhees
 Overview of the TREC 2019 Deep Learning Track


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec25\Participant\IRIT-CL.pdf 

https://trec.nist.gov/pubs/trec25/papers/IRIT-CL.pdf


 +
 Gia-Hung Nguyen, Laure Soulier, Lynda Tamine, Nathalie Bricon-Souf
 IRIT @ TREC 2016 Clinical Decision Support Track


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec21\Participant\UTAustin.crowd.final.pdf 

https://trec.nist.gov/pubs/trec21/papers/UTAustin.crowd.final.pdf


 +
 Hyun Joon Jung, Matthew Lease
 UT Austin in the TREC 2012 Crowdsourcing Track’s Image Relevance Assessment Task


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec28\Overview\OVERVIEW.CAR.pdf 

https://trec.nist.gov/pubs/trec28/papers/OVERVIEW.CAR.pdf


 +
 Laura Dietz, John Foley
 TREC CAR Y3: Complex Answer Retrieval Overview


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec16\Participant\umelbourne.ngoc-ahn.MQ.final.pdf 

https://trec.nist.gov/pubs/trec16/papers/umelbourne.ngoc-ahn.MQ.final.pdf


 +
 William Webber, Vo Ngoc Anh, Alistair Moffat
 The University of Melbourne in the Million Query Track of TREC 2007


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec29\Overview\OVERVIEW.FR.pdf 

https://trec.nist.gov/pubs/trec29/papers/OVERVIEW.FR.pdf


 +
 Asia J. Biega, Fernando Diaz, Michael D. Ekstrand, Sergey Feldman, Sebastian Kohlmeier
 Overview of the TREC 2020 Fair Ranking Track∗


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec26\Participant\NOVASearch-PM.pdf 

https://trec.nist.gov/pubs/trec26/papers/NOVASearch-PM.pdf


 +
 Gonçalo Araújo, André Mourão, João Magalhães
 NOVASearch at Precision Medicine 2017


D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\trec21\Participant\IBM.microblog.final.pdf 

https://trec.nist.gov/pubs/trec21/papers/IBM.microblog.final.pdf


 +
 Myle Ott, Vittorio Castelli, Hema Raghavan, Radu Florian
 IBM at TREC 2012: Microblog Track


In [69]:
extra_rows

[[16,
  'https://trec.nist.gov/pubs/trec25/papers/AKSW-QA.pdf',
  ['Edgard Marx', 'Sandro Coelho'],
  'Answering Live Questions from Heterogeneous Data Sources SMART in Live QA at TREC 2016',
  'Participant',
  'A',
  'D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\\trec25\\Participant\\AKSW-QA.pdf'],
 [2,
  'https://trec.nist.gov/pubs/trec11/papers/tsinghuau.web2.pdf',
  ['Min Zhang',
   'Ruihua Song',
   'Chuan Lin',
   'Shaoping Ma',
   'Zhe Jiang',
   'Yijiang Jin',
   'Yiqun Liu',
   'Le Zhao'],
  'THU TREC2002 Web Track Experiments*',
  'Uncategorized',
  't',
  'D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\\trec11\\tsinghuau.web2.pdf'],
 [19,
  'https://trec.nist.gov/pubs/trec28/papers/OVERVIEW.D.pdf',
  ['Mustafa Abualsaud',
   'Christina Lioma',
   'Maria Maistro',
   'Mark D. Smucker',
   'Guido Zuccon'],
  'Overview of the TREC 2019 Decision Track',
  'Overview',
  'O',
  'D:/Studium/Masterarbeit/PythonProject/data/PDF_

In [71]:
# Turn the two-digit year offsets stored in extra_rows into full years.
# The guard keeps the cell idempotent: values that are already >= 2000 are
# left alone, so re-running the cell does not add 2000 a second time.
for row in extra_rows:
    if row[0] < 2000:
        row[0] += 2000

In [74]:
# Recompute the file name (index 5) of every manually added row from its
# file path (index 6). Splitting on both "\" and "/" keeps this working
# regardless of which separator the stored path uses.
for row in extra_rows:
    row[5] = re.split(r'[\\/]', row[6])[-1]


In [75]:
extra_rows

[[2016,
  'https://trec.nist.gov/pubs/trec25/papers/AKSW-QA.pdf',
  ['Edgard Marx', 'Sandro Coelho'],
  'Answering Live Questions from Heterogeneous Data Sources SMART in Live QA at TREC 2016',
  'Participant',
  'AKSW-QA.pdf',
  'D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\\trec25\\Participant\\AKSW-QA.pdf'],
 [2002,
  'https://trec.nist.gov/pubs/trec11/papers/tsinghuau.web2.pdf',
  ['Min Zhang',
   'Ruihua Song',
   'Chuan Lin',
   'Shaoping Ma',
   'Zhe Jiang',
   'Yijiang Jin',
   'Yiqun Liu',
   'Le Zhao'],
  'THU TREC2002 Web Track Experiments*',
  'Uncategorized',
  'tsinghuau.web2.pdf',
  'D:/Studium/Masterarbeit/PythonProject/data/PDF_data/TREC_NIST_Proceedings\\trec11\\tsinghuau.web2.pdf'],
 [2019,
  'https://trec.nist.gov/pubs/trec28/papers/OVERVIEW.D.pdf',
  ['Mustafa Abualsaud',
   'Christina Lioma',
   'Maria Maistro',
   'Mark D. Smucker',
   'Guido Zuccon'],
  'Overview of the TREC 2019 Decision Track',
  'Overview',
  'OVERVIEW.D.pdf',
  '

In [79]:
# Build a DataFrame from the manually entered rows. It uses the same columns
# as the DBLP table except "ID temp"; that column is excluded by name so the
# result does not depend on it being the last column (or on it existing).
dataframe_addition = pd.DataFrame(
    extra_rows,
    columns=[column for column in df_trec_incomplete.columns if column != "ID temp"],
)


In [80]:
dataframe_addition

Unnamed: 0,PubYear,url,Authors,Title,Section,filename,filepath
0,2016,https://trec.nist.gov/pubs/trec25/papers/AKSW-...,"[Edgard Marx, Sandro Coelho]",Answering Live Questions from Heterogeneous Da...,Participant,AKSW-QA.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...
1,2002,https://trec.nist.gov/pubs/trec11/papers/tsing...,"[Min Zhang, Ruihua Song, Chuan Lin, Shaoping M...",THU TREC2002 Web Track Experiments*,Uncategorized,tsinghuau.web2.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...
2,2019,https://trec.nist.gov/pubs/trec28/papers/OVERV...,"[Mustafa Abualsaud, Christina Lioma, Maria Mai...",Overview of the TREC 2019 Decision Track,Overview,OVERVIEW.D.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...
3,2012,https://trec.nist.gov/pubs/trec21/papers/uog_t...,"[Jesus A. Rodriguez Perez, Andrew J. McMinn, J...",University of Glasgow (uog_tw) at TREC Microbl...,Participant,uog_tw.microblog.final.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...
4,2001,https://trec.nist.gov/pubs/trec10/papers/web20...,"[David Hawking, Nick Craswell]",Overview of the TREC-2001 Web Track,Uncategorized,web2001.ps.gz,D:/Studium/Masterarbeit/PythonProject/data/PDF...
5,2016,https://trec.nist.gov/pubs/trec25/papers/CLIP-...,"[Mossaab Bagdouri, Douglas W. Oard]",CLIP at TREC 2016: LiveQA and RTS,Participant,CLIP-QA-RT.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...
6,2018,https://trec.nist.gov/pubs/trec27/papers/anser...,"[Peilin Yang, Jimmy Lin]","Anserini at TREC 2018: CENTRE, Common Core, an...",Participant,anserini-CTR-CC-N.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...
7,2010,https://trec.nist.gov/pubs/trec19/papers/york....,"[Jiashu Zhao, Xiangji Huang, Zheng Ye]",York University at TREC 2010: Chemical Track,Participant,york.univ.chem.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...
8,2022,https://trec.nist.gov/pubs/trec31/papers/NLE.D...,"[Carlos Lassance, Stephane Clinchant]",Naver Labs Europe (SPLADE) @ TREC Deep Learnin...,Participant,NLE.D.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...
9,2019,https://trec.nist.gov/pubs/trec28/papers/h2olo...,"[Zeynep Akkalyoncu Yilmaz, Shengjin Wang, Jimm...",H2oloo at TREC 2019: Combining Sentence and Do...,Participant,h2oloo.DL.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...


In [81]:
# Combine metadata from DBLP and manually added metadata.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it both
# inputs contribute the labels 0, 1, 2, ..., and any later label-based write
# (e.g. `.at[index, ...]`) would hit two rows at once.
df_trec_complete = pd.concat([df_trec_incomplete, dataframe_addition], axis=0, ignore_index=True)

In [92]:
def replace_backslashes(path):
    """Return `path` with every backslash replaced by a forward slash."""
    backslash = re.compile(r'\\')
    return backslash.sub('/', path)

# Normalise the file paths: replace every backslash with a forward slash.
# (The right-hand side previously read from the misspelled, undefined name
# `df_trec_commplete`.)
df_trec_complete['filepath'] = df_trec_complete['filepath'].apply(replace_backslashes)

In [106]:
# Remove the temporary ID column. The source frame is `df_trec_complete`
# (the original line read from the misspelled `df_trec_commplete`);
# errors="ignore" lets the cell be re-run after the column is already gone.
df_trec_complete = df_trec_complete.drop(columns="ID temp", errors="ignore")

In [None]:
# Permanent IDs "trec_1", "trec_2", ... numbered by row position (starting at 1)
permanent_id = [f"trec_{position}" for position in range(1, len(df_trec_complete) + 1)]

# Store them in the new column "ID"
df_trec_complete["ID"] = permanent_id

In [104]:
# Point PostScript entries (".ps.gz" / ".ps") at their ".pdf" counterpart,
# leaving any path that contains "rmit.ps" untouched.
# The column is rebuilt positionally, which (a) writes to `df_trec_complete`
# instead of the misspelled, undefined `df_trec_commplete` and (b) does not
# rely on index labels being unique.
# NOTE(review): the replacement matches ".ps" anywhere in the path, not only
# as a file extension - confirm that no other path contains that substring.
df_trec_complete['filepath'] = [
    file_path if 'rmit.ps' in file_path
    else file_path.replace('.ps.gz', '.pdf').replace('.ps', '.pdf')
    for file_path in df_trec_complete['filepath']
]

In [111]:
# Cast the publication year column to integers.
# NOTE(review): `combined_df` is not defined in any cell visible in this
# notebook; presumably it refers to the combined TREC metadata frame built
# above - confirm and define or rename it so the cell runs on a fresh kernel.
combined_df['PubYear'] = combined_df['PubYear'].astype(int)

In [112]:
# Write the metadata table to a parquet file.
# NOTE(review): `combined_df` is not defined in any cell visible in this
# notebook - confirm which frame is meant to be persisted here.
combined_df.to_parquet("../../../data/metadata_TREC.parquet")