In [13]:
import pandas as pd

# Load abstracts.csv into a DataFrame
df = pd.read_csv("pubmed_articles_first_9999.csv")

# Display the rows of the DataFrame
df

Unnamed: 0,PMID,Title,Abstract,Publication Year
0,38336420,The impact of austerity on children: Uncoverin...,Which children are most vulnerable when their ...,2024
1,38333244,Artificial intelligence-powered intraoperative...,No abstract available,2024
2,38333196,Helix-based screening with structure predictio...,The rapid development of drugs against emergin...,2024
3,38333180,Role of robotics and artificial intelligence i...,Artificial intelligence or AI may be identifie...,2023
4,38333148,Relationship between spiritual intelligence an...,The COVID-19 pandemic has caused physical and ...,2023
...,...,...,...,...
9992,37828543,Construction and effect evaluation of predicti...,This study intends to build an artificial inte...,2023
9993,37828481,Prevalence and associated risk factors for chr...,The global prevalence of chronic kidney diseas...,2023
9994,37828414,Effects of high-frequency transcranial magneti...,High-frequency rTMS has been widely used to im...,2023
9995,37828411,Application and prospects of AI-based radiomic...,Artificial intelligence (AI)-based radiomics h...,2023


In [14]:
def text_retrieval(query, corpus_df):
    """
    Perform text retrieval based on a user query against the document corpus DataFrame.

    Args:
        query (str): The user query.
        corpus_df (pandas.DataFrame): The DataFrame containing the document corpus.

    Returns:
        pandas.DataFrame: Subset of the corpus DataFrame containing relevant documents.
    """
    # Convert the query to lowercase for case-insensitive matching
    query = query.lower()

    # Filter the corpus DataFrame based on the query matching in Title or Abstract columns
    relevant_documents = corpus_df[corpus_df['Title'].str.lower().str.contains(query, na=False) |
                                   corpus_df['Abstract'].str.lower().str.contains(query, na=False)]

    return relevant_documents

In [21]:
def perform_faceted_search(start_year, end_year, corpus_df):
    """
    Perform faceted search based on publication year range against the document corpus DataFrame.

    Args:
        start_year (int): The start year of the publication time range.
        end_year (int): The end year of the publication time range.
        corpus_df (pandas.DataFrame): The DataFrame containing the document corpus.

    Returns:
        pandas.DataFrame: Subset of the corpus DataFrame containing relevant documents.
    """
    # Filter out rows with invalid or missing publication years
    corpus_df = corpus_df[corpus_df['Publication Year'].str.isdigit()]

    # Convert 'Publication Year' column to integer type
    corpus_df['Publication Year'] = corpus_df['Publication Year'].astype(int)

    # Filter the corpus DataFrame based on the publication year range
    relevant_documents = corpus_df[(corpus_df['Publication Year'] >= start_year) &
                                   (corpus_df['Publication Year'] <= end_year)]
    return relevant_documents

In [22]:
# Example usage:
start_year = 2020
end_year = 2022
faceted_search_result = perform_faceted_search(start_year, end_year, df)

# Display the faceted search result
print("Faceted Search Result:")
print(faceted_search_result)

Faceted Search Result:
          PMID                                              Title  \
237   38264472  Hybrid AC/DC architecture in the CE.D.E.R.-CIE...   
1424  38148789  Generalizability of an acute kidney injury pre...   
2467  38106819  Development of an interpretable machine learni...   
2681  38098756  Prediction of self-efficacy in recognizing dee...   
2884  38091509  AlcoR: alignment-free simulation, mapping, and...   
2885  38091508  MLcps: machine learning cumulative performance...   
2941  38089082  Applications of digital Medicine in oncology: ...   
4038  38046867  Geolocated Twitter-based population mobility i...   
4042  38046690  Venous thromboembolism prophylaxis and related...   
4073  38045518  A microcosting analysis of ICU expenditure in ...   
4283  38037555  Predictors of face mask use during the COVID-1...   
5286  38000912  Evaluating long-read de novo assembly tools fo...   
5474  37994356  FLEXGRID - A novel smart grid architecture tha...   
7514  37920

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corpus_df['Publication Year'] = corpus_df['Publication Year'].astype(int)
