In [None]:
import openai
import pandas as pd

1 - Provide Path to your API Key

In [None]:
# Path to the text file holding the OpenAI API key; only its first line is read
# (see AbstractEvaluatorDocumented.read_api_key).
# NOTE(review): machine-specific absolute path — point this at your own key file before running.
openai_key_path = r'/Users/cu135/Library/CloudStorage/OneDrive-Personal/OneDrive_Documents/Work/Software/Research/nimlab/openai_key.txt'

2 - Split the .txt file of abstracts exported from a PubMed search into individual abstracts and save them to a CSV with one abstract per row.

In [None]:
import re
import pandas as pd

class AbstractSeparator:
    """
    Splits a text file of numbered abstracts into individual abstracts.

    An abstract is taken to start wherever a line begins with digits, a period
    and whitespace (e.g. "12. "), and to run until the next such line or the
    end of the file.

    Attributes:
    - content (str): Full text of the input file.
    - abstracts (list of str): Abstracts found by `separate_abstracts`; empty until it is called.

    Methods:
    - separate_abstracts: Splits `content` into abstracts and stores them in `abstracts`.
    - to_csv: Saves the abstracts to a CSV with a single "Abstract" column.
    - get_abstracts: Returns the list of separated abstracts.
    """

    # Start of a numbered entry. MULTILINE makes `^` match at the very beginning
    # of the file as well as after every newline, so an entry on the first line
    # is not lost.
    _ENTRY_START = re.compile(r'^\d+\.\s', re.MULTILINE)

    def __init__(self, file_path):
        """
        Reads the whole text file into memory.

        Parameters:
        - file_path (str): Path to the text file containing the numbered abstracts.
        """
        with open(file_path, 'r') as file:
            self.content = file.read()
        self.abstracts = []

    def separate_abstracts(self):
        """
        Separates the content into individual abstracts.

        Text before the first numbered entry is discarded. If the content has
        no numbered entries at all, `abstracts` is set to an empty list.
        """
        starts = [match.start() for match in self._ENTRY_START.finditer(self.content)]
        # Each entry ends where the next one starts; the last one runs to end of file.
        ends = starts[1:] + [len(self.content)]
        self.abstracts = [self.content[begin:end].strip() for begin, end in zip(starts, ends)]

    def to_csv(self, output_path):
        """
        Saves the separated abstracts to a CSV with one "Abstract" column.

        Parameters:
        - output_path (str): Path of the CSV file to write.
        """
        df = pd.DataFrame(self.abstracts, columns=["Abstract"])
        df.to_csv(output_path, index=False)

    def get_abstracts(self):
        """
        Returns the list of separated abstracts.

        Returns:
        - list of str: One stripped text chunk per numbered entry.
        """
        return self.abstracts

# Example usage:
# separator = AbstractSeparator("/path/to/your/textfile.txt")
# separator.separate_abstracts()
# separator.to_csv("/path/to/save/csvfile.csv")


3 - Only Evaluate Abstracts with Positive Title Hits (from notebook 00)

In [None]:
class TitleReviewFilter:
    """
    Keeps only the abstracts whose title passed the title screen.

    Rows of the title review table and rows of the abstracts table are matched
    by position: the row labels of the passing title rows are used as integer
    positions into the abstracts table.

    Methods:
    - load_data: Reads both CSVs and returns them as DataFrames.
    - filter_abstracts: Selects the abstracts whose title row has a 1 in the given column.
    - save_filtered_data: Writes the selected abstracts to a CSV.
    - get_filtered_dataframe: Returns the selected abstracts.
    """

    def __init__(self, title_review_path, abstracts_path):
        """
        Stores both paths and immediately loads the two CSVs.

        Parameters:
        - title_review_path (str): Path to the title review results CSV.
        - abstracts_path (str): Path to the abstracts CSV.
        """
        self.title_review_path = title_review_path
        self.abstracts_path = abstracts_path
        loaded_frames = self.load_data()
        self.title_df, self.abstracts_df = loaded_frames

    def load_data(self):
        """
        Reads the title review results and the abstracts from their CSVs.

        Returns:
        - DataFrame, DataFrame: Title review results first, abstracts second.
        """
        return pd.read_csv(self.title_review_path), pd.read_csv(self.abstracts_path)

    def filter_abstracts(self, column_name):
        """
        Selects the abstracts whose title review row equals 1 in `column_name`.

        The result is stored in `self.filtered_df`.

        Parameters:
        - column_name (str): Column of the title review results used as the pass/fail flag.
        """
        # Boolean flag per title row: True where the screen value is exactly 1.
        passed_screen = self.title_df[column_name] == 1
        # Row labels of the passing titles, reused as positions into the abstracts.
        passing_rows = self.title_df.loc[passed_screen].index
        self.filtered_df = self.abstracts_df.iloc[passing_rows]

    def save_filtered_data(self, output_path):
        """
        Writes the selected abstracts to a CSV, without the index column.

        Parameters:
        - output_path (str): Destination path; '.csv' is appended when it is missing.
        """
        destination = output_path if output_path.endswith('.csv') else output_path + '.csv'
        self.filtered_df.to_csv(destination, index=False)

    def get_filtered_dataframe(self):
        """
        Returns the abstracts selected by the last call to `filter_abstracts`.

        Returns:
        - DataFrame: The selected abstracts.
        """
        return self.filtered_df


In [None]:
# Inputs and output for the title-screen filter below.
# NOTE(review): these are machine-specific absolute paths — edit them before running elsewhere.
# CSV of title review results; must contain the column named in `column_name_to_filter`.
title_review_path = "/Users/cu135/Library/CloudStorage/OneDrive-Personal/OneDrive_Documents/Research/2023/lnm_brain_death/systematic_review/raw/review_results.csv"
# CSV of abstracts; rows are matched to the title review rows by position.
abstracts_path = "/Users/cu135/Library/CloudStorage/OneDrive-Personal/OneDrive_Documents/Research/2023/lnm_brain_death/systematic_review/raw/separated_abstracts.csv"
# Title review column used as the pass flag: rows where it equals 1 are kept.
column_name_to_filter = "Passes Title Screen (Very Sensitive)"
# Where the abstracts that passed the title screen are written.
output_path_for_filtered_data = "/Users/cu135/Library/CloudStorage/OneDrive-Personal/OneDrive_Documents/Research/2023/lnm_brain_death/systematic_review/sensitive_masked_abstracts.csv"

In [None]:
import pandas as pd
# Load the title review results and the abstracts from the configured CSV paths
title_review_filter = TitleReviewFilter(title_review_path, abstracts_path)

# Keep only the abstracts whose title review row has a 1 in the chosen column
title_review_filter.filter_abstracts(column_name_to_filter)

# Write the kept abstracts to CSV; the evaluation step further down reads this same file
title_review_filter.save_filtered_data(output_path_for_filtered_data)

# Display the kept abstracts as the cell output
filtered_df = title_review_filter.get_filtered_dataframe()
filtered_df

4 - Evaluate Abstracts That Passed Title Screen

In [None]:
from tqdm import tqdm
import time 
import pandas as pd
class AbstractEvaluatorDocumented:
    """
    A class to evaluate abstracts from a CSV using the OpenAI API based on a posed question.

    Attributes:
    - api_key (str): OpenAI API key.
    - organization_id (str or None): Optional organization id sent with each request.
    - df (DataFrame): DataFrame containing the abstracts to be evaluated.

    Methods:
    - read_api_key: Reads the OpenAI API key from a file.
    - evaluate_with_openai: Evaluates an abstract based on the posed question using the OpenAI API.
    - evaluate_abstracts: Evaluates all abstracts in the DataFrame based on the posed question.
    - to_csv: Saves the updated DataFrame with the evaluation results to a CSV file.
    - get_dataframe: Returns the updated DataFrame with the evaluation results.
    """

    def __init__(self, api_key_path, csv_path, organization_id=None):
        """
        Initializes the evaluator: reads the API key, sets it on the `openai` module and loads the abstracts CSV.

        Parameters:
        - api_key_path (str): Path to the file containing the OpenAI API key.
        - csv_path (str): Path to the CSV containing the abstracts.
        - organization_id (str, optional): Sent as the 'OpenAI-Organization' header when given.
        """
        self.api_key = self.read_api_key(api_key_path)
        openai.api_key = self.api_key
        self.organization_id = organization_id
        self.df = pd.read_csv(csv_path)

    def read_api_key(self, file_path):
        """
        Reads the OpenAI API key from a file.

        Parameters:
        - file_path (str): Path to the file containing the OpenAI API key.

        Returns:
        - str: The first line of the file with surrounding whitespace removed.
        """
        with open(file_path, 'r') as file:
            return file.readline().strip()

    @staticmethod
    def _parse_decision(text):
        """
        Extracts the binary decision from the completion text.

        The first '0' or '1' character in the text is the decision. Scanning for
        the first such character (rather than testing whether a '1' occurs
        anywhere) keeps a completion such as "0\\n\\nAbstract: 1 ..." from being
        read as a Yes.

        Parameters:
        - text (str): Raw completion text.

        Returns:
        - int: 0 or 1; 0 when the text contains neither character.
        """
        for char in text:
            if char in "01":
                return int(char)
        return 0

    def evaluate_with_openai(self, abstract, question):
        """
        Evaluates an abstract based on the posed question using the OpenAI API.

        The request is attempted up to 3 times, pausing 2 seconds between
        attempts. Errors that a retry cannot fix (invalid request,
        authentication, permission) are raised immediately.

        Parameters:
        - abstract (str): The abstract to be evaluated.
        - question (str): The posed question for evaluation.

        Returns:
        - int: 1 for Yes, 0 for No, or -1 when the API rejects the prompt for
          exceeding the model's maximum context length.

        Raises:
        - openai.error.OpenAIError: Non-retryable API errors, or any API error on the last attempt.
        - Exception: Any other error still occurring on the last attempt.
        """
        headers = {}
        if self.organization_id:
            headers['OpenAI-Organization'] = self.organization_id

        prompt = f"Abstract: {abstract}\n{question}\n\nResponse (0 for No, 1 for Yes):"

        # Error classes for which repeating the identical request is pointless.
        # Looked up defensively so a missing class name is simply skipped.
        non_retryable = tuple(
            error_class
            for error_class in (
                getattr(openai.error, name, None)
                for name in ("InvalidRequestError", "AuthenticationError", "PermissionError")
            )
            if error_class is not None
        )

        retries = 3
        for attempt in range(retries):
            is_last_attempt = attempt == retries - 1
            try:
                response = openai.Completion.create(
                                                    engine="davinci",
                                                    prompt=prompt,
                                                    max_tokens=10,
                                                    headers=headers
                                                )
                return self._parse_decision(response.choices[0].text)
            except openai.error.OpenAIError as e:
                # Prompt too long for the model: flag the abstract instead of failing the whole run.
                if "maximum context length" in str(e):
                    return -1
                if isinstance(e, non_retryable) or is_last_attempt:
                    raise
            except Exception:
                if is_last_attempt:
                    raise
            # Only reached after a retryable failure: wait before trying again.
            time.sleep(2)

    def evaluate_abstracts(self, question):
        """
        Evaluates all abstracts in the DataFrame based on the posed question.

        Stores one result per row in a new "Evaluation_Result" column
        (1, 0, or -1 for abstracts that exceeded the context length).

        Parameters:
        - question (str): The posed question for evaluation.
        """
        # Registers `progress_apply` on pandas objects so the loop shows a progress bar.
        tqdm.pandas()
        self.df["Evaluation_Result"] = self.df["Abstract"].progress_apply(lambda abstract: self.evaluate_with_openai(abstract, question))
        # NOTE(review): this pause runs once, after every abstract has been processed;
        # if per-request throttling was intended it would have to happen per abstract.
        time.sleep(0.1)

    def to_csv(self, output_path):
        """
        Saves the updated DataFrame with the evaluation results to a CSV file.

        Parameters:
        - output_path (str): Path to save the CSV file; '.csv' is appended when it is missing.
        """
        if not output_path.endswith('.csv'):
            output_path += '.csv'
        self.df.to_csv(output_path, index=False)

    def get_dataframe(self):
        """
        Returns the updated DataFrame with the evaluation results.

        Returns:
        - DataFrame: Updated DataFrame containing the evaluation results.
        """
        return self.df


In [None]:
# Inputs and output for the abstract evaluation below.
# NOTE(review): `abstracts_path` is rebound here; it now points at the file written by the
# title-screen filter (same path as `output_path_for_filtered_data`), not the raw abstracts CSV.
abstracts_path = '/Users/cu135/Library/CloudStorage/OneDrive-Personal/OneDrive_Documents/Research/2023/lnm_brain_death/systematic_review/sensitive_masked_abstracts.csv'
# Question appended to every abstract in the prompt; the model is asked to answer 0 (No) or 1 (Yes).
question_to_gpt = "Do you think this paper has at least one case of a focal lesion causing brain death?"
# Where the abstracts plus their "Evaluation_Result" column are written.
save_path = '/Users/cu135/Library/CloudStorage/OneDrive-Personal/OneDrive_Documents/Research/2023/lnm_brain_death/systematic_review/filtered/sensitive_abstract_results.csv'
# Sent as the 'OpenAI-Organization' request header.
# NOTE(review): account-specific identifier hard-coded in the notebook — consider loading it from the environment.
organization_id = 'org-Y2tKyCPFO6tIjtCtOVZ7c9tr'

In [None]:
# Run the evaluation: this cell is live and sends one OpenAI API request per abstract.
evaluator = AbstractEvaluatorDocumented(openai_key_path, abstracts_path, organization_id)
# Adds the "Evaluation_Result" column to the evaluator's DataFrame
evaluator.evaluate_abstracts(question_to_gpt)
results_df = evaluator.get_dataframe()
# Persist the results to `save_path`, then display them as the cell output
evaluator.to_csv(save_path)
results_df


Count Positive Abstracts

In [None]:
# Count only rows scored exactly 1. Abstracts that exceeded the model's context
# length are stored as -1, so a plain sum would subtract them from the total.
print(f'Found {(results_df["Evaluation_Result"] == 1).sum()} positive abstracts.')

Save the Output

In [None]:
# Write the evaluated abstracts (including the "Evaluation_Result" column) to a second CSV.
# NOTE(review): machine-specific absolute path.
evaluator.to_csv("/Users/cu135/Library/CloudStorage/OneDrive-Personal/OneDrive_Documents/Research/2023/lnm_brain_death/systematic_review/screened_abstracts.csv")