In [1]:
import os
import pdfplumber
import pandas as pd
from transformers import pipeline

# Set the path to the folder containing the PDF files
pdf_folder = '/Users/amin/Desktop/33/test'

# Initialize the OpenAI GPT-based question-answering pipeline
question_answering = pipeline("question-answering", model="deepset/roberta-base-squad2", tokenizer="deepset/roberta-base-squad2")

def get_chunks(text, max_tokens=2048):
    tokens = text.split()
    chunks = []
    current_chunk = []

    for token in tokens:
        if len(current_chunk) + len(token) < max_tokens:
            current_chunk.append(token)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [token]

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def find_disease_keywords(chunks, question_answering_pipeline):
    question = "What are the 3 main disease-related keywords?"
    best_score = 0
    best_answer = None

    for chunk in chunks:
        answer = question_answering_pipeline(question=question, context=chunk)
        if answer["score"] > best_score:
            best_score = answer["score"]
            best_answer = answer["answer"]

    return best_answer

results = []

for pdf_file in os.listdir(pdf_folder):
    if pdf_file.endswith(".pdf"):
        with pdfplumber.open(os.path.join(pdf_folder, pdf_file)) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text()

            chunks = get_chunks(text)
            disease_keywords = find_disease_keywords(chunks, question_answering)
            results.append({"File": pdf_file[:-4], "Issue_KW": disease_keywords})

df = pd.DataFrame(results)
df.to_csv("Issue_KW.csv", index=False)


In [4]:
df.to_csv('df6.csv', index=False)
df

Unnamed: 0,File,Issue_KW
0,Zehna Corporate Deck SymBiosis,"saline, FMC, ZTx101"
1,PS_TCRR_NOTE_1.5.23,Accelerating next-gen TC-510 and TC-520
2,Grace Sciece Platform (9.2.20),Parkinson’s (1MM patients in U.S.)
3,AcuamarkDx Series A_Corp Deck,Proprietary and Confidential
4,coding.bio_deck,Distance T cell to epitope ● Signaling domains...
5,Centurion_BioPharma-Non-Confidential_2021_2,ology and diagnostics
6,JPM_SRZN_NOTE_11.28.22,2
7,G_ACET_IOC_4.8.21,HLA-independence overcoming loss of HLA
8,C_ARQT_NOTE_11.15.22,Underperform
9,B_RXRX_IOC_9.21.21,Batten disease and CMT2A


In [5]:
import configparser
import pandas as pd
import openai
import re

def load_openai_api_key():
    config = configparser.ConfigParser()
    config.read("config.ini")
    return config["openai"]["api_key"]

def find_matching_area(answer):
    areas = [
        "Vaccines or infectious",
        "Gastrointestinal or metabolism",
        "Neurological",
        "Cancer or tumors or oncology",
        "Dermatological",
        "Organ health",
        "Mental",
        "Other",
    ]
    for area in areas:
        if re.search(area.lower(), answer.lower()):
            return area
    return "Other"

def get_related_area(keywords):
    openai.api_key = load_openai_api_key()
    prompt = f"Which of the following areas is more related to the keywords: {keywords}?\n\nAreas:\n" \
             "Vaccines or infectious\n" \
             "Gastrointestinal or metabolism\n" \
             "Neurological\n" \
             "Cancer or tumors or oncology\n" \
             "Dermatological\n" \
             "Organ health\n" \
             "Mental\n" \
             "Other\n\nAnswer: "

    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=10,
        n=1,
        stop=None,
        temperature=0.5,
    )

    answer = response.choices[0].text.strip()
    matching_area = find_matching_area(answer)
    return matching_area

def main():
    # Load your DataFrame
    df = pd.read_csv("df6.csv")

    # Add a new column 'Issue'
    df["Issue"] = df["Issue_KW"].apply(get_related_area)

    # Drop the 'KW' column
    df = df.drop(columns=["Issue_KW"])
    print(df)

    # Save the DataFrame to a new CSV file
    df.to_csv("issue6.csv", index=False)

if __name__ == "__main__":
    main()


                                                 File  \
0                      Zehna Corporate Deck SymBiosis   
1                                 PS_TCRR_NOTE_1.5.23   
2                      Grace Sciece Platform (9.2.20)   
3                       AcuamarkDx Series A_Corp Deck   
4                                     coding.bio_deck   
5         Centurion_BioPharma-Non-Confidential_2021_2   
6                              JPM_SRZN_NOTE_11.28.22   
7                                   G_ACET_IOC_4.8.21   
8                                C_ARQT_NOTE_11.15.22   
9                                  B_RXRX_IOC_9.21.21   
10               Dianomi Non-Confidential Slides - v3   
11                           KBC_MAAT.FR_IOC_12.14.21   
12                               Engrail Therapeutics   
13                                 WB_VOR_IOC_1.25.22   
14                                 E_FNCH_IOC_8.10.21   
15                                CG_GRNA_IOC_12.8.22   
16                             