In [None]:
#68%

In [1]:
import os
import pdfplumber
import pandas as pd
from transformers import pipeline

# Initialize question-answering pipeline
nlp = pipeline("question-answering")

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    text = ""
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Function to get top 3 disease-related keywords
def get_top_disease_keywords(text):
    questions = [
        "What are the main diseases mentioned in the text?",
        "What are the top diseases discussed?",
        "Which diseases are most important in this document?",
    ]

    answers = [nlp(question=question, context=text) for question in questions]
    keywords = [answer["answer"] for answer in answers]

    return ", ".join(keywords)

# Process PDF files and create a data frame
data = []

pdf_folder = "/Users/amin/Desktop/33/test"
for file in os.listdir(pdf_folder):
    if file.endswith(".pdf"):
        pdf_file = os.path.join(pdf_folder, file)
        file_name_without_ext = os.path.splitext(file)[0]
        text = extract_text_from_pdf(pdf_file)
        keywords = get_top_disease_keywords(text)
        data.append({"File": file_name_without_ext, "Issue_KW": keywords})

df = pd.DataFrame(data)

# Save the data frame to a CSV file
df.to_csv("Issue_KW.csv", index=False)


No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [3]:
df.to_csv('df7.csv', index=False)
df

Unnamed: 0,File,Issue_KW
0,Zehna Corporate Deck SymBiosis,"MDCK kidney epithelial cells, Chronic\nKidney ..."
1,PS_TCRR_NOTE_1.5.23,"mesothelioma, Interest and Other Income, ovari..."
2,Grace Sciece Platform (9.2.20),"Cancer\nViral infections, rat heart, Cancer\nV..."
3,AcuamarkDx Series A_Corp Deck,"cancers, cancers, cancers"
4,coding.bio_deck,"biopharma, biopharma, biopharma"
5,Centurion_BioPharma-Non-Confidential_2021_2,"allergy, inflammation, immunology, allergy, in..."
6,JPM_SRZN_NOTE_11.28.22,"tissue damage mediated\ndisorders, tissue dama..."
7,G_ACET_IOC_4.8.21,"increase in liver transaminase,\nand eosinophi..."
8,C_ARQT_NOTE_11.15.22,"WI-NRS and EASI-75, body IGA success\nCOWEN, c..."
9,B_RXRX_IOC_9.21.21,"pediatric rare genetic disease and\noncology, ..."


In [4]:
import configparser
import pandas as pd
import openai
import re

def load_openai_api_key():
    config = configparser.ConfigParser()
    config.read("config.ini")
    return config["openai"]["api_key"]

def find_matching_area(answer):
    areas = [
        "Vaccines or infectious",
        "Gastrointestinal or metabolism",
        "Neurological",
        "Cancer or tumors or oncology",
        "Dermatological",
        "Organ health",
        "Mental",
        "Other",
    ]
    for area in areas:
        if re.search(area.lower(), answer.lower()):
            return area
    return "Other"

def get_related_area(keywords):
    openai.api_key = load_openai_api_key()
    prompt = f"Which of the following areas is more related to the keywords: {keywords}?\n\nAreas:\n" \
             "Vaccines or infectious\n" \
             "Gastrointestinal or metabolism\n" \
             "Neurological\n" \
             "Cancer or tumors or oncology\n" \
             "Dermatological\n" \
             "Organ health\n" \
             "Mental\n" \
             "Other\n\nAnswer: "

    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=10,
        n=1,
        stop=None,
        temperature=0.5,
    )

    answer = response.choices[0].text.strip()
    matching_area = find_matching_area(answer)
    return matching_area

def main():
    # Load your DataFrame
    df = pd.read_csv("df7.csv")

    # Add a new column 'Issue'
    df["Issue"] = df["Issue_KW"].apply(get_related_area)

    # Drop the 'KW' column
    df = df.drop(columns=["Issue_KW"])
    print(df)

    # Save the DataFrame to a new CSV file
    df.to_csv("issue7.csv", index=False)

if __name__ == "__main__":
    main()


                                                 File  \
0                      Zehna Corporate Deck SymBiosis   
1                                 PS_TCRR_NOTE_1.5.23   
2                      Grace Sciece Platform (9.2.20)   
3                       AcuamarkDx Series A_Corp Deck   
4                                     coding.bio_deck   
5         Centurion_BioPharma-Non-Confidential_2021_2   
6                              JPM_SRZN_NOTE_11.28.22   
7                                   G_ACET_IOC_4.8.21   
8                                C_ARQT_NOTE_11.15.22   
9                                  B_RXRX_IOC_9.21.21   
10               Dianomi Non-Confidential Slides - v3   
11                           KBC_MAAT.FR_IOC_12.14.21   
12                               Engrail Therapeutics   
13                                 WB_VOR_IOC_1.25.22   
14                                 E_FNCH_IOC_8.10.21   
15                                CG_GRNA_IOC_12.8.22   
16                             