In [None]:
#%51

In [1]:
import os
import pandas as pd
from transformers import pipeline
from pdfminer.high_level import extract_text

def read_pdf(file):
    text = extract_text(file)
    return text

def main_diseases(text):
    nlp = pipeline(
        "question-answering",
        model="distilbert-base-cased-distilled-squad",
        revision="626af31"
    )
    question = "What are the 3 main diseases this company solves?"
    answer = nlp(question=question, context=text)
    return answer['answer']

folder_path = '/Users/amin/Desktop/33/test'
data = []

for file in os.listdir(folder_path):
    if file.endswith('.pdf'):
        file_path = os.path.join(folder_path, file)
        text = read_pdf(file_path)
        diseases = main_diseases(text)
        keywords = diseases.split(', ')
        data.append({'File': file[:-4], 'KW': keywords})

df = pd.DataFrame(data)
df.to_csv('output.csv', index=False)
print(df)


                                                 File  \
0                      Zehna Corporate Deck SymBiosis   
1                                 PS_TCRR_NOTE_1.5.23   
2                      Grace Sciece Platform (9.2.20)   
3                       AcuamarkDx Series A_Corp Deck   
4                                     coding.bio_deck   
5         Centurion_BioPharma-Non-Confidential_2021_2   
6                              JPM_SRZN_NOTE_11.28.22   
7                                   G_ACET_IOC_4.8.21   
8                                C_ARQT_NOTE_11.15.22   
9                                  B_RXRX_IOC_9.21.21   
10               Dianomi Non-Confidential Slides - v3   
11                           KBC_MAAT.FR_IOC_12.14.21   
12                               Engrail Therapeutics   
13                                 WB_VOR_IOC_1.25.22   
14                                 E_FNCH_IOC_8.10.21   
15                                CG_GRNA_IOC_12.8.22   
16                             

In [2]:
df.to_csv('df15.csv', index=False)
df

Unnamed: 0,File,KW
0,Zehna Corporate Deck SymBiosis,[stitial fibrosis]
1,PS_TCRR_NOTE_1.5.23,[ovarian cancer]
2,Grace Sciece Platform (9.2.20),[neurodegenerative diseases]
3,AcuamarkDx Series A_Corp Deck,[cancers\n\nProprietary and Confidential - Not...
4,coding.bio_deck,[biopharma]
5,Centurion_BioPharma-Non-Confidential_2021_2,"[immunology, allergy, inflammation, and infect..."
6,JPM_SRZN_NOTE_11.28.22,[tissue damage mediated disorders]
7,G_ACET_IOC_4.8.21,[inflammation-induced cancer progression]
8,C_ARQT_NOTE_11.15.22,[challenges to ARQ-151 IP]
9,B_RXRX_IOC_9.21.21,[cancer vaccines and immunotherapies]


In [3]:
import configparser
import pandas as pd
import openai
import re

def load_openai_api_key():
    config = configparser.ConfigParser()
    config.read("config.ini")
    return config["openai"]["api_key"]

def find_matching_area(answer):
    areas = [
        "Vaccines or infectious",
        "Gastrointestinal or metabolism",
        "Neurological",
        "Cancer or tumors or oncology",
        "Dermatological",
        "Organ health",
        "Mental",
        "Other",
    ]
    for area in areas:
        if re.search(area.lower(), answer.lower()):
            return area
    return "Other"

def get_related_area(keywords):
    openai.api_key = load_openai_api_key()
    prompt = f"Which of the following areas is more related to the keywords: {keywords}?\n\nAreas:\n" \
             "Vaccines or infectious\n" \
             "Gastrointestinal or metabolism\n" \
             "Neurological\n" \
             "Cancer or tumors or oncology\n" \
             "Dermatological\n" \
             "Organ health\n" \
             "Mental\n" \
             "Other\n\nAnswer: "

    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=10,
        n=1,
        stop=None,
        temperature=0.5,
    )

    answer = response.choices[0].text.strip()
    matching_area = find_matching_area(answer)
    return matching_area

def main():
    # Load your DataFrame
    df = pd.read_csv("df15.csv")

    # Add a new column 'Issue'
    df["Issue"] = df["KW"].apply(get_related_area)

    # Drop the 'KW' column
    df = df.drop(columns=["KW"])
    print(df)

    # Save the DataFrame to a new CSV file
    df.to_csv("issue15.csv", index=False)

if __name__ == "__main__":
    main()


                                                 File  \
0                      Zehna Corporate Deck SymBiosis   
1                                 PS_TCRR_NOTE_1.5.23   
2                      Grace Sciece Platform (9.2.20)   
3                       AcuamarkDx Series A_Corp Deck   
4                                     coding.bio_deck   
5         Centurion_BioPharma-Non-Confidential_2021_2   
6                              JPM_SRZN_NOTE_11.28.22   
7                                   G_ACET_IOC_4.8.21   
8                                C_ARQT_NOTE_11.15.22   
9                                  B_RXRX_IOC_9.21.21   
10               Dianomi Non-Confidential Slides - v3   
11                           KBC_MAAT.FR_IOC_12.14.21   
12                               Engrail Therapeutics   
13                                 WB_VOR_IOC_1.25.22   
14                                 E_FNCH_IOC_8.10.21   
15                                CG_GRNA_IOC_12.8.22   
16                             