In [None]:
#68%

In [1]:
import os
import pdfplumber
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

def extract_pdf_text(file_path):
    with pdfplumber.open(file_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

def split_into_chunks(text, chunk_size):
    tokens = text.split()
    chunks = [' '.join(tokens[i:i+chunk_size]) for i in range(0, len(tokens), chunk_size)]
    return chunks

def find_diseases(chunks, nlp):
    diseases = []
    for chunk in chunks:
        answer = nlp(question="What is the main disease this text is about?", context=chunk)
        diseases.append(answer['answer'])
    return list(set(diseases))

def process_files(input_folder):
    nlp = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", revision="626af31")
    data = []

    for file_name in tqdm(os.listdir(input_folder)):
        if file_name.endswith(".pdf"):
            file_path = os.path.join(input_folder, file_name)
            text = extract_pdf_text(file_path)
            chunks = split_into_chunks(text, 300)
            diseases = find_diseases(chunks, nlp)
            data.append([file_name[:-4], ', '.join(diseases)])

    return data

def create_dataframe(data):
    df = pd.DataFrame(data, columns=['File', 'KW'])
    return df

def save_dataframe(df, output_file):
    df.to_csv(output_file, index=False)

input_folder = '/Users/amin/Desktop/33/test'
output_file = 'KW1.csv'

data = process_files(input_folder)
df = create_dataframe(data)
save_dataframe(df, output_file)


100%|██████████| 31/31 [26:05<00:00, 50.49s/it] 


In [6]:
print(df)
df.to_csv('df1.csv', index=False)

                                                 File  \
0                      Zehna Corporate Deck SymBiosis   
1                                 PS_TCRR_NOTE_1.5.23   
2                      Grace Sciece Platform (9.2.20)   
3                       AcuamarkDx Series A_Corp Deck   
4                                     coding.bio_deck   
5         Centurion_BioPharma-Non-Confidential_2021_2   
6                              JPM_SRZN_NOTE_11.28.22   
7                                   G_ACET_IOC_4.8.21   
8                                C_ARQT_NOTE_11.15.22   
9                                  B_RXRX_IOC_9.21.21   
10               Dianomi Non-Confidential Slides - v3   
11                           KBC_MAAT.FR_IOC_12.14.21   
12                               Engrail Therapeutics   
13                                 WB_VOR_IOC_1.25.22   
14                                 E_FNCH_IOC_8.10.21   
15                                CG_GRNA_IOC_12.8.22   
16                             

In [15]:
import configparser
import pandas as pd
import openai
import re

def load_openai_api_key():
    config = configparser.ConfigParser()
    config.read("config.ini")
    return config["openai"]["api_key"]

def find_matching_area(answer):
    areas = [
        "Vaccines or infectious",
        "Gastrointestinal or metabolism",
        "Neurological",
        "Cancer or tumors or oncology",
        "Dermatological",
        "Organ",
        "Mental",
        "Other",
    ]
    for area in areas:
        if re.search(area.lower(), answer.lower()):
            return area
    return "Other"

def get_related_area(keywords):
    openai.api_key = load_openai_api_key()
    prompt = f"Which of the following areas is more related to the keywords: {keywords}?\n\nAreas:\n" \
             "Vaccines or infectious\n" \
             "Gastrointestinal or metabolism\n" \
             "Neurological\n" \
             "Cancer or tumors or oncology\n" \
             "Dermatological\n" \
             "Organ Health\n" \
             "Mental\n" \
             "Other\n\nAnswer: "

    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=20,
        n=1,
        stop=None,
        temperature=0.5,
    )

    answer = response.choices[0].text.strip()
    matching_area = find_matching_area(answer)
    return matching_area

def main():
    # Load your DataFrame
    df = pd.read_csv("df1.csv")

    # Add a new column 'Issue'
    df["Issue"] = df["KW"].apply(get_related_area)

    # Drop the 'KW' column
    df = df.drop(columns=["KW"])
    print(df)

    # Save the DataFrame to a new CSV file
    df.to_csv("issue1.csv", index=False)

if __name__ == "__main__":
    main()


                                                 File  \
0                      Zehna Corporate Deck SymBiosis   
1                                 PS_TCRR_NOTE_1.5.23   
2                      Grace Sciece Platform (9.2.20)   
3                       AcuamarkDx Series A_Corp Deck   
4                                     coding.bio_deck   
5         Centurion_BioPharma-Non-Confidential_2021_2   
6                              JPM_SRZN_NOTE_11.28.22   
7                                   G_ACET_IOC_4.8.21   
8                                C_ARQT_NOTE_11.15.22   
9                                  B_RXRX_IOC_9.21.21   
10               Dianomi Non-Confidential Slides - v3   
11                           KBC_MAAT.FR_IOC_12.14.21   
12                               Engrail Therapeutics   
13                                 WB_VOR_IOC_1.25.22   
14                                 E_FNCH_IOC_8.10.21   
15                                CG_GRNA_IOC_12.8.22   
16                             