In [1]:
import os
import pandas as pd

folder_path = "/Users/avyaahuja/doctor/dataset/"

category_mapping = {
    "RES": "respiratory",
    "MSK": "musculoskeletal",
    "CAR": "cardiac",
    "DER": "dermatological",
    "GAS": "gastrointestinal"
}

def preprocess_all_conversations(folder_path):
    data = []

    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):  
            prefix = filename.split('.')[0][:3] 
            category = category_mapping.get(prefix, "Unknown") 
            
            file_path = os.path.join(folder_path, filename)
            with open(file_path, "r", encoding='ISO-8859-1') as file: 
                conversations = file.readlines()

                doctor = ""
                patient = ""
                conversation_pairs = []

                for line in conversations:
                    line = line.strip()

                    if line.startswith("D:"):
                        doctor = line.replace("D:", "").strip()

                    elif line.startswith("P:"):
                        patient = line.replace("P:", "").strip()

                        if doctor and patient:
                            conversation_pairs.append({
                                "type": category,
                                "doctor": doctor,
                                "patient": patient
                            })
                            doctor = ""  
                            patient = ""  

                data.extend(conversation_pairs)

    return pd.DataFrame(data)

df = preprocess_all_conversations(folder_path)

df.to_csv("/Users/avyaahuja/doctor/processed.csv", index=False)
df.head()


Unnamed: 0,type,doctor,patient
0,respiratory,How may I help you?,"Hi, umm, so I've had a sore throat for the pas..."
1,respiratory,"Yeah, for sure. So you said that the sore thro...","Neither, it's been the same."
2,respiratory,"OK, is it painful to swallow food or liquids?","It's painful to swallow solids, yeah."
3,respiratory,"OK, uh, and how is your, umm, have you had any...","No, not really. No."
4,respiratory,OK. And have you noticed any like neck swelling?,No.


In [2]:
resp_df = df[df['type'] == 'respiratory']
musc_df = df[df['type'] == 'musculoskeletal']
cardiac_df = df[df['type'] == 'cardiac']
derm_df = df[df['type'] == 'dermatological']
gastro_df = df[df['type'] == 'gastrointestinal']

In [3]:
print(resp_df.shape,musc_df.shape,cardiac_df.shape,derm_df.shape,gastro_df.shape)

(10057, 3) (2478, 3) (228, 3) (40, 3) (237, 3)


In [4]:
resp_df.to_csv("/Users/avyaahuja/doctor/resp.csv", index=False)
musc_df.to_csv("/Users/avyaahuja/doctor/musc.csv", index=False)
cardiac_df.to_csv("/Users/avyaahuja/doctor/cardiac.csv", index=False)
derm_df.to_csv("/Users/avyaahuja/doctor/derm.csv", index=False)
gastro_df.to_csv("/Users/avyaahuja/doctor/gastro.csv", index=False)

In [4]:
# from langchain_community.document_loaders import CSVLoader

# loader = CSVLoader(file_path='processed.csv')
# docs = loader.load()


In [5]:
from langchain_community.document_loaders import CSVLoader

loader = CSVLoader(file_path='gastro.csv')
gastro_docs = loader.load()

In [6]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=20)
chunk_docs=text_splitter.split_documents(gastro_docs)
chunk_docs

[Document(metadata={'source': 'gastro.csv', 'row': 0}, page_content="type: gastrointestinal\ndoctor: What brings you in today?\npatient: I've just been feeling like very nauseated for, it feels like all the time right now."),
 Document(metadata={'source': 'gastro.csv', 'row': 1}, page_content="type: gastrointestinal\ndoctor: When did this start?\npatient: It's been over a week. Maybe not quite two weeks, but like around then. Yeah, maybe like 9 days."),
 Document(metadata={'source': 'gastro.csv', 'row': 2}, page_content="type: gastrointestinal\ndoctor: And um, is it, do you always have the sensation of nausea or is it related, or does it come and go?\npatient: Oh, I think it's like worse when I am smelling something really bad, and it's worse in the morning. But I feel like it's always kind of there."),
 Document(metadata={'source': 'gastro.csv', 'row': 3}, page_content="type: gastrointestinal\ndoctor: I see OK. Um, have you had any vomiting?\npatient: Uh, yeah, like um, yeah, like a l

In [10]:
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=20)
# chunk_docs=text_splitter.split_documents(docs)
# chunk_docs

[Document(metadata={'source': 'gastro.csv', 'row': 0}, page_content="type: gastrointestinal\ndoctor: What brings you in today?\npatient: I've just been feeling like very nauseated for, it feels like all the time right now."),
 Document(metadata={'source': 'gastro.csv', 'row': 1}, page_content="type: gastrointestinal\ndoctor: When did this start?\npatient: It's been over a week. Maybe not quite two weeks, but like around then. Yeah, maybe like 9 days."),
 Document(metadata={'source': 'gastro.csv', 'row': 2}, page_content="type: gastrointestinal\ndoctor: And um, is it, do you always have the sensation of nausea or is it related, or does it come and go?\npatient: Oh, I think it's like worse when I am smelling something really bad, and it's worse in the morning. But I feel like it's always kind of there."),
 Document(metadata={'source': 'gastro.csv', 'row': 3}, page_content="type: gastrointestinal\ndoctor: I see OK. Um, have you had any vomiting?\npatient: Uh, yeah, like um, yeah, like a l

In [8]:
gastro_df.shape

(237, 3)

In [10]:
# df.shape

(13078, 3)

In [10]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

db=FAISS.from_documents(gastro_docs,OllamaEmbeddings())
db

KeyboardInterrupt: 