In [1]:
import re

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def read_conversations_from_file(file_path):
    """Load a UTF-8 text file and return its lines as a list of strings.

    Args:
        file_path: Path of the transcript file to read.

    Returns:
        A list with one string per line of the file; each string keeps its
        line terminator exactly as read.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        # Iterating a text handle yields the same lines as readlines().
        lines = list(handle)
    return lines

def process_text(text):
    """Strip a leading ``Patient:`` or ``Doctor:`` speaker label from a line.

    Only a label at the start of the text (leading whitespace is ignored) is
    removed, together with any whitespace that follows it. A label that
    appears later in the utterance is part of what the speaker said and is
    left untouched. Text without a leading label is returned unchanged.

    Args:
        text: One transcript line or a free-form utterance.

    Returns:
        The utterance without its leading speaker label.
    """
    stripped = text.lstrip()
    for label in ('Patient:', 'Doctor:'):
        if stripped.startswith(label):
            # lstrip() also covers "Label:text" and "Label:   text" spellings.
            return stripped[len(label):].lstrip()
    return text

def format_data(conversations):
    """Turn raw transcript lines into ``[patient_text, doctor_text]`` pairs.

    Lines are stripped of surrounding whitespace. A line starting with
    ``Patient:`` is remembered as the current patient utterance; each later
    line starting with ``Doctor:`` is paired with the most recently seen
    patient utterance. Doctor lines seen before any patient line, and lines
    with neither prefix, are ignored.

    Args:
        conversations: Iterable of transcript lines.

    Returns:
        A list of two-element lists ``[patient_text, doctor_text]`` with the
        speaker labels removed by ``process_text``.
    """
    pairs = []
    last_patient = None

    for raw_line in conversations:
        entry = raw_line.strip()

        if entry.startswith("Patient:"):
            last_patient = process_text(entry)
            continue

        # Skip anything that is not a doctor reply to a known patient line.
        if last_patient is None or not entry.startswith("Doctor:"):
            continue

        pairs.append([last_patient, process_text(entry)])

    return pairs

# Example usage: load the transcript, build the dialogue pairs, and preview
# the first three exchanges.
file_path = 'datasets/patient-doctor.txt'
conversations = read_conversations_from_file(file_path)
data = format_data(conversations)
for patient_line, doctor_line in data[:3]:
    print('patient:', patient_line)
    print('doctor:', doctor_line)
    print()

patient: Hello, Good morning doctor.
doctor: Good morning, how are you feeling today?

patient: I've been feeling quite anxious lately, it's been hard to relax.
doctor: I see. Can you tell me more about what might be causing this anxiety?

patient: I think it's mainly related to my job and the pressure I'm under.
doctor: Stress at work can definitely take a toll on our well-being. Tell me about your job and the specific challenges you're facing.



In [3]:
def preprocess_text(text):
    """Normalize text into a lowercase, lemmatized, space-separated string.

    Steps, in order:
      1. lowercase the text;
      2. replace runs of two or more dots with a single space;
      3. collapse every whitespace run to a single space;
      4. drop every character that is not an ASCII letter, digit, dot or
         whitespace;
      5. tokenize with NLTK's ``word_tokenize``;
      6. lemmatize each token with ``WordNetLemmatizer``;
      7. join the tokens with single spaces.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.

    Note:
        ``word_tokenize`` and ``WordNetLemmatizer`` need the corresponding
        NLTK data packages to be installed (see ``nltk.download``).
    """
    # Lowercase conversion
    text = text.lower()

    # Remove multiple dots (raw strings keep the regex escapes intact)
    text = re.sub(r'\.{2,}', ' ', text)

    # Replace multiple whitespaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # Removing extra characters
    text = re.sub(r'[^A-Za-z0-9.\s]', '', text)

    # Tokenization and per-token lemmatization. word_tokenize returns a plain
    # list, so each token is lemmatized in a comprehension.
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in word_tokenize(text)]

    # Rejoin tokens into a single string
    return ' '.join(tokens)

def find_best_match(question):
    """Return the doctor reply paired with the most similar patient utterance.

    Every pair in the module-level ``data`` list is scored by comparing
    ``question`` with the pair's patient text via ``similarity_score``. The
    first pair with the strictly highest score above zero wins.

    Args:
        question: The user's message.

    Returns:
        The winning pair's doctor text, or a fixed fallback sentence when no
        pair scores above zero.
    """
    winner = None
    winner_score = 0

    for candidate in data:
        score = similarity_score(question, candidate[0])
        if not score > winner_score:
            continue
        winner, winner_score = candidate, score

    if not winner:
        return "Sorry, I hope it gets better."
    return winner[1]


def similarity_score(question1, question2):
    """Return the cosine similarity of two texts' bag-of-words count vectors.

    Both texts are passed through ``process_text`` and then vectorized
    together with a default ``CountVectorizer``, so they share one vocabulary.

    Args:
        question1: First text.
        question2: Second text.

    Returns:
        The cosine similarity of the two count vectors, or ``0.0`` when
        neither text yields a token.
    """
    question1 = process_text(question1)
    question2 = process_text(question2)
    try:
        counts = CountVectorizer().fit_transform([question1, question2])
    except ValueError:
        # CountVectorizer raises on an empty vocabulary, e.g. when both texts
        # are empty or punctuation only. Treat that as "nothing in common"
        # instead of aborting the caller's loop.
        return 0.0
    vectors = counts.toarray()
    similarity = cosine_similarity(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1))
    return similarity[0][0]

In [4]:
# Interactive chat: keep answering until the user submits an empty line.
while True:
    inp = input("User:")
    if inp == '':
        break
    print("Bot:", find_best_match(inp))

User:Hello
Bot: Good day! How can I assist you today?
User:I am feeling depressed.
Bot: I'm here to assist you. Please share your thoughts.
User:What can I do to get rid of this anxiety
Bot: Healing is possible, and I'm here to guide you towards a brighter and more fulfilling future.
User:okay, thanks.
Bot: It's alright. We'll figure it out together. Please open up.
User:I am feeling sad
Bot: Depression can be challenging, but remember that it's treatable. We'll work on strategies to lift your mood and find joy again.
User:
