## Mining the synthetic dataset

In [3]:
import pandas as pd
import numpy as np
import json
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer
import random
import faiss

In [None]:
# Testing the sentiment analysis on the emotions dataset
analyzer = SentimentIntensityAnalyzer()

with open('Emotions_dataset.csv') as file:
    data = file.read().split('\n')
# Randomly select a journal entry.
# Only rows with at least three comma-separated fields can be sampled: this
# skips the header row (data[0]) and any blank trailing line, both of which
# would otherwise make `line[2]` below fail.  random.choice also avoids the
# off-by-one of random.randint(0, len(data)), whose upper bound is inclusive.
sample_pool = [row for row in data[1:] if row.count(',') >= 2]
line = random.choice(sample_pool)
print(line)
# Split into at most three fields so commas inside the journal text are kept.
line = line.split(',', 2)
jnl_entry = line[2]
# VADER returns a dict of scores; 'compound' is the single summary value.
sentiment = analyzer.polarity_scores(jnl_entry)
print(sentiment['compound'])


In [None]:
# Testing the embedding model
# Loads the sentence-transformers MiniLM checkpoint and embeds `jnl_entry`,
# the journal text sampled by the sentiment test cell, so that cell must have
# been run first.
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embedding = emb_model.encode(jnl_entry)
# Shape check: presumably (384,), matching the DIMENSIONS constant used for
# the FAISS index later in the notebook -- confirm against the printed output.
print(embedding.shape)

# Creating JSON objects from the dataset

In [None]:
# Reading the data from the csv file.
with open('Emotions_dataset.csv', 'r') as file:
    data = file.read().split('\n')
# Removing the header from the data.
data = data[1:]
snt_analyzer = SentimentIntensityAnalyzer()
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Splitting the data into the respective columns and storing them in a list of dictionaries.
# Each row is expected to look like `<id>,<emotion>,<journal text>`; splitting
# at most twice keeps commas inside the journal text intact.
entries = []
for line in data:
    line = line.split(',', 2)
    # Skip blank or malformed rows.  All three fields are required: the
    # previous `len(line) < 2` guard let two-field rows through and then
    # crashed on `line[2]`.
    if len(line) < 3:
        continue
    jnl_entry = line[2]
    # Getting the sentiment score of the journal entry.
    sentiment = snt_analyzer.polarity_scores(jnl_entry)
    # Embedding the journal entry.
    embedding = emb_model.encode(jnl_entry)
    entry = {
        'id' : "jnl_" + line[0],
        'emotion' : line[1],
        'journal_entry' : jnl_entry,
        'sentiment_score' : sentiment['compound'],
        # Converted to a plain list so the vector is JSON-serialisable.
        'embedding' : embedding.tolist()
    }
    entries.append(entry)
# Writing the data to a json file.
with open('Emotions_dataset.json', 'w') as file:
    json.dump(entries, file, indent=4)

# Storing the embedded data in the vector database

In [4]:
# Testing the vector data storage
# Dimensionality of the stored embeddings; must match the vectors in the JSON file.
DIMENSIONS = 384
# Creating an instance of the faiss index.
index = faiss.IndexFlatL2(DIMENSIONS)
with open('Emotions_dataset.json', 'r') as file:
    data = json.load(file)
# Add only the first entry's vector as a smoke test.
line = data[0]
embedding = line['embedding']
# FAISS operates on float32 matrices of shape (n, d).  The JSON round-trip
# yields Python floats, which NumPy would otherwise turn into float64.
index.add(np.array([embedding], dtype='float32'))

In [9]:
# Dimensionality of the stored embeddings; must match the vectors in the JSON file.
DIMENSIONS = 384
# Creating an instance of the faiss index.
index = faiss.IndexFlatL2(DIMENSIONS)
with open('Emotions_dataset.json', 'r') as file:
    jnl_entries = json.load(file)
# Row i of `embeddings` belongs to jnl_entries[i]; `search` relies on this
# alignment to map FAISS result indices back to journal entries.
# FAISS operates on float32, so the dtype is set explicitly instead of letting
# NumPy default to float64 for the Python floats loaded from JSON.
embeddings = np.array([entry['embedding'] for entry in jnl_entries], dtype='float32')
index.add(embeddings)
def search(query, index, emb_model, jnl_entries, k=10):
    '''Return the journal entries most similar to the user's query.

    Args:
        query: Free-text query to embed and look up.
        index: Vector index exposing ``search(vectors, k)`` and returning
            ``(distances, indices)``, e.g. a FAISS index.
        emb_model: Embedding model exposing ``encode(text)``.
        jnl_entries: List of entry dicts, ordered like the vectors in ``index``,
            each with 'journal_entry', 'emotion' and 'sentiment_score' keys.
        k: Maximum number of neighbours to retrieve.

    Returns:
        A ``(results, distances)`` tuple.  ``results`` is a list of dicts with
        the 'journal_entry', 'emotion' and 'sentiment_score' of each hit, in
        the order returned by the index.  ``distances`` is the raw distance
        array returned by ``index.search``; when fewer than ``k`` neighbours
        exist it still has ``k`` columns, and only the first ``len(results)``
        of them correspond to real hits.
    '''
    # The index expects a 2-D float32 matrix with one row per query.
    query_embedding = np.asarray(emb_model.encode(query), dtype='float32').reshape(1, -1)
    distances, indices = index.search(query_embedding, k)
    # FAISS pads missing neighbours with index -1.  Without the filter,
    # jnl_entries[-1] would silently return the last entry as a bogus hit.
    results = [
        {
            'journal_entry': jnl_entries[i]['journal_entry'],
            'emotion': jnl_entries[i]['emotion'],
            'sentiment_score': jnl_entries[i]['sentiment_score'],
        }
        for i in indices[0]
        if i >= 0
    ]
    return results, distances

In [24]:
# Load the embedding model, then look up the stored journal entries that are
# closest to a sample query (uses `search`, `index` and `jnl_entries` from the
# index-building cell).
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
user_query = "Because ridiculously attractive people also have self esteem issues and depression Guess what honey The world sucks for everyone There’s no escape We will all die in misery and alone"
search_results, distances = search(
    query=user_query,
    index=index,
    emb_model=emb_model,
    jnl_entries=jnl_entries,
)

# Using the index and the retrieved context to generate information

In [1]:
%pip install langchain_community tiktoken langchain-openai langchainhub langchain

Note: you may need to restart the kernel to use updated packages.Collecting langchain_community
  Downloading langchain_community-0.3.16-py3-none-any.whl.metadata (2.9 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.3.4-py3-none-any.whl.metadata (2.3 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting langchain
  Downloading langchain-0.3.17-py3-none-any.whl.metadata (7.1 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain_community)
  Downloading aiohttp-3.11.12-cp311-cp311-win_amd64.whl.metadata (8.0 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-core<0.4.0,>=0.3.32 (from langch

In [14]:
%pip install langchain_core

Note: you may need to restart the kernel to use updated packages.


In [18]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [32]:
# Testing the generation of the prompt
# List of emotions to choose from
EMOTIONAL_STATES = ["Hopeful", "Anxious", "Inspired", "Overwhelmed", "Peaceful", "Frustrated", "Curious", "Uncertain", "Hopelessness"]
# Prompt template
template = """Give the most relevant emotion to the following journal entry based on the sentiment score and the mapped emotions from the given context.
context: {context}
journal entry: {journal_entry}
Note: only choose from the following emotions: {EMOTIONAL_STATES}"""


def _extract_emotion(text, allowed):
    '''Pick the emotion label out of the model's free-text answer.

    A **bold** span that names one of the `allowed` emotions wins.  Otherwise
    the allowed emotion mentioned earliest in the text is used.  Matching is
    case-insensitive and the canonical spelling from `allowed` is returned.

    Raises:
        ValueError: if no allowed emotion appears in `text`.
    '''
    canonical = {name.lower(): name for name in allowed}
    # Splitting on "**" puts bold spans at the odd positions; the final piece
    # is dropped because it can only be trailing text or an unclosed marker.
    for span in text.split("**")[1:-1:2]:
        label = canonical.get(span.strip().lower())
        if label is not None:
            return label
    lowered = text.lower()
    mentions = [(lowered.find(key), name) for key, name in canonical.items() if key in lowered]
    if mentions:
        return min(mentions)[1]
    raise ValueError("No known emotion found in model response: " + repr(text))


# Add the template to the prompt
prompt = ChatPromptTemplate.from_template(template)
# temperature = 0 keeps the classification as repeatable as the API allows.
llm = ChatOpenAI(model_name = "gpt-4o-mini", temperature = 0)
# Combining the prompt and the language model
chain = prompt | llm
# `search_results` and `user_query` come from the retrieval cell above.
response = chain.invoke({'context': search_results, 'journal_entry': user_query, 'EMOTIONAL_STATES': EMOTIONAL_STATES})
# Extracting the emotion from the full response.  The previous
# `split("**", 2)[1]` raised IndexError whenever the model did not bold its
# answer, and never checked the answer against EMOTIONAL_STATES.
emotion = _extract_emotion(response.content, EMOTIONAL_STATES)
emotion

The most relevant emotion for the given journal entry is **Hopelessness**. The tone of the entry expresses a bleak outlook on life and relationships, indicating a sense of despair and resignation.


'Hopelessness'

In [None]:
# Improved version of prompt generation
# TODO: find a way to provide the context to the model without retrieving it separately