## Mining the synthetic dataset

In [None]:
import pandas as pd
import numpy as np
import json
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer
import random
import faiss

In [None]:
# Testing the sentiment analysis on the emotions dataset
# Smoke test: score one randomly chosen journal entry with VADER and print
# its compound sentiment score.
analyzer = SentimentIntensityAnalyzer()

with open('Emotions_dataset.csv') as file:
    data = file.read().split('\n')
# Randomly select a journal entry.
# Sample only from real data rows: skip the header (first line) and any line
# that does not split into the three expected fields (e.g. the empty string
# left after a trailing newline). random.choice also avoids the off-by-one of
# random.randint(0, len(data)), whose upper bound is inclusive.
candidates = [row for row in data[1:] if len(row.split(',', 2)) == 3]
line = random.choice(candidates)
print(line)
# At most two splits, so commas inside the journal text stay in the third field.
line = line.split(',', 2)
jnl_entry = line[2]
sentiment = analyzer.polarity_scores(jnl_entry)
print(sentiment['compound'])


In [None]:
# Testing the embedding model
# Loads the all-MiniLM-L6-v2 sentence-transformer and embeds `jnl_entry`, the
# journal text sampled by the sentiment-analysis cell above, so that cell must
# have been run first.
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embedding = emb_model.encode(jnl_entry)
# Print the embedding's shape as a sanity check; the FAISS cells later in this
# notebook build their index with 384 dimensions.
print(embedding.shape)

# Creating JSON objects from the dataset

In [11]:
# Converts Emotions_dataset.csv into Emotions_dataset.json: one record per CSV
# row holding its id, emotion label, journal text, VADER compound sentiment
# score and sentence embedding.

# Reading the data from the csv file.
with open('Emotions_dataset.csv', 'r') as file:
    data = file.read().split('\n')
# Removing the header from the data.
data = data[1:]
snt_analyzer = SentimentIntensityAnalyzer()
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Splitting the data into the respective columns (id, emotion, journal text).
# At most two splits, so commas inside the journal text stay in the third
# field. Rows that do not yield all three fields (blank or truncated lines)
# are skipped; the previous `len(line) < 2` guard let two-field rows through
# and then failed on line[2].
# NOTE(review): quoted CSV fields are not unquoted by this split - confirm the
# dataset does not rely on CSV quoting.
rows = [
    fields
    for fields in (line.split(',', 2) for line in data)
    if len(fields) == 3
]

# Embedding all journal entries in one call lets the model batch the work
# instead of running one forward pass per entry.
texts = [fields[2] for fields in rows]
embeddings = emb_model.encode(texts) if texts else []

# Storing the rows in a list of dictionaries.
entries = []
for (entry_id, emotion, jnl_entry), embedding in zip(rows, embeddings):
    # Getting the sentiment score of the journal entry.
    sentiment = snt_analyzer.polarity_scores(jnl_entry)
    entries.append({
        'id': "jnl_" + entry_id,
        'emotion': emotion,
        'journal_entry': jnl_entry,
        'sentiment_score': sentiment['compound'],
        # ndarray rows are not JSON-serialisable; convert to plain lists.
        'embedding': embedding.tolist(),
    })

# Writing the data to a json file.
with open('Emotions_dataset.json', 'w') as file:
    json.dump(entries, file, indent=4)

# Storing the embedded data in the vector database

In [17]:
# Testing the vector data storage
# Smoke test: add the first stored embedding to a fresh FAISS index.
# The index dimension must equal the length of each stored embedding.
DIMENSIONS = 384
# Creating an instance of the faiss index.
index = faiss.IndexFlatL2(DIMENSIONS)
with open('Emotions_dataset.json', 'r') as file:
    data = json.load(file)
line = data[0]
embedding = line['embedding']
# JSON floats become float64 in NumPy by default, but FAISS indexes operate on
# float32 matrices of shape (n, d), so the dtype is set explicitly.
index.add(np.array([embedding], dtype='float32'))

In [2]:
# Builds the FAISS index over every stored journal-entry embedding.
# The constant is spelled DIMENSIONS here (it was assigned as `Dimensions` but
# read as `DIMENSIONS`), so this cell no longer depends on another cell having
# defined it.
DIMENSIONS = 384
# Creating an instance of the faiss index.
index = faiss.IndexFlatL2(DIMENSIONS)
with open('Emotions_dataset.json', 'r') as file:
    jnl_entries = json.load(file)
# Use the entries just loaded (not a leftover `data` variable from another
# cell) so that row i of the index always corresponds to jnl_entries[i].
# FAISS operates on float32; the reshape keeps the matrix two-dimensional even
# when there are no entries and fails loudly if an embedding has the wrong size.
embeddings = np.array(
    [entry['embedding'] for entry in jnl_entries], dtype='float32'
).reshape(-1, DIMENSIONS)
index.add(embeddings)
def search(query, index, emb_model, jnl_entries, k=5):
    """Return up to ``k`` journal entries most similar to the user's query.

    Args:
        query: Text to search for.
        index: Object exposing ``search(vectors, k)`` that returns a
            ``(distances, indices)`` pair, e.g. a FAISS index whose rows are
            in the same order as ``jnl_entries``.
        emb_model: Object exposing ``encode(text)`` that returns the query
            embedding as a 1-D sequence of floats.
        jnl_entries: Sequence of dicts with ``'journal_entry'`` and
            ``'emotion'`` keys.
        k: Maximum number of neighbours to return.

    Returns:
        A list of ``{'journal_entry': ..., 'emotion': ...}`` dicts in the
        order the index returned them. It holds fewer than ``k`` items when
        the index reports fewer matches.
    """
    # The index expects a float32 matrix of shape (n_queries, dim).
    query_embedding = np.asarray(
        emb_model.encode(query), dtype='float32'
    ).reshape(1, -1)
    _, indices = index.search(query_embedding, k)
    # FAISS pads missing neighbours with -1 (e.g. when the index holds fewer
    # than k vectors); skip them so -1 is not treated as "last entry".
    return [
        {
            'journal_entry': jnl_entries[i]['journal_entry'],
            'emotion': jnl_entries[i]['emotion'],
        }
        for i in indices[0]
        if i >= 0
    ]

NameError: name 'faiss' is not defined

In [None]:
# Demo: embed a sample query and print the emotion label of each of its
# nearest journal entries, one per line.
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
user_query = "I'm feeling really stressed with my workload."
search_results = search(user_query, index, emb_model, jnl_entries)
for emotion in (hit['emotion'] for hit in search_results):
    print(emotion)
