## Mining the synthetic dataset

In [None]:
import pandas as pd
import numpy as np
import json
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer
import random
import faiss

In [None]:
# Testing the sentiment analysis on the emotions dataset
analyzer = SentimentIntensityAnalyzer()

with open('Emotions_dataset.csv') as file:
    data = file.read().split('\n')
# Randomly select a journal entry
line = data[random.randint(0, len(data))]
print(line)
line = line.split(',', 2)
jnl_entry = line[2]
sentiment = analyzer.polarity_scores(jnl_entry)
print(sentiment['compound'])


In [None]:
# Testing the embedding model
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embedding = emb_model.encode(jnl_entry)
print(embedding.shape)

# Creating JSON objects from the dataset

In [None]:
# Reading the data from the csv file.
with open('Emotions_dataset.csv', 'r') as file:
    data = file.read().split('\n')
# Removing the header from the data.
data = data[1:]
snt_analyzer = SentimentIntensityAnalyzer()
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Splitting the data into the respective columns and storing them in a list of dictionaries.
entries = []
for line in data:
    line = line.split(',', 2)
    if not (len(line) < 2):
       jnl_entry = line[2]
       # Getting the sentiment score of the journal entry.
       sentiment = snt_analyzer.polarity_scores(jnl_entry)
       # Embedding the journal entry.
       embedding = emb_model.encode(jnl_entry)
       entry = {
           'id' : "jnl_" + line[0],
           'emotion' : line[1],
           'journal_entry' : line[2],
           'sentiment_score' : sentiment['compound'],
           'embedding' : embedding.tolist()
       }
       entries.append(entry)
# Writing the data to a json file.
with open('Emotions_dataset.json', 'w') as file:
    file.write(json.dumps(entries, indent=4))

# Storing the embedded data in the vector database

In [None]:
# Testing the vector data storage
DIMENSIONS = 384
# Creating an instance of the faiss index.
index = faiss.IndexFlatL2(DIMENSIONS)
with open('Emotions_dataset.json', 'r') as file:
    data = json.load(file)
line = data[0]
embedding = line['embedding']
index.add(np.array([embedding]))

In [None]:
DIMENSIONS = 384
# Creating an instance of the faiss index.
index = faiss.IndexFlatL2(DIMENSIONS)
with open('Emotions_dataset.json', 'r') as file:
    jnl_entries = json.load(file)
embeddings = np.array([entry['embedding'] for entry in jnl_entries])
index.add(embeddings)

def search(query, index, emb_model, jnl_entries, k=10):
    '''This function takes the user's query and returns the top k journal entries that are similar to the query.'''
    query_embedding = np.array(emb_model.encode(query)).reshape(1, -1)
    distances, indices = index.search(query_embedding, k)
    results = [{'journal_entry':jnl_entries[i]['journal_entry'], 'emotion': jnl_entries[i]['emotion'], 'sentiment_score':jnl_entries[i]['sentiment_score']} for i in indices[0]]
    return results, distances

In [None]:
user_query = "Because ridiculously attractive people also have self esteem issues and depression Guess what honey The world sucks for everyone There’s no escape We will all die in misery and alone"
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
search_results, distances  = search(user_query, index, emb_model, jnl_entries)

# using the index and the retrieved context to generate information

In [None]:
%pip install langchain_community tiktoken langchain-openai langchainhub langchain

In [None]:
%pip install langchain_core

In [None]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [None]:
# Testing the generation of the prompt
# List of emotions to choose from
EMOTIONAL_STATES = ["Hopeful", "Anxious", "Inspired", "Overwhelmed", "Peaceful", "Frustrated", "Curious", "Uncertain", "Hopelessness"]
# Prompt template
template = """Give the most relevant emotion to the following journal entry based on the sentiment score and the mapped emotions from the given context.
context: {context}
journal entry: {journal_entry}
Note: only choose from the following emotions and only output that emotion: {EMOTIONAL_STATES}"""
# Add the template to the prompt
prompt = ChatPromptTemplate.from_template(template)
llm = ChatOpenAI(model_name = "gpt-4o-mini", temperature = 0)
# Combining the prompt and the language model
chain = prompt | llm
response = chain.invoke({'context': search_results, 'journal_entry': user_query, 'EMOTIONAL_STATES': EMOTIONAL_STATES})
# Extracting the emotion from the full response
emotion = response.content
emotion

In [None]:
# Improved version of prompt generation
# TODO: find a way to provide the context to the model with out retrieving it separately
# DIMENSIONS = 384
# Creating an instance of the faiss index.
# index = faiss.IndexFlatL2(DIMENSIONS)
with open('Emotions_dataset.json', 'r') as file:
    jnl_entries = json.load(file)
documents = [Document(page_content=entry['journal_entry']) for entry in jnl_entries]
# embeddings = np.array([entry['embedding'] for entry in jnl_entries])
emb_model = HuggingFaceBgeEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
# index.add(embeddings)
vector_store = FAISS.from_documents(documents=documents, embedding=emb_model)
vector_store.save_local('emotions_vector_store')

In [None]:
loaded_vectors = FAISS.load_local('emotions_vector_store', embeddings=emb_model, allow_dangerous_deserialization=True)
retriever = loaded_vectors.as_retriever(k=10)
llm = ChatOpenAI(model_name = "gpt-4o-mini", temperature = 0)
rag_chain = (  
    prompt
    | llm
    | StrOutputParser()
)
rag_chain.invoke({'context': retriever, 'journal_entry': user_query, 'EMOTIONAL_STATES': EMOTIONAL_STATES})


### Only run the code above if absolutely necessary, and only do so selectively!!!

# Cleaning and preprocessing the new dataset

In [41]:
import pandas as pd
import plotly.express as px
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer

In [35]:
df = pd.read_csv('tweet_emotions.csv')
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [36]:
df.drop(columns=['tweet_id'], inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39173 entries, 1 to 39999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  39173 non-null  object
 1   content    39173 non-null  object
dtypes: object(2)
memory usage: 918.1+ KB


In [11]:
df.sentiment.unique()

array(['sadness', 'enthusiasm', 'neutral', 'worry', 'surprise', 'love',
       'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [None]:
df[df['sentiment'] == 'empty'].count()

sentiment    827
content      827
dtype: int64

In [37]:
df.drop(df[df['sentiment'] == 'empty'].index, inplace=True)

In [18]:
emotions = df['sentiment'].unique()
for emotion in emotions:
    print(emotion, end=': ')
    count = df[df['sentiment'] == emotion].count().values[0]
    print(count)

sadness: 5165
enthusiasm: 759
neutral: 8638
worry: 8459
surprise: 2187
love: 3842
fun: 1776
hate: 1323
happiness: 5209
boredom: 179
relief: 1526
anger: 110


In [23]:
emotions_fig = px.bar(
    x=emotions,
    y=[df[df['sentiment'] == emotion].count().values[0] for emotion in emotions],
    labels={'x':'Emotion', 'y':'Count'},
    title='Count of each emotion in the dataset',
    width=700,
    height=400
)
emotions_fig.show()

In [38]:
df.head(25)

Unnamed: 0,sentiment,content
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
5,worry,Re-pinging @ghostridah14: why didn't you go to...
6,sadness,"I should be sleep, but im not! thinking about ..."
7,worry,Hmmm. http://www.djhero.com/ is down
8,sadness,@charviray Charlene my love. I miss you
9,sadness,@kelcouch I'm sorry at least it's Friday?
10,neutral,cant fall asleep


In [29]:
df[df['content'].str.contains('@')].count()

sentiment    18759
content      18759
dtype: int64

In [39]:
df_prov = df[~df['content'].str.contains('@')]
df_prov.count()

sentiment    20414
content      20414
dtype: int64

In [40]:
df_prov.head(10)

Unnamed: 0,sentiment,content
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
6,sadness,"I should be sleep, but im not! thinking about ..."
7,worry,Hmmm. http://www.djhero.com/ is down
10,neutral,cant fall asleep
11,worry,Choked on her retainers
12,sadness,Ugh! I have to beat this stupid song to get to...
14,surprise,Got the news
15,sadness,The storm is here and the electricity is gone


In [42]:
df_prov.to_csv('tweet_emotions_cleaned.csv', index=False)

In [44]:
with open('tweet_emotions_cleaned.csv', 'r') as file:
    data = file.read().split('\n')
# Removing the header from the data.
data = data[1:]
print(data[0])

sadness,Layin n bed with a headache  ughhhh...waitin on your call...


In [46]:
with open('tweet_emotions_cleaned.csv', 'r') as file:
    data = file.read().split('\n')
# Removing the header from the data.
data = data[1:]
snt_analyzer = SentimentIntensityAnalyzer()
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Splitting the data into the respective columns and storing them in a list of dictionaries.
entries = []
for i, line in enumerate(data):
    line = line.split(',', 1)
    if not (len(line) < 2):
       jnl_entry = line[1]
       # Getting the sentiment score of the journal entry.
       sentiment = snt_analyzer.polarity_scores(jnl_entry)
       # Embedding the journal entry.
       embedding = emb_model.encode(jnl_entry)
       entry = {
           'id' : "jnl_" + str(i),
           'emotion' : line[0],
           'journal_entry' : line[1],
           'sentiment_score' : sentiment['compound'],
           'embedding' : embedding.tolist()
       }
       entries.append(entry)
# Writing the data to a json file.
with open('Emotions_dataset_x.json', 'w') as file:
    file.write(json.dumps(entries, indent=4))

# Cautious with code above here!!!

In [52]:
df_json = pd.read_json('Emotions_dataset_x.json')
df_json.head(10)

Unnamed: 0,id,emotion,journal_entry,sentiment_score,embedding
0,jnl_0,sadness,Layin n bed with a headache ughhhh...waitin o...,0.0,"[-0.08792363852262401, -0.046848628669977, -0...."
1,jnl_1,sadness,Funeral ceremony...gloomy friday...,-0.3612,"[0.028388714417815004, 0.10316153615713101, 0...."
2,jnl_2,enthusiasm,wants to hang out with friends SOON!,0.5255,"[-0.0038604605942960005, -0.08634921908378601,..."
3,jnl_3,sadness,"""I should be sleep, but im not! thinking about...",-0.6458,"[-0.034071017056703005, 0.05695656314492201, -..."
4,jnl_4,worry,Hmmm. http://www.djhero.com/ is down,0.0,"[0.007188264280557, -0.064834922552108, -0.060..."
5,jnl_5,neutral,cant fall asleep,0.0,"[0.027548283338546004, -0.037431392818689006, ..."
6,jnl_6,worry,Choked on her retainers,-0.4767,"[-0.050996229052543, 0.021203704178333, 0.0118..."
7,jnl_7,sadness,Ugh! I have to beat this stupid song to get to...,-0.8856,"[-0.015514682047069002, -0.039723295718431, 0...."
8,jnl_8,surprise,Got the news,0.0,"[-0.051133878529071, 0.06758019328117301, 0.04..."
9,jnl_9,sadness,The storm is here and the electricity is gone,0.0,"[-0.005294140428304, 0.11565639078617002, 0.09..."


In [51]:
df_snt_nut = df_json[df_json['sentiment_score'] == 0]
df_snt_nut['emotion'].unique()

array(['sadness', 'worry', 'neutral', 'surprise', 'relief', 'hate',
       'happiness', 'enthusiasm', 'love', 'fun', 'anger', 'boredom'],
      dtype=object)

In [53]:
df_snt_pos = df_json[df_json['sentiment_score'] > 0]
df_snt_pos['emotion'].unique()

array(['enthusiasm', 'sadness', 'neutral', 'happiness', 'worry', 'love',
       'fun', 'hate', 'surprise', 'relief', 'boredom', 'anger'],
      dtype=object)

In [54]:
df_snt_pos.head(10)

Unnamed: 0,id,emotion,journal_entry,sentiment_score,embedding
2,jnl_2,enthusiasm,wants to hang out with friends SOON!,0.5255,"[-0.0038604605942960005, -0.08634921908378601,..."
11,jnl_11,sadness,How are YOU convinced that I have always wante...,0.5574,"[-0.08251373469829501, 0.00045864749699800006,..."
21,jnl_21,neutral,feels strong contractions but wants to go out....,0.2846,"[-0.035520184785127, -0.08468713611364301, 0.0..."
25,jnl_25,happiness,mmm much better day... so far! it's still quit...,0.4926,"[-0.06708671897649701, -0.018208563327789, 0.0..."
28,jnl_28,worry,"""Bed!!!!!... its time,..... hope i go to schoo...",0.4655,"[-0.027786055579781, 0.039769552648067, 0.0704..."
31,jnl_31,neutral,Chocolate milk is so much better through a str...,0.2212,"[0.06041004136204701, -0.074816480278968, 0.03..."
34,jnl_34,enthusiasm,"""bed...sorta. today was good, sara has strep t...",0.6486,"[-0.042720355093479004, 0.021938582882285, 0.0..."
35,jnl_35,sadness,diesel yaris... 70mpg so sad its not availabl...,0.0281,"[0.0059376135468480005, 0.012858248315751001, ..."
40,jnl_40,happiness,"""So great to see Oin &amp; Cynthia. So happy....",0.8861,"[0.007940583862364, 0.09896065294742501, 0.014..."
42,jnl_42,neutral,Brothers Bloom won't be opening this weekend i...,0.4939,"[0.022833626717329, -0.00047447753604500006, 0..."
