## Mining the synthetic dataset

In [None]:
import pandas as pd
import numpy as np
import json
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer
import random
import faiss

In [None]:
# Testing the sentiment analysis on the emotions dataset
analyzer = SentimentIntensityAnalyzer()

with open('Emotions_dataset.csv') as file:
    data = file.read().split('\n')
# Randomly select a journal entry.
# Only rows that split into all three comma-separated fields are candidates,
# and the header row (data[0]) is skipped, so neither the header nor the
# trailing blank line can be drawn. `random.choice` also avoids the off-by-one
# of `random.randint(0, len(data))`, whose upper bound is inclusive.
candidates = [row for row in data[1:] if len(row.split(',', 2)) == 3]
line = random.choice(candidates)
print(line)
# Third field is the journal text; it may itself contain commas.
line = line.split(',', 2)
jnl_entry = line[2]
sentiment = analyzer.polarity_scores(jnl_entry)
print(sentiment['compound'])


In [None]:
# Smoke-test the sentence-embedding model on the journal entry sampled above.
emb_model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2'
)
# Encode the single string and report the shape of the resulting vector.
embedding = emb_model.encode(jnl_entry)
print(embedding.shape)

# Creating JSON objects from the dataset

In [None]:
# Build Emotions_dataset.json: one record per CSV row holding the id, the
# emotion label, the journal text, its VADER compound score and its embedding.
with open('Emotions_dataset.csv', 'r') as file:
    data = file.read().split('\n')
# Removing the header from the data.
data = data[1:]
snt_analyzer = SentimentIntensityAnalyzer()
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Splitting the data into the respective columns and storing them in a list of dictionaries.
entries = []
for line in data:
    # At most three fields: id, emotion, journal text (the text may contain commas).
    line = line.split(',', 2)
    # All three fields are required. The previous guard (`len(line) < 2`) let
    # two-field rows through and then failed on `line[2]`.
    if len(line) < 3:
        continue
    jnl_entry = line[2]
    # Getting the sentiment score of the journal entry.
    sentiment = snt_analyzer.polarity_scores(jnl_entry)
    # Embedding the journal entry.
    embedding = emb_model.encode(jnl_entry)
    entries.append({
        'id' : "jnl_" + line[0],
        'emotion' : line[1],
        'journal_entry' : jnl_entry,
        'sentiment_score' : sentiment['compound'],
        # Convert the array to a plain list so it is JSON serialisable.
        'embedding' : embedding.tolist()
    })
# Writing the data to a json file.
with open('Emotions_dataset.json', 'w') as file:
    json.dump(entries, file, indent=4)

# Storing the embedded data in the vector database

In [None]:
# Testing the vector data storage
DIMENSIONS = 384
# Creating an instance of the faiss index.
index = faiss.IndexFlatL2(DIMENSIONS)
with open('Emotions_dataset.json', 'r') as file:
    data = json.load(file)
# Add only the first record's embedding as a (1, DIMENSIONS) matrix.
line = data[0]
embedding = line['embedding']
# FAISS operates on float32 matrices; a list of Python floats would otherwise
# become a float64 array, so the dtype is set explicitly.
index.add(np.array([embedding], dtype=np.float32))

In [None]:
DIMENSIONS = 384
# Creating an instance of the faiss index.
index = faiss.IndexFlatL2(DIMENSIONS)
with open('Emotions_dataset.json', 'r') as file:
    jnl_entries = json.load(file)
# Row i of the matrix is the embedding of jnl_entries[i], so the positions
# returned by a search index straight into jnl_entries.
# FAISS operates on float32 matrices, so the dtype is set explicitly rather
# than relying on an implicit conversion from float64.
embeddings = np.array([entry['embedding'] for entry in jnl_entries], dtype=np.float32)
index.add(embeddings)

def search(query, index, emb_model, jnl_entries, k=10):
    '''Return the journal entries most similar to the user's query.

    Parameters:
        query: text to look up.
        index: object exposing ``search(matrix, k)`` that returns
            ``(distances, indices)``, e.g. a faiss index.
        emb_model: object exposing ``encode(text)`` that returns the query vector.
        jnl_entries: list of dicts with the keys 'journal_entry', 'emotion'
            and 'sentiment_score', in the same order as the indexed vectors.
        k: number of neighbours to request from the index.

    Returns:
        ``(results, distances)``. ``results`` is a list of dicts with the keys
        'journal_entry', 'emotion' and 'sentiment_score'. ``distances`` is
        returned exactly as the index produced it, so it still has k columns
        even when fewer than k results are kept.
    '''
    # faiss works on float32 row matrices: one row per query vector.
    query_embedding = np.asarray(emb_model.encode(query), dtype=np.float32).reshape(1, -1)
    distances, indices = index.search(query_embedding, k)
    # faiss pads with -1 when it finds fewer than k neighbours. Indexing
    # jnl_entries with -1 would silently return the last entry, so those
    # placeholders are skipped.
    results = [
        {
            'journal_entry': jnl_entries[i]['journal_entry'],
            'emotion': jnl_entries[i]['emotion'],
            'sentiment_score': jnl_entries[i]['sentiment_score'],
        }
        for i in indices[0]
        if i >= 0
    ]
    return results, distances

In [None]:
# Example query: embed it with the MiniLM sentence-transformer and fetch the
# nearest journal entries (default k=10) together with their distances.
user_query = "Because ridiculously attractive people also have self esteem issues and depression Guess what honey The world sucks for everyone There’s no escape We will all die in misery and alone"
# The query is encoded with the same model name used when the stored embeddings were created.
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
search_results, distances  = search(user_query, index, emb_model, jnl_entries)

# Using the index and the retrieved context to generate a response

In [None]:
%pip install langchain_community tiktoken langchain-openai langchainhub langchain

In [None]:
%pip install langchain_core

In [None]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

In [None]:
# Testing the generation of the prompt
# Closed set of labels the model is allowed to answer with.
EMOTIONAL_STATES = [
    "Hopeful",
    "Anxious",
    "Inspired",
    "Overwhelmed",
    "Peaceful",
    "Frustrated",
    "Curious",
    "Uncertain",
    "Hopelessness",
]
# Prompt template: retrieved context, the entry to classify, and the label set.
template = """Give the most relevant emotion to the following journal entry based on the sentiment score and the mapped emotions from the given context.
context: {context}
journal entry: {journal_entry}
Note: only choose from the following emotions and only output that emotion: {EMOTIONAL_STATES}"""
prompt = ChatPromptTemplate.from_template(template)
# temperature 0 keeps the classification as repeatable as the API allows.
llm = ChatOpenAI(model_name = "gpt-4o-mini", temperature = 0)
# Pipe the filled-in prompt straight into the chat model.
chain = prompt | llm
response = chain.invoke(
    {
        'context': search_results,
        'journal_entry': user_query,
        'EMOTIONAL_STATES': EMOTIONAL_STATES,
    }
)
# The label is the text content of the returned chat message.
emotion = response.content
emotion

In [None]:
# Improved version of prompt generation
# TODO: find a way to provide the context to the model with out retrieving it separately
# Build a LangChain FAISS store from the journal entries and persist it to the
# 'emotions_vector_store' folder.
with open('Emotions_dataset.json', 'r') as file:
    jnl_entries = json.load(file)
# Keep the label and sentiment score as metadata: the prompt asks the model to
# use the "mapped emotions" of the retrieved context, which plain text alone
# does not carry.
documents = [
    Document(
        page_content=entry['journal_entry'],
        metadata={
            'id': entry['id'],
            'emotion': entry['emotion'],
            'sentiment_score': entry['sentiment_score'],
        },
    )
    for entry in jnl_entries
]
# all-MiniLM-L6-v2 is not a BGE model. HuggingFaceBgeEmbeddings prepends a
# BGE-specific instruction to every query, so the plain sentence-transformers
# wrapper is used instead.
emb_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
vector_store = FAISS.from_documents(documents=documents, embedding=emb_model)
vector_store.save_local('emotions_vector_store')

In [None]:
# NOTE(security): allow_dangerous_deserialization=True unpickles the stored
# docstore. Only load folders this notebook created itself; never load a
# vector store obtained from an untrusted source.
loaded_vectors = FAISS.load_local('emotions_vector_store', embeddings=emb_model, allow_dangerous_deserialization=True)
# The number of neighbours must go through search_kwargs; a bare `k=10`
# keyword does not change how many documents the retriever returns.
retriever = loaded_vectors.as_retriever(search_kwargs={'k': 10})
llm = ChatOpenAI(model_name = "gpt-4o-mini", temperature = 0)


def _format_docs(docs):
    '''Render retrieved documents as one line each for the prompt context.

    The stored emotion and sentiment score are prepended when the document
    metadata carries them; otherwise only the text is used.
    '''
    rendered = []
    for doc in docs:
        label = doc.metadata.get('emotion')
        score = doc.metadata.get('sentiment_score')
        prefix = ''
        if label is not None:
            prefix += f"[emotion: {label}] "
        if score is not None:
            prefix += f"[sentiment_score: {score}] "
        rendered.append(prefix + doc.page_content)
    return '\n'.join(rendered)


def _retrieve_context(inputs):
    '''Fetch and format the documents nearest to inputs['journal_entry'].'''
    return _format_docs(retriever.invoke(inputs['journal_entry']))


# The chain now performs the retrieval itself. Previously the retriever object
# was passed as `context`, so the prompt contained its repr rather than any
# retrieved text.
rag_chain = (
    RunnablePassthrough.assign(context=_retrieve_context)
    | prompt
    | llm
    | StrOutputParser()
)
rag_chain.invoke({'journal_entry': user_query, 'EMOTIONAL_STATES': EMOTIONAL_STATES})


### Only run the code above if absolutely necessary, and only do so selectively!!!

# Cleaning and preprocessing the new dataset

In [10]:
import pandas as pd
import plotly.express as px

In [None]:
# Load the raw tweet-emotion CSV and preview its first rows.
df = pd.read_csv('tweet_emotions.csv')
df.head()

In [None]:
# Drop the tweet_id column. errors='ignore' makes the cell safe to re-run: the
# in-place drop raised KeyError once the column was already gone.
df = df.drop(columns=['tweet_id'], errors='ignore')

In [None]:
# Print the column dtypes and non-null counts of the frame.
df.info()

In [None]:
# Distinct labels in the 'sentiment' column, in order of first appearance.
df['sentiment'].unique()

In [None]:
# Non-null count per column for the rows labelled 'empty'.
df.loc[df['sentiment'].eq('empty')].count()

In [None]:
# Remove every row labelled 'empty' from df, in place.
empty_label_rows = df.index[df['sentiment'] == 'empty']
df.drop(index=empty_label_rows, inplace=True)

In [None]:
# Print the number of rows for each label, in order of first appearance.
emotions = df['sentiment'].unique()
# Count rows per label in a single pass. The previous
# `df[...].count().values[0]` rescanned the frame once per label and counted
# non-null cells of whichever column happened to be first.
emotion_counts = df['sentiment'].value_counts()
for emotion in emotions:
    # .get covers a label that value_counts omits (e.g. NaN).
    print(f"{emotion}: {emotion_counts.get(emotion, 0)}")

In [None]:
# Bar chart of the number of rows per emotion label.
# The counts are computed once and aligned to the order of `emotions`. The
# previous per-label `df[...].count().values[0]` rescanned the frame for every
# bar and depended on which column happened to be first.
label_counts = df['sentiment'].value_counts().reindex(emotions, fill_value=0)
emotions_fig = px.bar(
    x=emotions,
    y=label_counts.to_numpy(),
    labels={'x':'Emotion', 'y':'Count'},
    title='Count of each emotion in the dataset',
    width=700,
    height=400
)
emotions_fig.show()

In [None]:
# Preview the first 25 rows of the frame.
df.head(25)

In [None]:
# Non-null count per column for the tweets whose text contains an '@'.
has_mention = df['content'].str.contains('@')
df[has_mention].count()

In [None]:
# Keep only the tweets whose text contains no '@', then show how many remain.
without_mention = ~df['content'].str.contains('@')
df_prov = df.loc[without_mention]
df_prov.count()

In [None]:
# Preview the first 10 rows of the filtered frame.
df_prov.head(10)

In [None]:
# Persist the filtered frame; index=False keeps the row index out of the CSV.
df_prov.to_csv('tweet_emotions_cleaned.csv', index=False)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Reload the cleaned tweets and cut them 80/20 into train and test CSV files.
df_cleaned = pd.read_csv('tweet_emotions_cleaned.csv')
# Split row positions rather than the frame itself; the fixed random_state
# makes the partition repeatable.
indices = np.arange(len(df_cleaned))
train_indices, test_indices = train_test_split(
    indices,
    test_size=0.2,
    random_state=42,
)
train_df, test_df = df_cleaned.iloc[train_indices], df_cleaned.iloc[test_indices]
for frame, path in (
    (train_df, 'tweet_emotions_train.csv'),
    (test_df, 'tweet_emotions_test.csv'),
):
    frame.to_csv(path, index=False)

In [6]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer

In [8]:
import json

In [None]:
# Peek at the first data line of the cleaned CSV.
with open('tweet_emotions_cleaned.csv', 'r') as file:
    # Split into lines and discard the header in one step.
    data = file.read().split('\n')[1:]
print(data[0])

In [9]:
# Build Emotions_dataset_vader_am.json from the training split: one record per
# tweet with its label, text, VADER compound score and MiniLM embedding.
#
# The CSV was written by pandas, which quotes fields containing commas, quotes
# or newlines. Splitting the raw text on '\n' and ',' leaves the quote
# characters in the text and breaks multi-line tweets apart, so the file is
# parsed with pandas. dtype=str and keep_default_na=False keep every cell as
# the literal text (e.g. a tweet that is just "NA").
train_rows = pd.read_csv('tweet_emotions_train.csv', dtype=str, keep_default_na=False)
snt_analyzer = SentimentIntensityAnalyzer()
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
labels = train_rows['sentiment'].tolist()
texts = train_rows['content'].tolist()
# Encode all tweets in one batched call instead of one model call per row.
embeddings = emb_model.encode(texts, show_progress_bar=True)
# Storing one dictionary per tweet; the id is the row position in the train split.
entries = []
for i, (label, jnl_entry) in enumerate(zip(labels, texts)):
    # Getting the sentiment score of the journal entry.
    sentiment = snt_analyzer.polarity_scores(jnl_entry)
    entries.append({
        'id' : "jnl_" + str(i),
        'emotion' : label,
        'journal_entry' : jnl_entry,
        'sentiment_score' : sentiment['compound'],
        # Convert the array row to a plain list so it is JSON serialisable.
        'embedding' : embeddings[i].tolist()
    })
# Writing the data to a json file.
with open('Emotions_dataset_vader_am.json', 'w') as file:
    json.dump(entries, file, indent=4)

# Be cautious with the code above this point!!!

In [None]:
# Load the VADER/MiniLM dataset written to JSON and preview the first 10 records.
df_json = pd.read_json('Emotions_dataset_vader_am.json')
df_json.head(10)

TODO: confusion matrix -> histogram

In [None]:
# Text of the record with index label 0.
df_json.loc[0, 'journal_entry']

In [None]:
# Labels that occur among entries whose sentiment score is exactly 0.
zero_score = df_json['sentiment_score'] == 0
df_snt_neu = df_json[zero_score]
df_snt_neu['emotion'].unique()

In [None]:
# Labels that occur among entries whose sentiment score is above 0.
positive_score = df_json['sentiment_score'] > 0
df_snt_pos = df_json[positive_score]
df_snt_pos['emotion'].unique()

In [None]:
# Preview the first 10 entries with a positive sentiment score.
df_snt_pos.head(10)

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F

# Load the sentiment classifier and its matching tokenizer from the same checkpoint.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def analyze_sentiment(text):
    '''Score one text with the module-level `tokenizer` and `model`.

    Returns a dict with the three class probabilities (negative, neutral,
    positive), the raw score (positive minus negative) and the adjusted score
    (raw score scaled by one minus the neutral probability).
    '''
    # Inputs longer than 512 tokens are truncated.
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(**encoded).logits
        probabilities = F.softmax(logits, dim=1)
    # Class order used here: negative (0), neutral (1), positive (2).
    neg_score, neu_score, pos_score = (float(p) for p in probabilities[0][:3])
    raw_score = pos_score - neg_score
    # Shrink the score towards 0 as the neutral probability grows.
    adjusted_score = raw_score * (1 - neu_score)
    return dict(
        raw_score=raw_score,
        adjusted_score=adjusted_score,
        positive_score=pos_score,
        neutral_score=neu_score,
        negative_score=neg_score,
    )

# Example usage: score a single tweet and print the full score dictionary.
text = "Layin n bed with a headache  ughhhh...waitin on your call..."
result = analyze_sentiment(text)
print(result)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F 
import openai

In [None]:
# Read the training split as raw lines, dropping the header in the same step.
with open('tweet_emotions_train.csv', 'r') as file:
    data = file.read().split('\n')[1:]
# Setting up the sentiment analysis model: classifier and tokenizer share one checkpoint.
snt_model = "cardiffnlp/twitter-roberta-base-sentiment"
snt_analyzer = AutoModelForSequenceClassification.from_pretrained(snt_model)
tokenizer = AutoTokenizer.from_pretrained(snt_model)
# Setting up the embedding model: only the model name, passed to the OpenAI API later.
emb_model = "text-embedding-3-small"

def analyze_sentiment(text):
    '''Score one text with the module-level `tokenizer` and `snt_analyzer`.

    Returns a dict holding the raw score (positive minus negative
    probability), the adjusted score (raw score scaled by one minus the
    neutral probability) and the three class probabilities.
    '''
    # Tokenize, truncating anything beyond 512 tokens.
    model_inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Forward pass without gradient tracking.
    with torch.no_grad():
        class_probs = F.softmax(snt_analyzer(**model_inputs).logits, dim=1)
    # Class order used here: negative (0), neutral (1), positive (2).
    neg_score, neu_score, pos_score = class_probs[0].tolist()[:3]
    raw_score = pos_score - neg_score
    # Adjust for uncertainty: a high neutral probability pulls the score to 0.
    adjusted_score = raw_score * (1 - neu_score)
    # Returning all the values for testing purposes, remove unnecessary values later
    scores_by_name = {}
    scores_by_name["raw_score"] = raw_score
    scores_by_name["adjusted_score"] = adjusted_score
    scores_by_name["positive_score"] = pos_score
    scores_by_name["neutral_score"] = neu_score
    scores_by_name["negative_score"] = neg_score
    return scores_by_name

# Build Emotions_dataset_cardiff_oai.json: one record per training tweet with
# its label, text, adjusted RoBERTa sentiment score and OpenAI embedding.
#
# The CSV was written by pandas, which quotes fields containing commas, quotes
# or newlines. Hand-splitting on '\n' and ',' leaves the quote characters in
# the text and breaks multi-line tweets apart, so the file is parsed with
# pandas. dtype=str and keep_default_na=False keep every cell as literal text.
train_rows = pd.read_csv('tweet_emotions_train.csv', dtype=str, keep_default_na=False)
# (row position, label, text) for every tweet that has text to embed. The row
# position is kept so ids stay tied to the position in the train split.
rows = [
    (i, label, text)
    for i, (label, text) in enumerate(zip(train_rows['sentiment'], train_rows['content']))
    if text.strip()
]
# Number of tweets sent per embeddings request; one request per row is far
# slower for the same result.
EMBED_BATCH_SIZE = 256
entries = []
for start in range(0, len(rows), EMBED_BATCH_SIZE):
    batch = rows[start:start + EMBED_BATCH_SIZE]
    # Embedding the journal entries of this batch in a single request.
    emb_response = openai.embeddings.create(
        input=[text for _, _, text in batch],
        model=emb_model
    )
    # Each returned item carries the position of its input; sort on it so
    # embeddings are paired with the right tweet.
    batch_embeddings = sorted(emb_response.data, key=lambda item: item.index)
    for (i, label, jnl_entry), item in zip(batch, batch_embeddings):
        # Getting the sentiment score of the journal entry.
        results = analyze_sentiment(jnl_entry)
        # Storing the data in a dictionary
        entries.append({
            'id' : "jnl_" + str(i),
            'emotion' : label,
            'journal_entry' : jnl_entry,
            'sentiment_score' : results.get('adjusted_score'),
            'embedding' : item.embedding
        })
# Writing the data to a json file.
with open('Emotions_dataset_cardiff_oai.json', 'w') as file:
    json.dump(entries, file, indent=4)

In [14]:
# Load the dataset scored with the Cardiff RoBERTa model and embedded via OpenAI.
df_cardiff = pd.read_json('Emotions_dataset_cardiff_oai.json')

In [15]:
# First 10 records labelled 'neutral', to eyeball their sentiment scores.
df_cardiff.loc[df_cardiff['emotion'].eq('neutral')].head(10)

Unnamed: 0,id,emotion,journal_entry,sentiment_score,embedding
1,jnl_1,neutral,is installing the Iphone and Ipod touch sdk 2....,0.039998,"[0.031230790540575003, -0.002684272127225, -0...."
3,jnl_3,neutral,I have to go to work now.,-0.25922,"[-0.0018639501649880001, 0.037988256663084, -0..."
4,jnl_4,neutral,Salad from krogers... I was hungry.,0.035736,"[-0.052385926246643004, -0.029814580455422002,..."
12,jnl_12,neutral,Just work up,0.079786,"[0.0074366098269820005, 0.021141819655895, -0...."
14,jnl_14,neutral,I hate funerals.,-0.94505,"[-0.005770623218268001, 0.003911630250513, -0...."
24,jnl_24,neutral,Miss Cauzinhoooo already,-0.188362,"[0.046856377273797004, -0.0016631120815870002,..."
26,jnl_26,neutral,Same Difference Today going to go and have a ...,0.150722,"[-0.00887294486165, 0.006200557108968, -0.0235..."
41,jnl_41,neutral,NOT excited for 32 people reservation in the m...,0.155015,"[0.023313973098993003, 0.025849299505352002, -..."
50,jnl_50,neutral,character designs complete! .... in about a we...,0.364673,"[0.031505569815635, 0.012621497735381002, -0.0..."
53,jnl_53,neutral,today is a busy day. exhausting!,-0.316422,"[-0.005164267960935, 0.018428282812237, -0.047..."


In [12]:
# Load the dataset stored in Emotions_dataset_distil_oai.json (the cell that wrote it is not in this notebook).
df_distil = pd.read_json("Emotions_dataset_distil_oai.json")

In [13]:
# First 10 records labelled 'neutral', to eyeball their sentiment scores.
df_distil.loc[df_distil['emotion'].eq('neutral')].head(10)

Unnamed: 0,id,emotion,journal_entry,sentiment_score,embedding
1,jnl_1,neutral,is installing the Iphone and Ipod touch sdk 2....,-0.510728,"[0.031205531209707003, -0.002723266137763, -0...."
3,jnl_3,neutral,I have to go to work now.,-0.991278,"[-0.0018639501649880001, 0.037988256663084, -0..."
4,jnl_4,neutral,Salad from krogers... I was hungry.,-0.986555,"[-0.052385926246643004, -0.029814580455422002,..."
12,jnl_12,neutral,Just work up,0.999224,"[0.0074681695550680004, 0.021164717152714, -0...."
14,jnl_14,neutral,I hate funerals.,-0.993923,"[-0.005717813502997, 0.0039062288124110003, -0..."
24,jnl_24,neutral,Miss Cauzinhoooo already,-0.979553,"[0.046856377273797004, -0.0016631120815870002,..."
26,jnl_26,neutral,Same Difference Today going to go and have a ...,0.473131,"[-0.00887294486165, 0.006200557108968, -0.0235..."
41,jnl_41,neutral,NOT excited for 32 people reservation in the m...,-0.994857,"[0.023314835503697003, 0.025850255042314002, -..."
50,jnl_50,neutral,character designs complete! .... in about a we...,-0.44069,"[0.031540501862764005, 0.012770409695804001, -..."
53,jnl_53,neutral,today is a busy day. exhausting!,-0.998356,"[-0.005177229642868, 0.018358102068305, -0.047..."
