In [25]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
import re
import plotly.express as px

# Fetch the NLTK stopword corpus; preprocess_text reads the English list
# from it via stopwords.words('english').
nltk.download('stopwords')

# Load the "glove-wiki-gigaword-100" vectors through gensim's downloader.
# get_glove_embedding looks words up in this module-level object.
glove_model = api.load("glove-wiki-gigaword-100")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BS304\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
def preprocess_text(text, stop_words=None):
    """Normalize raw text into a list of unique, stopword-free tokens.

    The text is lowercased, digit runs and punctuation (anything that is
    neither a word character nor whitespace) are deleted, whitespace is
    collapsed, and the result is split on spaces. Stopwords are then removed
    and duplicates dropped.

    Args:
        text: Input string.
        stop_words: Optional iterable of words to filter out. When omitted,
            NLTK's English stopword list is used.

    Returns:
        list[str]: Unique tokens in order of first appearance.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)

    # Convert to lowercase
    text = text.lower()
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = [word for word in text.split() if word not in stop_words]

    # De-duplicate with dict.fromkeys rather than set(): a set of strings has
    # no stable order across interpreter sessions, which would shuffle the
    # rows handed to clustering / t-SNE and make results non-reproducible.
    return list(dict.fromkeys(tokens))


In [27]:
def get_glove_embedding(word):
    """Look up the embedding of ``word`` in the module-level ``glove_model``.

    Args:
        word: Token to look up.

    Returns:
        The model's vector when ``word`` is in the vocabulary; otherwise an
        all-zero vector with the model's dimensionality.
    """
    if word in glove_model:
        return glove_model[word]
    # Size the fallback from the loaded model instead of a literal 100 so the
    # vectors stay stackable if a model with another dimensionality is loaded.
    return np.zeros(glove_model.vector_size)

def get_narrative_vectors(narratives):
    """Build a token -> embedding mapping for a narrative.

    The text is tokenized with ``preprocess_text`` and each resulting token is
    embedded with ``get_glove_embedding``.

    Args:
        narratives: Narrative text as a single string.

    Returns:
        dict mapping each token to its embedding vector, in the order the
        tokens were returned by ``preprocess_text``.
    """
    vectors_by_token = {}
    for token in preprocess_text(narratives):
        vectors_by_token[token] = get_glove_embedding(token)
    return vectors_by_token


In [28]:
def calculate_similarity_matrix(words, embeddings):
    """Return the pairwise cosine similarity between the rows of ``embeddings``.

    Args:
        words: Labels for the rows; accepted but not used by this function.
        embeddings: 2-D collection with one embedding per row.

    Returns:
        The matrix produced by ``cosine_similarity(embeddings)``.
    """
    return cosine_similarity(embeddings)

def cluster_words(embeddings, num_clusters=10):
    """Assign each embedding to a KMeans cluster.

    Args:
        embeddings: 2-D collection with one embedding per row.
        num_clusters: Desired number of clusters. If there are fewer rows than
            this, the number of rows is used instead.

    Returns:
        Array of cluster labels, one per row of ``embeddings``.
    """
    # KMeans rejects n_clusters > n_samples, so a short text would crash with
    # the default of 10; cap the request at the number of available rows.
    n_clusters = min(num_clusters, len(embeddings))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    return kmeans.fit_predict(embeddings)


In [29]:
def plot_interactive_heatmap(words, similarity_matrix):
    """Display an interactive Plotly heatmap of a word similarity matrix.

    Args:
        words: Labels used for both the rows and the columns.
        similarity_matrix: Square matrix of similarity scores aligned with
            ``words``.
    """
    labelled_scores = pd.DataFrame(similarity_matrix, columns=words, index=words)
    heatmap = px.imshow(labelled_scores, aspect='auto', color_continuous_scale='RdBu')
    heatmap.update_layout(title='Word Similarity Heatmap')
    heatmap.show()


In [30]:
def plot_interactive_3d_tsne(words, embeddings, clusters):
    """Display an interactive 3-D t-SNE scatter plot of word embeddings.

    Args:
        words: Labels drawn next to each point.
        embeddings: 2-D array with one embedding per row, aligned with
            ``words``.
        clusters: Cluster label per row; used to colour the points.
    """
    # t-SNE needs perplexity < n_samples. Keep scikit-learn's default of 30
    # when there is enough data, otherwise shrink it so small inputs still plot.
    n_samples = len(embeddings)
    perplexity = min(30.0, max(n_samples - 1, 1))
    tsne = TSNE(n_components=3, perplexity=perplexity, random_state=0)
    reduced_embeddings = tsne.fit_transform(embeddings)

    # Cluster ids are categories, not magnitudes. Plotly Express maps numeric
    # columns to a continuous colour bar, so pass them as strings to get one
    # discrete colour (and legend entry) per cluster.
    cluster_labels = [str(label) for label in clusters]
    cluster_order = [str(label) for label in sorted(set(clusters))]

    df = pd.DataFrame({
        'x': reduced_embeddings[:, 0],
        'y': reduced_embeddings[:, 1],
        'z': reduced_embeddings[:, 2],
        'word': words,
        'cluster': cluster_labels
    })

    fig = px.scatter_3d(
        df, x='x', y='y', z='z', text='word', color='cluster',
        category_orders={'cluster': cluster_order},
        title='Word Embeddings 3D t-SNE Plot'
    )
    fig.update_traces(marker=dict(size=5), textposition='top center')
    fig.update_layout(
        width=1000,
        height=800
    )
    fig.show()


In [31]:
# Sample narrative: a multi-paragraph description of one user's January 2024
# spending and earnings. It is passed as a single string to
# get_narrative_vectors, which tokenizes and embeds its words.
narratives = """
The user’s financial transactions and analysis data for the month of January 2024 show specific patterns in spending and earnings behavior, influenced by specific events and timing within the month.

At the start of the month, the user’s spending is primarily focused on fixed expenses such as rent, utility bills, and insurance payments. For example, on January 2nd, there was a payment of $1200 for rent and $150 for utilities. This period is characterized by predictable, high-value transactions.

Mid-month transactions show discrete spending. From January 10th to January 20th, the user made several purchases at restaurants and retail stores. Notable transactions include a $75.50 dinner at a downtown restaurant on January 15th and a $200 purchase at a clothing store on January 18th.

Towards the end of the month, there is a noticeable reduction in spending. From January 25th onwards, the transactions are primarily related to groceries and transportation, with amounts significantly lower than mid-month. For instance, on January 26th, there was a $50 grocery shopping transaction and a $30 refueling transaction.

Earnings patterns also show interesting trends. Early in the month, the user received a salary payment of $3000 on January 5th. Mid-month earnings include a $500 freelance project payment on January 15th and a $200 sale of old electronics on January 18th. There are no significant earnings recorded towards the end of the month.

In summary, the user’s spending is high at the beginning due to fixed expenses, peaks mid-month with discrete spending, and decreases towards the end. Earnings follow a predictable pattern with major income sources early and mid-month. This analysis suggests that the user might benefit from budgeting strategies to manage discretionary spending mid-month.
"""

In [32]:
# Embed every unique token of the narrative. The dict's insertion order fixes
# the row order shared by `words`, `embeddings` and everything derived below.
narrative_vectors = get_narrative_vectors(narratives)
words = list(narrative_vectors)
embeddings = np.array(list(narrative_vectors.values()))

# Pairwise cosine similarity between all word vectors
similarity_matrix = calculate_similarity_matrix(words, embeddings)

# One KMeans cluster label per word
clusters = cluster_words(embeddings)

In [33]:
# Plot interactive heatmap of the word-by-word cosine similarity matrix
# (rows and columns are labelled with `words`).
plot_interactive_heatmap(words, similarity_matrix)

In [34]:
# Plot 3D t-SNE visualization with clusters: each word is projected to three
# dimensions and coloured by its entry in `clusters`.
plot_interactive_3d_tsne(words, embeddings, clusters)

In [35]:
# Spot-check a single pair: read its cosine similarity out of the matrix.
# words.index raises ValueError if either word is missing from `words`.
word1 = "spending"
word2 = "earnings"
idx1, idx2 = (words.index(w) for w in (word1, word2))
similarity_score = similarity_matrix[idx1][idx2]
print(f"Similarity score between '{word1}' and '{word2}': {similarity_score}")

# Dump both raw vectors for a closer look (optional)
print(f"Embedding for '{word1}': {embeddings[idx1]}")
print(f"Embedding for '{word2}': {embeddings[idx2]}")

Similarity score between 'spending' and 'earnings': 0.5340098244622112
Embedding for 'spending': [-0.0029174   0.52859998  0.40695    -0.42234999  0.32203999 -0.45745
 -0.95889997  0.45104     0.16066    -0.020425    0.21098    -0.28920001
 -0.56001002 -0.35977    -0.0078353  -0.13479    -0.31555     0.54291999
  0.20197     0.003839    0.43428999  0.069572   -0.28909999  0.30272999
 -0.99988002 -0.21137001 -0.68454999 -0.69090003 -1.16960001 -0.46623999
  0.033396    0.26767999 -0.072978   -0.48705    -0.69981998  0.77079999
 -0.39142001 -0.50326997 -0.31799001  0.42339    -0.05432    -0.93972999
  0.1719      1.11240005 -0.14650001 -0.50080001  0.018196   -0.50959003
  0.12656    -1.11020005  0.51003999 -0.18393999 -0.18445     1.1638
 -0.003409   -2.33529997  0.66048002 -0.59214002  1.34230006  0.64442998
  0.39320001 -0.18138    -0.45379001 -0.18861     0.25576001  0.70766002
 -0.057211   -0.12537     1.04949999 -0.46741     0.68848002 -0.56238002
 -0.62643999  0.35856    -0.534160