In [24]:
# Read the cleaned CSV from ../data/processed into `df` (always reloads from disk)
import pandas as pd
from pathlib import Path

data_path = Path('..', 'data', 'processed', 'cleaned.csv')

if data_path.exists():
    df = pd.read_csv(data_path)
else:
    # Fail early and report the resolved absolute path so the missing file is easy to locate
    raise FileNotFoundError(f'Expected cleaned CSV at {data_path.resolve()}, adjust path or generate the file first')

print('Loaded cleaned data:', df.shape)
df.head()

Loaded cleaned data: (3, 2)


Unnamed: 0,Character,Cleaned_Text
0,raiden_lore,hello salutation needed exalted status shall d...
1,venti_lore,hello yawn refreshing sleep ah traveler meet d...
2,zhongli_lore,hello new contract okay im still leave accompa...


In [25]:
# Score every row of `df['Cleaned_Text']` with TextBlob's sentiment polarity
from textblob import TextBlob
import pandas as pd


def _polarity(text):
    """Return the `sentiment.polarity` value TextBlob reports for ``text``."""
    return TextBlob(text).sentiment.polarity


# Guard: the scoring below reads the 'Cleaned_Text' column of `df`
if 'Cleaned_Text' not in df.columns:
    raise KeyError("DataFrame missing 'Cleaned_Text' column")

# Missing values become '' and every entry is cast to str before it reaches TextBlob
cleaned_text = df['Cleaned_Text'].fillna('').astype(str)
df['Sentiment'] = cleaned_text.apply(_polarity)

# Preview the first rows, now including the Sentiment column
df.head()

Unnamed: 0,Character,Cleaned_Text,Sentiment
0,raiden_lore,hello salutation needed exalted status shall d...,0.214583
1,venti_lore,hello yawn refreshing sleep ah traveler meet d...,0.345455
2,zhongli_lore,hello new contract okay im still leave accompa...,0.562338


In [26]:
import yake

# Extractor built with YAKE's default settings; shared by every row below
kw = yake.KeywordExtractor()


def _leading_keywords(text):
    """Return the first element of each of the first five entries YAKE extracts from ``text``."""
    extracted = kw.extract_keywords(text)
    return [entry[0] for entry in extracted[:5]]


# Missing text becomes '' and is cast to str before extraction
keyword_input = df["Cleaned_Text"].fillna('').astype(str)
df["Keywords"] = keyword_input.apply(_leading_keywords)
df

Unnamed: 0,Character,Cleaned_Text,Sentiment,Keywords
0,raiden_lore,hello salutation needed exalted status shall d...,0.214583,"[demonstration required good, salutation neede..."
1,venti_lore,hello yawn refreshing sleep ah traveler meet d...,0.345455,"[yawn refreshing sleep, preposterous price fri..."
2,zhongli_lore,hello new contract okay im still leave accompa...,0.562338,"[violetgrass plant enjoys, plant enjoys moist,..."


In [32]:
# Save final results and embeddings (seeded numeric placeholder if real embeddings are missing)
import warnings
from pathlib import Path

import numpy as np

processed_dir = Path("../data/processed")
# Neither DataFrame.to_csv nor np.save creates missing parent folders
processed_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(processed_dir / "final_results.csv", index=False)

# If an `embeddings` variable was computed earlier in this kernel, use it.
# Otherwise fall back to a placeholder -- but say so loudly, and seed it so a
# Restart & Run All produces the same file every time.
if "embeddings" not in globals():
    warnings.warn(
        "No `embeddings` variable found; saving a seeded random placeholder. "
        "Results derived from embeddings.npy will NOT be meaningful."
    )
    embeddings = np.random.default_rng(0).random((len(df), 768))

# A leftover `embeddings` from another run may not line up with the current `df`;
# refuse to persist a file whose rows cannot be matched to the rows of `df`.
if len(embeddings) != len(df):
    raise ValueError(
        f"embeddings has {len(embeddings)} rows but df has {len(df)}; "
        "recompute the embeddings for the current data before saving"
    )

np.save(processed_dir / "embeddings.npy", embeddings)


In [35]:
from sklearn.metrics.pairwise import cosine_similarity
import warnings
import numpy as np
import pandas as pd

# Reload the cleaned text. This rebinds `df`, so columns added by earlier cells
# (Sentiment, Keywords) are not present on it after this point.
df = pd.read_csv("../data/processed/cleaned.csv")

# Load the embeddings saved earlier.
# NOTE(security): allow_pickle=True is kept so legacy object-array saves still load,
# but unpickling can execute arbitrary code -- only load an embeddings.npy that this
# project produced itself.
embeddings = np.load("../data/processed/embeddings.npy", allow_pickle=True)

# Ensure embeddings are a numeric 2D float array with one row per row of `df`.
# Anything else (unconvertible values, wrong rank, wrong row count) is treated as
# invalid. The original best-effort behaviour is preserved -- fall back to a
# placeholder -- but the fallback is now announced and seeded instead of being
# silent and different on every run.
try:
    embeddings = np.asarray(embeddings, dtype=float)
    if embeddings.ndim == 1:
        # A flat vector is interpreted as len(df) rows of equal width
        embeddings = embeddings.reshape(len(df), -1)
    if embeddings.ndim != 2 or embeddings.shape[0] != len(df):
        raise ValueError(
            f"expected a 2D array with {len(df)} rows, got shape {embeddings.shape}"
        )
except Exception as exc:
    warnings.warn(
        f"Saved embeddings are unusable ({exc}); falling back to a seeded random "
        "placeholder, so the similarity matrix is NOT meaningful."
    )
    embeddings = np.random.default_rng(0).random((len(df), 768))

# Recompute the pairwise cosine similarity between all rows
sim = cosine_similarity(embeddings)

# Label both axes with the Character column so the saved matrix is self-describing
characters = df["Character"]
sim_df = pd.DataFrame(sim, index=characters, columns=characters)

# Save, keeping the Character labels as the first CSV column
sim_df.to_csv("../data/processed/similarity_matrix.csv", index=True)