In [137]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer

In [296]:
df = pd.read_csv("chores.csv")

In [297]:
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
embeddings = model.encode(df['Chore'].values, convert_to_numpy = True)
embeddings_kw = model.encode(df['ChoreKeyword'].values, convert_to_numpy = True)
np.save("embeddings.npy", embeddings)
np.save("embeddings_kw.npy", embeddings_kw)

## Difficulty Level

In [None]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (norm(a) * norm(b))

difficult_examples = [
    "Lavoro noioso",
    "Lavoro complicato",
    "Lavoro disgustoso",
    "Lavoro che impiega molto tempo",
]
difficult_embeddings = model.encode(difficult_examples, convert_to_numpy=True)

for i, label in enumerate(difficult_examples):
    df[label] = [cosine_similarity(emb, difficult_embeddings[i]) for emb in embeddings_kw]

df.rename(columns={
    "Lavoro noioso": "Noioso",
    "Lavoro complicato": "Complicato",
    "Lavoro disgustoso": "Disgustoso",
    "Lavoro che impiega molto tempo": "Tempo"
}, inplace=True)

## KNN

In [250]:
# Combina gli embeddings, dando maggiore peso alla keyword (es. 0.7) rispetto alla descrizione completa (es. 0.3)
combined_embeddings = 0.7 * embeddings + 0.3 * embeddings_kw

test_indices = np.random.choice(len(df), size=3, replace=False)
train_indices = np.setdiff1d(np.arange(len(df)), test_indices)

k = 3
results = []
for test_idx in test_indices:
    test_keyword = df.iloc[test_idx]["ChoreKeyword"]
    true_difficulty = df.iloc[test_idx]["Difficulty"]
    test_embedding = combined_embeddings[test_idx]
    
    similarities = [cosine_similarity(test_embedding, combined_embeddings[train_idx]) for train_idx in train_indices]
    top_k_local_indices = np.argsort(similarities)[-k:][::-1]
    neighbor_indices = train_indices[top_k_local_indices]
    
    neighbor_details = [
        (df.iloc[n]["ChoreKeyword"], df.iloc[n]["Difficulty"], cosine_similarity(test_embedding, combined_embeddings[n]))
        for n in neighbor_indices
    ]
    predicted_difficulty = np.mean([df.iloc[n]["Difficulty"] for n in neighbor_indices])
    
    results.append((test_keyword, true_difficulty, predicted_difficulty, neighbor_details))

mae = np.mean([abs(true_diff - pred_diff) for _, true_diff, pred_diff, _ in results])

for test_keyword, true_diff, pred_diff, neighbors in results:
    neighbor_str = "; ".join([f"{n[0]} (Diff: {n[1]}, Sim: {n[2]:.4f})" for n in neighbors])
    print(f"ChoreKeyword: {test_keyword} - True Difficulty: {true_diff} - Predicted: {pred_diff:.2f}\n({neighbor_str})\n---")
print("Mean Absolute Error:", mae)

ChoreKeyword: Auto - True Difficulty: 2.5 - Predicted: 1.83
(Lavatrice (Diff: 1.5, Sim: 0.5439); Lavatrice (filtro) (Diff: 2.0, Sim: 0.5347); Lavandino (Diff: 2.0, Sim: 0.5303))
---
ChoreKeyword: Indifferenziato - True Difficulty: 1.0 - Predicted: 1.67
(Plastica (Diff: 1.0, Sim: 0.4617); Dispensa (Diff: 1.5, Sim: 0.4522); Bidoni differenziata (Diff: 2.5, Sim: 0.4202))
---
ChoreKeyword: Armadio - True Difficulty: 2.0 - Predicted: 1.83
(Dispensa (Diff: 1.5, Sim: 0.6059); Bidet (Diff: 2.0, Sim: 0.5801); Spazzatura (Diff: 2.0, Sim: 0.5623))
---
Mean Absolute Error: 0.5000000000000001


### Inference

In [292]:
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

nltk.download('stopwords')

stop_words = set(stopwords.words('italian'))
stemmer = SnowballStemmer("italian")
cleaning_verbs_stems = {"pulir", "lav", "spolver", "aspir", "sbrin", "stir", "mop"}

def extract_keyword(chore_str):
    tokens = re.findall(r'\w+', chore_str.lower())
    return " ".join(
        stemmer.stem(token)
        for token in tokens
        if token not in stop_words and token not in cleaning_verbs_stems
    )

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / np.sum(e_x)

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def predict_difficulty(chore_str, k=3, alpha=0.7):
    keyword_version = extract_keyword(chore_str)
    
    full_emb, keyword_emb = model.encode([chore_str, keyword_version], convert_to_numpy=True)
    input_emb = alpha * keyword_emb + (1 - alpha) * full_emb
    combined_embeddings = alpha * embeddings_kw + (1 - alpha) * embeddings

    sims = np.array([cosine_similarity(input_emb, emb) for emb in combined_embeddings])
    knn_indices = sims.argsort()[-k:][::-1]
    
    weights = softmax(sims[knn_indices])
    predicted_diff = np.dot(weights, df.iloc[knn_indices]["Difficulty"])
    
    neighbors = [(df.iloc[i]["Chore"], df.iloc[i]["Difficulty"], sims[i]) for i in knn_indices]
    
    return predicted_diff, neighbors

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\markh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [295]:
difficulty, neighbors = predict_difficulty("riparare i tubi", k=3, alpha=0.7)
print("Predicted difficulty:", difficulty)
print("Neighbors:", neighbors)

Predicted difficulty: 1.6608907282352448
Neighbors: [('Pulire il tostapane', 1.0, 0.5945617), ('Pulire il bidet', 2.0, 0.58650476), ('Pulire i tappeti', 2.0, 0.55053353)]


## UMAP

In [337]:
import umap
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Normalizzazione
scaler = StandardScaler()
embeddings_scaled = scaler.fit_transform(embeddings)

# UMAP migliorato
umap_reducer = umap.UMAP(n_components=5, n_neighbors=5, min_dist=0.1, random_state=42)
embeddings_umap = umap_reducer.fit_transform(embeddings_scaled)


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [347]:
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.7, min_samples=3)
clusters = dbscan.fit_predict(embeddings_umap)

In [348]:
import plotly.express as px
import pandas as pd

umap_reducer_3d = umap.UMAP(n_components=3, n_neighbors=10, min_dist=0.05, random_state=42)
embeddings_3d = umap_reducer_3d.fit_transform(embeddings_scaled)

df_3d = pd.DataFrame(embeddings_3d, columns=["UMAP_1", "UMAP_2", "UMAP_3"])
df_3d["Cluster"] = clusters
df_3d["Chore"] = df["Chore"]

# Grafico 3D interattivo
fig = px.scatter_3d(df_3d, x="UMAP_1", y="UMAP_2", z="UMAP_3", color=df_3d["Cluster"].astype(str), hover_data=["Chore"],
                     title="Clusterizzazione delle Chore in 3D", opacity=0.8)

# Salva e apri il plot in una finestra del browser
fig.write_html("plot.html")
import webbrowser
webbrowser.open("plot.html")


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



True