# Modelos de Sentence Similarity

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import torch

## Cálculo de los embeddings de las frases de entrenamiento

In [None]:
# Training data; create_embeddings_dict reads its 'text' and 'label' columns.
train = pd.read_csv('train.csv')

In [None]:
def create_embeddings_dict(model, train=None):
    """Encode the training sentences and group the embeddings by label.

    Args:
        model: Encoder exposing ``encode(list_of_sentences)`` that returns one
            embedding per sentence (e.g. a ``SentenceTransformer``).
        train: DataFrame with ``text`` and ``label`` columns. When omitted,
            the module-level ``train`` DataFrame is looked up at call time.

    Returns:
        dict mapping every label present in ``train`` (ascending order) to
        the list of embeddings of its sentences, in row order. Empty input
        yields an empty dict.
    """
    if train is None:
        # Resolve the default lazily: a ``train=train`` default freezes the
        # DataFrame that existed when the function was defined and ignores
        # any later reload of the training data.
        train = globals()['train']

    labels = train['label'].tolist()
    if not labels:
        return {}

    # One batched call instead of one encode() call per row.
    embeddings = model.encode(train['text'].tolist())

    # Keys come from the data, so no label range is hard-coded and no label
    # ends up with an empty list (an empty list breaks cosine_similarity).
    embeddings_dict = {label: [] for label in sorted(set(labels))}
    for label, embedding in zip(labels, embeddings):
        embeddings_dict[label].append(embedding)

    return embeddings_dict
    

In [None]:
"""def classify_texts(model,embeddings_dict, classify=classify):
    results = pd.DataFrame(columns=['docid', 'text', 'label', 'relevance'])
    for _, row in classify.iterrows():
        docid = row['docid']
        text = row['text']

        # Calcula el embedding de la frase
        text_embedding = model.encode(text).reshape(1, -1)

        # Calcula la similitud con cada síntoma
        similarities = {label: cosine_similarity(text_embedding, embeddings).mean() for label, embeddings in embeddings_dict.items()}

        # Encuentra el síntoma con la mayor similitud
        label, relevance = max(similarities.items(), key=lambda x: x[1])

        rescaled_relevance = np.interp(relevance, (0, 1), (0, 10))

        # Añade los resultados al DataFrame
        new_data = [docid, text, label, rescaled_relevance]
        results.loc[len(results)] = new_data
        
    return results"""

Función por lotes (batches), por si no caben todos los embeddings en memoria.

In [None]:
"""def classify_texts(model, embeddings_dict, classify = classify, batch_size=10000):
    results = pd.DataFrame(columns=['docid', 'text', 'label', 'relevance'])
    # Para cada lote de frases en el DataFrame
    for i in range(0, len(classify), batch_size):
        batch = classify[i:i+batch_size]

        # Calcula los embeddings para las frases en el lote
        batch_embeddings = model.encode(batch['text'].tolist())

        # Para cada frase en el lote
        for docid, text, text_embedding in zip(batch['docid'], batch['text'], batch_embeddings):
            # Calcula la similitud con cada síntoma
            similarities = {label: cosine_similarity(text_embedding.reshape(1, -1), embeddings).mean() for label, embeddings in embeddings_dict.items()}

            # Encuentra el síntoma con la mayor similitud
            label, relevance = max(similarities.items(), key=lambda x: x[1])

            # Escala la relevancia para que esté en el rango de 0 a 10
            relevance = np.interp(relevance, (0, 1), (0, 10))

            # Añade los resultados al DataFrame
            new_data = [docid, text, label, relevance]
            results.loc[len(results)] = new_data

    return results"""

In [None]:
def classify_texts(model, embeddings_dict, classify):
    """Assign each text the label whose reference embeddings it resembles most.

    Args:
        model: Encoder exposing ``encode(list_of_sentences)``.
        embeddings_dict: Mapping from label to a collection of reference
            embeddings.
        classify: CSV path (anything ``pd.read_csv`` accepts) or an already
            loaded DataFrame; it must provide ``docid`` and ``text`` columns.

    Returns:
        DataFrame with columns ``docid``, ``text``, ``label`` and
        ``relevance``. ``relevance`` is the winning mean cosine similarity
        mapped from [0, 1] onto [0, 10]; values outside [0, 1] are clamped.

    Raises:
        ValueError: If no label in ``embeddings_dict`` has any embedding.
    """
    frame = classify if isinstance(classify, pd.DataFrame) else pd.read_csv(classify)
    if len(frame) == 0:
        # Nothing to score: skip the encoder and the similarity step, neither
        # of which has any row to work on.
        return pd.DataFrame(columns=['docid', 'text', 'label', 'relevance'])

    # A label without reference embeddings cannot be scored, so leave it out
    # instead of letting cosine_similarity fail on an empty input.
    references = {
        label: embeddings
        for label, embeddings in embeddings_dict.items()
        if len(embeddings) > 0
    }
    if not references:
        raise ValueError('embeddings_dict has no label with at least one embedding')

    # Missing texts are encoded as empty strings; the input frame itself is
    # left unmodified.
    texts = frame['text'].fillna('')
    text_embeddings = model.encode(texts.tolist())

    # similarities[i, j]: mean cosine similarity between text j and the
    # reference embeddings of the i-th label.
    similarities = np.array([
        cosine_similarity(text_embeddings, embeddings).mean(axis=1)
        for embeddings in references.values()
    ])

    # Pick, per text, the label with the highest mean similarity.
    best = np.argmax(similarities, axis=0)
    labels = np.array(list(references))[best]
    relevances = similarities.max(axis=0)

    # Rescale the similarity from [0, 1] to [0, 10].
    rescaled_relevances = np.interp(relevances, (0, 1), (0, 10))

    return pd.DataFrame({
        'docid': frame['docid'],
        'text': texts,
        'label': labels,
        'relevance': rescaled_relevances,
    })

# Modelos

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### all-MiniLM-L6-v2

In [None]:
# Load the pre-trained all-MiniLM-L6-v2 sentence-embedding model.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Move the model onto the selected device (GPU when available).
model.to(device)

In [None]:
# Encode the training sentences with this model, grouped by label.
embeddings_dict = create_embeddings_dict(model)

In [None]:
# NOTE(review): classify_texts is declared with three required parameters
# (model, embeddings_dict, classify) but only two are passed here, so this
# call raises TypeError until the CSV to classify is supplied — confirm the file.
# NOTE(review): the output name starts with 'results_ll-' while the model is
# 'all-…'; possibly a typo — left unchanged.
# Rank the predictions by descending relevance and write them to CSV.
(
    classify_texts(model, embeddings_dict)
    .sort_values(by='relevance', ascending=False)
    .to_csv('results_ll-MiniLM-L6-v2.csv', index=False)
)

### all-MiniLM-L12-v2

In [None]:
# Load the pre-trained all-MiniLM-L12-v2 sentence-embedding model.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

In [None]:
# Move the model onto the selected device (GPU when available).
model.to(device)

In [None]:
# Encode the training sentences with this model, grouped by label.
embeddings_dict = create_embeddings_dict(model)

In [None]:
# NOTE(review): classify_texts is declared with three required parameters
# (model, embeddings_dict, classify) but only two are passed here, so this
# call raises TypeError until the CSV to classify is supplied — confirm the file.
# NOTE(review): the output name starts with 'results_ll-' while the model is
# 'all-…'; possibly a typo — left unchanged.
# Rank the predictions by descending relevance and write them to CSV.
(
    classify_texts(model, embeddings_dict)
    .sort_values(by='relevance', ascending=False)
    .to_csv('results_ll-MiniLM-L12-v2.csv', index=False)
)

### all-mpnet-base-v2

In [None]:
# Load the pre-trained all-mpnet-base-v2 sentence-embedding model.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [None]:
# Move the model onto the selected device (GPU when available).
model.to(device)

In [None]:
# Encode the training sentences with this model, grouped by label.
embeddings_dict = create_embeddings_dict(model)

In [None]:
# NOTE(review): classify_texts is declared with three required parameters
# (model, embeddings_dict, classify) but only two are passed here, so this
# call raises TypeError until the CSV to classify is supplied — confirm the file.
# NOTE(review): the output name starts with 'results_ll-' while the model is
# 'all-…'; possibly a typo — left unchanged.
# Rank the predictions by descending relevance and write them to CSV.
(
    classify_texts(model, embeddings_dict)
    .sort_values(by='relevance', ascending=False)
    .to_csv('results_ll-mpnet-base-v2.csv', index=False)
)

# Métricas de Evaluación

In [None]:
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
def get_prediction(text, embeddings_dict, model=None):
    """Return the label whose reference embeddings are most similar to *text*.

    Args:
        text: Sentence to classify.
        embeddings_dict: Mapping from label to a collection of reference
            embeddings.
        model: Encoder exposing ``encode``. When omitted, the module-level
            ``model`` is looked up at call time, so existing two-argument
            calls keep working.

    Returns:
        The label with the highest mean cosine similarity to ``text``; on a
        tie, the label that comes first in ``embeddings_dict``.

    Raises:
        ValueError: If no label has any embedding to compare against.
    """
    if model is None:
        model = globals()['model']

    # cosine_similarity expects 2-D input: one row holding the query vector.
    query = model.encode(text).reshape(1, -1)

    # Labels without reference embeddings cannot be scored; skip them rather
    # than letting cosine_similarity fail on an empty input.
    similarities = {
        label: cosine_similarity(query, embeddings).mean()
        for label, embeddings in embeddings_dict.items()
        if len(embeddings) > 0
    }

    return max(similarities, key=similarities.get)

In [None]:
# Reload the training data and hold out a stratified 15 % validation sample;
# the fixed random_state makes the split reproducible.
train = pd.read_csv('train.csv')
_, X_val = train_test_split(
    train,
    test_size=0.15,
    random_state=42,
    stratify=train['label'],
)
np.random.seed(42)

# Expose the held-out rows as the 'test' split of a DatasetDict.
dict_dataset = DatasetDict({'test': Dataset.from_pandas(X_val)})

In [None]:
# Drop the columns the evaluation does not read (it only uses 'text' and 'label').
dict_dataset = dict_dataset.remove_columns(['__index_level_0__', 'length', 'docid'])

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Model names handed to SentenceTransformer in the evaluation loop.
models = [
    'sentence-transformers/all-MiniLM-L6-v2',
    'sentence-transformers/all-MiniLM-L12-v2',
    'sentence-transformers/all-mpnet-base-v2',
]

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score


In [None]:
# Reference embeddings must come only from rows outside the validation split.
# Built from the full training set, every validation sentence would be
# compared against its own embedding (stored under its true label), which
# inflates the scores reported below.
reference_rows = train.drop(index=X_val.index)

eval_texts = dict_dataset['test']['text']
y_true = dict_dataset['test']['label']

for model_name in models:
    # get_prediction reads the module-level name `model`, so the loaded
    # encoder is bound to it; the loop variable keeps the model name.
    model = SentenceTransformer(model_name)
    model.to(device)
    embeddings_dict = create_embeddings_dict(model, reference_rows)
    y_pred = [get_prediction(text, embeddings_dict) for text in eval_texts]

    # Label each report so the three outputs can be told apart.
    print(f'Model: {model_name}')
    print(classification_report(y_true, y_pred))
    micro_precision = precision_score(y_true, y_pred, average='micro')
    micro_recall = recall_score(y_true, y_pred, average='micro')
    micro_f1 = f1_score(y_true, y_pred, average='micro')
    print(f'Micro Precision: {micro_precision}')
    print(f'Micro Recall: {micro_recall}')
    print(f'Micro F1-Score: {micro_f1}')
    