## Disminuir dimensión de clases

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from collections import Counter
import re

# Read the spreadsheet and extract the 'Etiqueta' column as a plain Python list
df = pd.read_excel(io='data.xlsx')
etiquetas = df.loc[:, 'Etiqueta'].values.tolist()

# Define a tokenizer that lowercases the text and keeps only letters
def my_tokenizer(text):
    """Split *text* into lowercase, letters-only tokens.

    The text is lowercased and split on whitespace; every character that is
    not a letter (punctuation, digits, underscores) is then stripped from
    each piece. Letters are matched Unicode-aware, so accented characters
    such as "ñ" or "é" are kept rather than being cut out of the word, and
    pieces that end up empty after stripping are dropped instead of being
    returned as '' tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty lowercase tokens, in order of appearance.
    """
    # [\W\d_] is "anything that is not a word character, plus digits and
    # underscore" -- i.e. everything except letters.
    stripped = (re.sub(r'[\W\d_]+', '', piece) for piece in text.lower().split())
    return [token for token in stripped if token]

# Configure vectorizer. token_pattern=None because the custom tokenizer
# replaces the default regex; leaving the pattern set makes scikit-learn
# warn that it will not be used.
vectorizer = TfidfVectorizer(tokenizer=my_tokenizer, token_pattern=None, stop_words='english')

# Convert text to TF-IDF vectors
X = vectorizer.fit_transform(etiquetas)

# Apply Agglomerative Clustering algorithm on the dense form of the matrix
agglo = AgglomerativeClustering(n_clusters=60)
agglo.fit(X.toarray())

# Group the labels by cluster id in a single pass; the grouping is reused
# below for both the key phrases and the printout instead of rescanning
# every label once per cluster.
cluster_labels = agglo.labels_.tolist()
cluster_members = [[] for _ in range(agglo.n_clusters_)]
for label, etiqueta in zip(cluster_labels, etiquetas):
    cluster_members[label].append(etiqueta)

# Get the most common phrase for each cluster (on a tie, the phrase that
# appears first in the data wins)
cluster_phrases = [Counter(members).most_common(1)[0][0] for members in cluster_members]

# Assign the most common phrase for each cluster to a new column in the DataFrame
df['cluster_phrase'] = [cluster_phrases[label] for label in cluster_labels]

# Print the resulting clusters and their assigned phrases
for i, members in enumerate(cluster_members):
    print("Grupo ", i+1, ":")
    print(members)
    print("Frase clave del grupo: ", cluster_phrases[i])
    print()

# Save the DataFrame, including the new 'cluster_phrase' column
df.to_excel("cluster.xlsx")

