In [10]:
import pandas as pd

# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
data = pd.read_pickle("../data/initial_topics_labels/intial_topics_labels.pk")

# Collect every distinct value stored under the 'topic' key of the dicts found
# in the 'topics' column (each column entry is iterated as a sequence of dicts).
# sorted() replaces list(set(...)): iteration order of a set of strings is not
# stable between interpreter runs (hash randomization), and later cells pair
# embeddings / cluster labels with all_topics by position, so an unstable order
# makes the clustering irreproducible even with a fixed random_state.
# Assumes the topic values are mutually comparable (e.g. all strings) - TODO confirm.
all_topics = sorted({t_dict['topic'] for sentence_topics in data['topics'] for t_dict in sentence_topics})

print(len(all_topics))

3221


In [12]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np

# Load the SentenceTransformer model identified by this name
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Encode every entry of all_topics; later cells pair the results with
# all_topics by position, so the order of all_topics matters.
embeddings = model.encode(all_topics)

# Shape of a single embedding vector (the recorded output below is (384,))
print(embeddings[0].shape)

# Project the embeddings to 2 dimensions with t-SNE for visualization.
# perplexity and random_state are hard-coded here.
tsne = TSNE(n_components=2, perplexity=5, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)



(384,)


In [15]:
# Hand-written list of the 50 aggregated (broad) concept labels.
# The position of each label is significant: the co-occurrence cell converts
# labels to integers with aggregated_concepts.index(...), so reordering or
# editing this list changes every matrix index derived from it.
aggregated_concepts = [
    'Time & Events', 'Threat or Risk', 'Social setting/impact', 'Food and drink', 'Insult', 'Abstraction',
    'Information and knowledge', 'Health Medicine', 'Negative sentiments', 'Death', 'Fitness and Skills', 'Migration',
    'Animals', 'Internet and media', 'Race or ethnicity', 'Aggression and Violence', 'Identity',
    'Behavior & Attitude', 'Racial stereotypes', 'Jobs & Institutions', 'Culture',
    'Finance and wealth', 'Children', 'Personal tastes & Art', 'Relationships',
    'Addictions/Drugs', 'History', 'Claim/Opinion', 'Safety', 'Moral issues',
    'Crime', 'Gender', 'Personal traits/taste', 'Locations/Geography', 'Politics', 'Groups/Names',
    'Society & Culture', 'Emotions', 'Law and regulation', 'Language and communication', 'Brand/Product',
    'Life', 'Environment', 'Geopolitics/National Issue', 'War', 'Family', 'Technology/equipment',
    'Sexuality', 'Media/entertainment', 'Religion'
]

# Number of aggregated concepts; reused as the KMeans cluster count and as the
# side length of the co-occurrence matrix.
num_clusters = len(aggregated_concepts)

In [16]:
# Cluster the topic embeddings into num_clusters groups with KMeans
# (fixed random_state for repeatable initialization).
clustering_model = KMeans(n_clusters=num_clusters, n_init='auto',random_state=42)
clustering_model.fit(embeddings)
# One cluster label per row of `embeddings`, in the same order.
cluster_assignment = clustering_model.labels_

In [18]:
# Sanity check: number of cluster labels (recorded output matches the number of topics, 3221)
print(len(cluster_assignment))

3221


In [19]:
from collections import defaultdict

# Map each KMeans cluster id to the list of topics assigned to it.
# cluster_assignment and all_topics are aligned by position.
clustered_concepts = defaultdict(list)

for assigned_cluster, topic in zip(cluster_assignment, all_topics):
    clustered_concepts[assigned_cluster].append(topic)


In [20]:
def find_cluster_id(clustered_concepts, word):
    """Return the id of the first cluster whose topic list contains `word`.

    Args:
        clustered_concepts: mapping of cluster id -> collection of topics.
        word: the topic to look up.

    Returns:
        The matching cluster id (in the mapping's iteration order), or -1
        when no cluster contains `word`.
    """
    matching_ids = (
        cluster_id
        for cluster_id, topics in clustered_concepts.items()
        if word in topics
    )
    return next(matching_ids, -1)

In [21]:
# Load the table of sentences with their aggregated (broad) topics.
# NOTE: this rebinds `data`, which until now held the frame loaded from the
# pickle in the first cell; re-running earlier cells afterwards changes what
# `data` refers to.
# In the recorded preview the 'topics' column holds the string form of a list,
# and an 'Unnamed: 0' column carries what looks like a saved index.
data = pd.read_csv("../data/broad_topic/broad_topic_df.csv")
# Preview the first two rows
data.head(2)

Unnamed: 0,text,sentence_number,topics,label
0,social services have a love/hate relationship ...,0,"['Health Medicine', 'Jobs & Institutions']",nothate
1,Yes it's a joke and offensive isn't it? Why ar...,1,"['Gender', 'Social setting/impact']",nothate


In [41]:
from itertools import permutations
import numpy as np
import ast

# Co-occurrence counts between aggregated concepts: entry [a, b] is incremented
# once for every ordered pair of concepts listed together on the same row.
occurrencies = np.zeros((num_clusters, num_clusters), dtype=int)

for raw_topics in data['topics']:
    # Each cell holds the string form of a list of labels; parse it, then turn
    # every label into its position in aggregated_concepts.
    concept_ids = [
        aggregated_concepts.index(label)
        for label in ast.literal_eval(raw_topics)
    ]

    # Ordered pairs, so both [a, b] and [b, a] are counted.
    for row, col in permutations(concept_ids, 2):
        occurrencies[row, col] += 1

In [42]:
# Show the co-occurrence matrix (numpy abbreviates large arrays with "...")
print(occurrencies)

[[ 0 17 61 ... 10 18 19]
 [17  0 14 ... 21  9 17]
 [61 14  0 ... 16  7 34]
 ...
 [10 21 16 ...  0 12 22]
 [18  9  7 ... 12  0  8]
 [19 17 34 ... 22  8  0]]


In [60]:
# Highest co-occurrence count two categories may have and still share a group.
THRESHOLD = 12

def can_add_to_group(category_index, group, matrix, threshold):
    """Tell whether a category is compatible with every member of a group.

    Args:
        category_index: row index of the candidate category in `matrix`.
        group: iterable of category indices already in the group.
        matrix: 2-D co-occurrence matrix indexed as matrix[row, col].
        threshold: maximum allowed co-occurrence (inclusive).

    Returns:
        True when matrix[category_index, m] <= threshold for every member m
        of `group` (also True for an empty group), otherwise False.
    """
    for member in group:
        if not (matrix[category_index, member] <= threshold):
            return False
    return True

# Greedy grouping: visit the categories in index order and put each one into
# the first existing group where it is compatible with every member; if no
# group accepts it, it starts a new group of its own.
groups = []

for candidate in range(num_clusters):
    for members in groups:
        if can_add_to_group(candidate, members, occurrencies, THRESHOLD):
            members.append(candidate)
            break
    else:
        # No existing group accepted the candidate.
        groups.append([candidate])

# Translate the index groups back into category names.
grouped_categories = [
    [aggregated_concepts[idx] for idx in members]
    for members in groups
]

print(len(grouped_categories))

print("Gruppi di categorie:")
for group in grouped_categories:
    print(group)

12
Gruppi di categorie:
['Time & Events', 'Food and drink', 'Abstraction', 'Death', 'Migration', 'Animals', 'Internet and media', 'Identity', 'Culture', 'Children', 'Personal tastes & Art', 'Addictions/Drugs', 'Safety', 'Language and communication', 'Brand/Product', 'Life', 'War', 'Family', 'Technology/equipment']
['Threat or Risk', 'Insult', 'Negative sentiments', 'Fitness and Skills', 'Behavior & Attitude', 'Finance and wealth', 'History', 'Moral issues', 'Crime', 'Society & Culture', 'Emotions', 'Law and regulation', 'Environment', 'Geopolitics/National Issue', 'Media/entertainment']
['Social setting/impact', 'Information and knowledge', 'Aggression and Violence']
['Health Medicine', 'Relationships']
['Race or ethnicity', 'Sexuality']
['Racial stereotypes', 'Personal traits/taste']
['Jobs & Institutions', 'Locations/Geography']
['Claim/Opinion']
['Gender']
['Politics']
['Groups/Names']
['Religion']


In [62]:
# Map every category name to the index of the group it was placed in.
# NOTE(review): the name `dict` shadows the Python builtin for the rest of the
# kernel session; it is kept because other cells refer to this variable by
# that name. Consider renaming it everywhere at once.
dict = {
    category: group_index
    for group_index, members in enumerate(grouped_categories)
    for category in members
}

In [63]:
# Display the category -> group index mapping built in the previous cell
dict

{'Time & Events': 0,
 'Food and drink': 0,
 'Abstraction': 0,
 'Death': 0,
 'Migration': 0,
 'Animals': 0,
 'Internet and media': 0,
 'Identity': 0,
 'Culture': 0,
 'Children': 0,
 'Personal tastes & Art': 0,
 'Addictions/Drugs': 0,
 'Safety': 0,
 'Language and communication': 0,
 'Brand/Product': 0,
 'Life': 0,
 'War': 0,
 'Family': 0,
 'Technology/equipment': 0,
 'Threat or Risk': 1,
 'Insult': 1,
 'Negative sentiments': 1,
 'Fitness and Skills': 1,
 'Behavior & Attitude': 1,
 'Finance and wealth': 1,
 'History': 1,
 'Moral issues': 1,
 'Crime': 1,
 'Society & Culture': 1,
 'Emotions': 1,
 'Law and regulation': 1,
 'Environment': 1,
 'Geopolitics/National Issue': 1,
 'Media/entertainment': 1,
 'Social setting/impact': 2,
 'Information and knowledge': 2,
 'Aggression and Violence': 2,
 'Health Medicine': 3,
 'Relationships': 3,
 'Race or ethnicity': 4,
 'Sexuality': 4,
 'Racial stereotypes': 5,
 'Personal traits/taste': 5,
 'Jobs & Institutions': 6,
 'Locations/Geography': 6,
 'Claim/