In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoConfig, BertPreTrainedModel, BertModel,AutoModel,LlamaPreTrainedModel, RobertaPreTrainedModel,AutoModelForSequenceClassification,RobertaModel, BertForSequenceClassification, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict


Este cuadernillo contiene:
- la exploración de los datasets
- la implementación de la clasificación de texto con la librería Hugging Face para los modelos de referencia (BERT y LLaMA)
- la comprobación de que con la librería se obtienen los mismos resultados que con el código de adición de contexto social

# **DATASET1**
Users from Twitter who tweeted about the topic 'Obama' during the period 8-10 May 2013;
The last 32005 tweets for each user

Cite: Pozzi, F.A., Maccagnola, D., Fersini, E., Messina, E. (2013). Enhance User-Level Sentiment Analysis on Microblogs with Approval Relations. In: Baldoni, M., Baroglio, C., Boella, G., Micalizio, R. (eds) AI*IA 2013: Advances in Artificial Intelligence. AI*IA 2013. Lecture Notes in Computer Science(), vol 8249. Springer, Cham. https://doi.org/10.1007/978-3-319-03524-6_12

In [None]:
# Load the morality-annotated MIND tweets and report both label distributions.
tweets = pd.read_pickle("../data/MIND/final_tweets_morality.pkl")
print(
    'MIND DATASET',
    'Moral labels',
    tweets['moral_label'].value_counts(),
    '------',
    'Sentiment labels',
    tweets['label'].value_counts(),
    sep='\n',
)

In [None]:
# Load the three MIND tables: users (with their polarity), tweets and the
# retweet network.
# NOTE(review): this rebinds `tweets`, so the pickle loaded in the previous
# cell is no longer reachable under that name.
users=pd.read_csv('../data/MIND/users.csv')


tweets=pd.read_csv('../data/MIND/tweets.csv')
# Earlier loading variants, kept for reference:
#tweets = pd.read_csv('../data/tweets.csv', quotechar="'")
#tweets= tweets[['id','id_author','id_tweet','tweet','polarity']]
#tweets.rename(columns={'tweet':'text', 'polarity':'label'}, inplace=True)

network=pd.read_csv('../data/MIND/rt_network.csv')
# Display the retweet edge list.
network

- Visualización de la red de usuarios
- Visualización de embeddings con modelo ajustado y el modelo pre-entrenado

In [None]:
# Tweet-level network: pair every tweet written by a retweeting user with
# every tweet written by the user they retweeted.
import pandas as pd

# Two views of the tweets table, keyed by the role the author plays in an edge.
user_tweets = tweets.rename(columns={'id_author': 'user', 'id_tweet': 'user_tweet'})
retweeted_tweets = tweets.rename(columns={'id_author': 'retweetedUser', 'id_tweet': 'retweeted_tweet'})

# Inner joins: first edge -> tweets of `user`, then -> tweets of `retweetedUser`.
user_tweets_expanded = network.merge(user_tweets, on='user', how='inner')
all_combinations = user_tweets_expanded.merge(retweeted_tweets, on='retweetedUser', how='inner')

# Edge list between tweets, carrying the weight of the user-level edge.
tweet_edges = (
    all_combinations
    .loc[:, ['user_tweet', 'retweeted_tweet', 'weight']]
    .rename(columns={'user_tweet': 'source', 'retweeted_tweet': 'target'})
)





In [None]:
# User-level retweet network, plus one colour per node derived from the
# user's polarity.
import matplotlib.pyplot as plt
import networkx as nx

# Build the graph. A MultiGraph keeps parallel edges when the same
# (user, retweetedUser) pair appears in more than one row.
#G = nx.from_pandas_edgelist(network, 'user', 'retweetedUser', edge_attr='weight')
G = nx.from_pandas_edgelist(network, 'user', 'retweetedUser', edge_attr='weight', create_using=nx.MultiGraph())

# One polarity per user (first occurrence wins, as before), stored in a dict so
# each node is resolved with a single lookup instead of a DataFrame scan.
polarity_by_user = (
    users.drop_duplicates('id_author')
    .set_index('id_author')['polarity']
    .to_dict()
)

# `color_map` must hold exactly one entry per node, in G.nodes() order,
# because nx.draw pairs colours with nodes positionally. Nodes that are not in
# `users` therefore get an explicit neutral colour instead of being skipped.
color_map = []
for node in G.nodes():
    if node not in polarity_by_user:
        color_map.append('gray')  # unknown polarity
    elif polarity_by_user[node] == 'neg':
        color_map.append('red')   # negative polarity
    else:
        color_map.append('blue')  # any other polarity is drawn as positive


In [None]:
# Force-directed (spring) drawing of the retweet network; the layout is seeded.
pos = nx.spring_layout(G, seed=42)
plt.figure(figsize=(10, 8))
nx.draw(G, pos, with_labels=True, node_size=600, node_color=color_map, font_size=7, font_color='black', font_weight='bold', edge_color='gray')

# Annotate only the edges whose weight exceeds 1, at the midpoint of the edge.
for src, dst, attrs in G.edges(data=True):
    weight = attrs['weight']
    if not weight > 1:
        continue
    (x1, y1), (x2, y2) = pos[src], pos[dst]
    plt.text((x1 + x2) / 2, (y1 + y2) / 2, str(weight), fontsize=8, ha='center', va='center', color='red')

plt.title("Retweet Network with User Polarities and Edge Weights")
plt.show()

In [None]:
# Report the size of the retweet graph (node count, then edge count).
print(
    f"Número de nodos: {G.number_of_nodes()}",
    f"Número de aristas: {G.number_of_edges()}",
    sep="\n",
)

In [None]:
# Same network drawn with a circular layout (nodes placed on a circle).
pos = nx.circular_layout(G)

# Draw the graph with the circular layout.
plt.figure(figsize=(10, 8))  # figure size
nx.draw(G, pos, with_labels=True, node_size=300, node_color=color_map, font_size=10, font_color='black', font_weight='bold', edge_color='gray')
# Write the weight at the midpoint of every edge whose weight is greater than 1.
for u, v, d in G.edges(data=True):
    if d['weight'] > 1:
        weight = d['weight']
        
        x1, y1 = pos[u]
        x2, y2 = pos[v]
        
        x = (x1 + x2) / 2
        y = (y1 + y2) / 2
        plt.text(x, y, str(weight), fontsize=8, ha='center', va='center', color='red')

plt.show()

In [None]:
# Same network drawn with the Kamada-Kawai layout.
pos = nx.kamada_kawai_layout(G)

# Draw the graph with the Kamada-Kawai layout.
plt.figure(figsize=(10, 8))  # figure size
nx.draw(G, pos, with_labels=True, node_size=300, node_color=color_map, font_size=10, font_color='black', font_weight='bold', edge_color='gray')
# Write the weight at the midpoint of every edge whose weight is greater than 1.
for u, v, d in G.edges(data=True):
    if d['weight'] > 1:
        weight = d['weight']
        
        x1, y1 = pos[u]
        x2, y2 = pos[v]
        
        x = (x1 + x2) / 2
        y = (y1 + y2) / 2
        plt.text(x, y, str(weight), fontsize=8, ha='center', va='center', color='red')
plt.title("Retweet Network with User Polarities (Kamada-Kawai Layout)")
plt.show()


## SVD

In [None]:
import pandas as pd

# Reload the MIND tables from disk so the SVD section starts from clean frames.
users = pd.read_csv('../data/MIND/users.csv')[["id_author", 'polarity']]
tweets = pd.read_csv('../data/MIND/tweets.csv')

# Retweet edges, each annotated with the polarity of the retweeting `user`
# (left join, so edges whose user is not in `users` keep a missing polarity).
network = (
    pd.read_csv('../data/MIND/rt_network.csv')[['id', 'user', 'retweetedUser', 'weight']]
    .merge(users, left_on='user', right_on='id_author', how='left')
    .drop(columns='id_author')
)
network


In [None]:
import networkx as nx
# SVD embedding comes from the scikit-network package.
from sknetwork.embedding import SVD
# Undirected weighted graph of the retweet relations and its adjacency matrix.
G = nx.from_pandas_edgelist(network, 'user', 'retweetedUser', edge_attr='weight')
A = nx.adjacency_matrix(G)
# Shallow embedding via singular value decomposition.
# Reference: Bonald, de Lara, Lutz, Charpentier, "Scikit-network: Graph Analysis in Python".
# NOTE(review): the two bare strings below are no-op expression statements used as notes.
''' Aplicar a la matriz de adyacencia para aprender representaciones vectoriales (embeddings) de los nodos'''


'''Usa la matriz de adyacencia de la red, factorización de la matriz (en 3 matrices: m ortogonal (vectores singulares izq), matriz diagonal (valores singulares) y matriz otogonal(vectores singulares der)) para conservar su estructura y propiedades
a la vez que reduce su dimensión. '''

# 32-dimensional embedding, one row per row of A.
svd = SVD(32)
embedding = svd.fit_transform(A)


# Row i of `embedding` is paired with the i-th node in G.nodes() order.
nodes = list(G.nodes())
df = pd.DataFrame({
    'node': nodes,
    'embedding': [embedding[i].tolist() for i in range(len(nodes))]})

df.head()

In [None]:
# Attach each author's SVD embedding to their tweets as `extra_data`.
# Left join: tweets whose author is not a graph node keep a missing value.
tweets = (
    tweets
    .merge(df, how='left', left_on='id_author', right_on='node')
    .drop(columns=['node'])
    .rename(columns={'embedding': 'extra_data'})
)
tweets.head()

# Save DataFrame as a pickle file
#tweets.to_pickle('svd_df.pkl')

#tweets = pd.read_pickle('../models/svd_df.pkl')
#tweets

In [None]:
import pandas as pd
# Reload the MIND tables again.
# NOTE(review): this rebinds `tweets`, dropping the `extra_data` column merged
# in the previous cell, and the relabelled graph built below is rebuilt by the
# DeepWalk cell further down. This cell looks like a leftover.
users=pd.read_csv('../data/MIND/users.csv')
users= users[["id_author",'polarity']]

tweets=pd.read_csv('../data/MIND/tweets.csv')

network=pd.read_csv('../data/MIND/rt_network.csv')
network= network[['id','user','retweetedUser','weight']]
# Left join: annotate every edge with the polarity of its `user`.
network = pd.merge(network, users, left_on='user', right_on='id_author', how='left')
network.drop('id_author', axis=1, inplace=True)
network

# Relabel nodes as consecutive integers 0..n-1, in G.nodes() order.
G = nx.from_pandas_edgelist(network, 'user', 'retweetedUser', edge_attr='weight')
mapping = {node: i for i, node in enumerate(G.nodes())}
G = nx.relabel_nodes(G, mapping)

## DeepWalk

In [None]:
import pandas as pd

# Fresh copies of the MIND tables for the DeepWalk section.
users = pd.read_csv('../data/MIND/users.csv')[["id_author", 'polarity']]
tweets = pd.read_csv('../data/MIND/tweets.csv')

# Edge list with the polarity of the retweeting `user` joined in (left join).
network = (
    pd.read_csv('../data/MIND/rt_network.csv')[['id', 'user', 'retweetedUser', 'weight']]
    .merge(users, left_on='user', right_on='id_author', how='left')
    .drop(columns='id_author')
)
network

In [None]:
from karateclub import DeepWalk
import networkx as nx

# Build the graph and relabel its nodes as 0..n-1, remembering the mapping so
# the embedding rows can be translated back to user ids afterwards.
G = nx.from_pandas_edgelist(network, 'user', 'retweetedUser', edge_attr='weight')
mapping = {node: index for index, node in enumerate(G.nodes())}
G = nx.relabel_nodes(G, mapping)

# 32-dimensional DeepWalk embedding, one row per relabelled node.
model = DeepWalk(dimensions=32, walk_length=30, workers=4)
model.fit(G)
embeddings = model.get_embedding()

# Translate row positions back to the original user ids.
reverse_mapping = {index: node for node, index in mapping.items()}
node_embeddings = {reverse_mapping[row]: vector for row, vector in enumerate(embeddings)}

# Attach the embedding of each author to their tweets (left join).
embeddings_df = pd.DataFrame(list(node_embeddings.items()), columns=['user', 'extra_data'])
tweets = (
    tweets
    .merge(embeddings_df, how='left', left_on='id_author', right_on='user')
    .drop(columns=['user'])
)
tweets

In [None]:
# Save DataFrame as a pickle file
#tweets.to_pickle('deepwalk_df.pkl')

# NOTE(review): this replaces the `tweets` frame computed above with a
# previously saved pickle; the commented-out save writes to a different
# location ('deepwalk_df.pkl' vs '../models/deepwalk_df.pkl').
tweets = pd.read_pickle('../models/deepwalk_df.pkl')
#tweets

## Node2Vec

In [None]:
import pandas as pd

# Fresh copies of the MIND tables for the Node2Vec section.
users = pd.read_csv('../data/MIND/users.csv')[["id_author", 'polarity']]
tweets = pd.read_csv('../data/MIND/tweets.csv')

# Edge list with the polarity of the retweeting `user` joined in (left join).
network = (
    pd.read_csv('../data/MIND/rt_network.csv')[['id', 'user', 'retweetedUser', 'weight']]
    .merge(users, left_on='user', right_on='id_author', how='left')
    .drop(columns='id_author')
)
network

In [None]:
import networkx as nx
from node2vec import Node2Vec

# Node2Vec
# Reference: A. Grover, J. Leskovec. "node2vec: Scalable Feature Learning for Networks", KDD 2016.
# NOTE(review): the bare string below is a no-op expression statement used as a note.
'''node2vec es un algoritmo que genera representaciones (embeddings) de nodos en un grafo, utiliza un enfoque basado 
en caminatas aleatorias (explorando nodos vecinos) para capturar la estructura del grafo'''


# Undirected weighted graph of the retweet relations.
G = nx.from_pandas_edgelist(network, 'user', 'retweetedUser', edge_attr='weight')

# 32-dimensional embeddings from 200 walks of length 30 per node.
node2vec = Node2Vec(G, dimensions=32, walk_length=30, num_walks=200, workers=1)  
model = node2vec.fit(window=10, min_count=1, batch_words=4)  

#node_embedding = model.wv['716543']
#print("Embedding for node 0:", node_embedding)

# Nodes most similar to '716543'
#similar_nodes = model.wv.similar_by_vector(model.wv['716543'], topn=5)
#print("Most similar nodes to node 716543:", similar_nodes)

# Map every original node label to its vector; vectors are looked up by str(node).
node_embeddings = {node: model.wv[str(node)] for node in list(G.nodes())}



In [None]:
def get_embedding(node):
    """Return the embedding stored for ``node``, or a zero vector if absent.

    Vectors are read from the notebook-level ``model`` and looked up under
    ``str(node)``. A missing key yields ``np.zeros(model.wv.vector_size)``.
    """
    key = str(node)
    try:
        vector = model.wv[key]
    except KeyError:
        vector = np.zeros(model.wv.vector_size)
    return vector

# Add one embedding per tweet, looked up by the tweet's author id.
tweets['extra_data'] = tweets['id_author'].apply(get_embedding)

# Show the first two rows as a quick check.
tweets.head(2)

In [None]:
# Save DataFrame as a pickle file
#tweets.to_pickle('node2vec_df.pkl')

# NOTE(review): this replaces the `tweets` frame computed above with a
# previously saved pickle from '../models'.
tweets = pd.read_pickle('../models/node2vec_df.pkl')
tweets

## TADW

Text Associated Deep Walk
Enfoque que utiliza atributos textuales de los nodos (en este caso One Hot Encoding) sin hacer uso de convoluciones y usando un enfoque matricial.

- Representar la estructura del grafo como una matriz M (basada en las conexiones del grafo)
- Descomponer M en dos partes:

    Una matriz W, que representa las relaciones entre los nodos según la estructura del grafo.
    Una matriz H, que se combina con T (la matriz de atributos textuales de los nodos) y captura la información textual.

Resolver un problema matemático que ajusta W y H al mismo tiempo, de modo que ambas matrices trabajen juntas para combinar estructura y texto.

In [None]:
import pandas as pd

# Fresh copies of the MIND tables for the TADW section.
users = pd.read_csv('../data/MIND/users.csv')[["id_author", 'polarity']]
tweets = pd.read_csv('../data/MIND/tweets.csv')

# Edge list with the polarity of the retweeting `user` joined in (left join).
network = (
    pd.read_csv('../data/MIND/rt_network.csv')[['id', 'user', 'retweetedUser', 'weight']]
    .merge(users, left_on='user', right_on='id_author', how='left')
    .drop(columns='id_author')
)
network

In [None]:
import pandas as pd
import networkx as nx

# Graph of retweet relations. Each edge stores its weight and the polarity
# that the left join attached to the row, i.e. the polarity of `user`.
G = nx.Graph()

for _, row in network.iterrows():
    G.add_edge(
        row['user'],                # Source node
        row['retweetedUser'],       # Target node
        weight=row['weight'],       # Edge attribute: weight
        polarity=row['polarity']    # Edge attribute: polarity of `user`
    )

# Node attribute: every user's OWN polarity, read from the users table.
# Copying the edge polarity onto both endpoints would hand the retweeted user
# the retweeter's label, and which endpoint "wins" would depend on edge order.
polarity_by_user = (
    users.drop_duplicates('id_author')
    .set_index('id_author')['polarity']
    .to_dict()
)
node_polarity = {node: polarity_by_user.get(node) for node in G.nodes()}

nx.set_node_attributes(G, node_polarity, name='polarity')

# One-hot encode the polarity. Nodes whose polarity is unknown (not in `users`,
# or a missing value) get an all-zero vector instead of raising KeyError.
polarity_map = {'neg': [1, 0], 'pos': [0, 1]}
features = {
    node: polarity_map.get(data['polarity'], [0, 0])
    for node, data in G.nodes(data=True)
}

# Feature matrix, rows in G.nodes() order.
feature_matrix = [features[node] for node in G.nodes()]


In [None]:
# TADW model
from tadw import *

# Fit TADW on the graph and the one-hot polarity features.
tadw = TADW(graph=G, features=feature_matrix, dim=80, lamb=0.2)
tadw.learn_embeddings()
embeddings = tadw.get_embeddings()

# Row i of `embeddings` is paired with the i-th node in G.nodes() order.
user_embeddings = {}
for i, node in enumerate(G.nodes()):
    user_embeddings[node] = embeddings[i]

# One embedding per tweet, by author id; authors outside the graph get None.
tweets['extra_data'] = tweets['id_author'].apply(lambda author_id: user_embeddings.get(author_id))


In [None]:
# Save DataFrame as a pickle file
tweets.to_pickle('../models/tweets_tadw_df2.pkl')

# NOTE(review): the file read here ('tadw_df2.pkl') is not the file written
# above ('tweets_tadw_df2.pkl'), so the freshly computed frame is replaced by
# whatever that other pickle contains. Confirm which path is intended.
tweets = pd.read_pickle('../models/tadw_df2.pkl')
tweets

# **DATASET2**


Tweets Health Care Reform
Cite: Mukhija, S. Twitter Polarity Classification with Label Propagation over Lexical Links and the Follower Graph.


In [None]:
# Load the morality-annotated HCR tweets and report both label distributions.
tweets = pd.read_pickle("../data/HCR/final_hcr_morality.pkl")
print(
    'HCR DATASET',
    'Moral labels',
    tweets['moral_label'].value_counts(),
    '------',
    'Sentiment labels',
    tweets['label'].value_counts(),
    sep='\n',
)

In [None]:
# Raw HCR file (author's note: 1617 tweets, 470 of them with target 'hcr').
tweets = pd.read_csv("../data/HCR/hcr.tweets.polarity.tsv", sep="\t")

# Keep the working columns and only the rows labelled positive / negative /
# neutral (author's note: 837 labelled tweets from 598 users).
tweets = (
    tweets[['tweet.id', 'user.id', 'author.nickname', 'content', 'sentiment', 'target']]
    .dropna(subset=['sentiment'])
)
tweets = tweets.loc[tweets['sentiment'].isin(['positive', 'negative', 'neutral'])]
tweets['sentiment'].unique()

In [None]:
# Follower relations between HCR users (columns `from` and `to`).
network = pd.read_csv("../data/HCR/hcr.relations.follower.inner.tsv", sep="\t")

# Author's note: the network covers 1603 users, more than those we have texts for.
# Prints the number of distinct source users and distinct target users.
print(len(network['from'].unique()), len(network['to'].unique()))

In [None]:
import networkx as nx

# Undirected graph built from the follower edge list.
network = pd.read_csv("../data/HCR/hcr.relations.follower.inner.tsv", sep="\t")
G = nx.from_pandas_edgelist(network, 'from', 'to')

import matplotlib.pyplot as plt
import networkx as nx

# NOTE(review): despite the name and the plot title, no filtering happens
# here; `subG` is the full graph G.
subG = G

# Use spring layout for visualization (seeded)
pos = nx.spring_layout(subG, seed=42)

plt.figure(figsize=(12, 12))
nx.draw_networkx_nodes(subG, pos, node_size=50, node_color='green', alpha=0.7)
nx.draw_networkx_edges(subG, pos, alpha=0.3, edge_color='gray')
plt.title("Follower Network (Dataset 2) - Users with Tweets")
plt.axis('off')
plt.show()

## SVD

In [None]:
import networkx as nx
from sknetwork.embedding import SVD

# Follower graph and its dense adjacency matrix.
network = pd.read_csv("../data/HCR/hcr.relations.follower.inner.tsv", sep="\t")
G = nx.from_pandas_edgelist(network, 'from', 'to')
A = nx.adjacency_matrix(G).todense()

# 32-dimensional SVD embedding of the adjacency matrix.
svd = SVD(32)
embedding = svd.fit_transform(A)

# Pair row i of the embedding with the i-th node in G.nodes() order.
nodes = list(G.nodes())
df = pd.DataFrame({
    'node': nodes,
    'embedding': [embedding[position].tolist() for position, _ in enumerate(nodes)],
})

# Number of distinct embedded nodes.
print(len(df['node'].unique()))

In [None]:
# Tweets whose label is positive, negative or neutral.
tweets = pd.read_csv("../data/HCR/hcr.tweets.polarity.tsv", sep="\t")
tweets = (
    tweets[['tweet.id', 'user.id', 'author.nickname', 'content', 'sentiment', 'target']]
    .dropna(subset=['sentiment'])
)
tweets = tweets.loc[tweets['sentiment'].isin(['positive', 'negative', 'neutral'])]
#print(len(tweets['user.id'].unique()))

# Inner join: keep only tweets whose author has an embedding in `df`.
df2 = (
    pd.merge(df, tweets, left_on='node', right_on='user.id', how='inner')
    .drop(columns=['node'])
    .rename(columns={'embedding': 'extra_data'})
)

# Number of distinct users that survive the join.
len(df2['user.id'].unique())


In [None]:
#df2.to_pickle('hcr_svd_df.pkl')

## DeepWalk

In [None]:
from karateclub import DeepWalk
import networkx as nx

# Labelled HCR tweets (positive / negative / neutral only).
tweets = pd.read_csv("../data/HCR/hcr.tweets.polarity.tsv", sep="\t")
tweets = (
    tweets[['tweet.id', 'user.id', 'author.nickname', 'content', 'sentiment', 'target']]
    .dropna(subset=['sentiment'])
)
tweets = tweets.loc[tweets['sentiment'].isin(['positive', 'negative', 'neutral'])]
#print(len(tweets['user.id'].unique()))

# Follower graph, relabelled 0..n-1 with the mapping kept for the way back.
network = pd.read_csv("../data/HCR/hcr.relations.follower.inner.tsv", sep="\t")
G = nx.from_pandas_edgelist(network, 'from', 'to')
mapping = {node: index for index, node in enumerate(G.nodes())}
G = nx.relabel_nodes(G, mapping)

# 32-dimensional DeepWalk embedding, one row per relabelled node.
model = DeepWalk(dimensions=32, walk_length=30, workers=4)
model.fit(G)
embeddings = model.get_embedding()

# Translate row positions back to the original user ids.
reverse_mapping = {index: node for node, index in mapping.items()}
node_embeddings = {reverse_mapping[row]: vector for row, vector in enumerate(embeddings)}
len(node_embeddings)

In [None]:
# Attach the DeepWalk embedding of each author to their tweets.
# Inner join: tweets whose author has no embedding are dropped.
df = pd.DataFrame(list(node_embeddings.items()), columns=['user', 'extra_data'])
tweets = tweets.merge(df, how='inner', left_on='user.id', right_on='user')
tweets.drop(columns=['user'], inplace=True)
print(len(tweets['user.id'].unique()))

#tweets.to_pickle('hcr_deepwalk_df.pkl')
# Users that have both tweets and an embedding.
# NOTE(review): the HCR TADW polarity cell further down reads `matching_users`,
# so this cell has to run before it.
matching_users= tweets['user.id'].unique()


In [None]:
#tweets.to_pickle('hcr_deepwalk_df.pkl')


## Node2Vec

In [None]:
import networkx as nx
from node2vec import Node2Vec

# Labelled HCR tweets; the two prints show the number of distinct users before
# and after restricting to positive / negative / neutral labels.
tweets = pd.read_csv("../data/HCR/hcr.tweets.polarity.tsv", sep="\t")
tweets=tweets[['tweet.id','user.id','author.nickname','content','sentiment','target']]
tweets = tweets.dropna(subset=['sentiment'])
print(len(tweets['user.id'].unique()))
tweets = tweets[tweets['sentiment'].isin(['positive', 'negative', 'neutral'])]
print(len(tweets['user.id'].unique()))


# Undirected follower graph.
network = pd.read_csv("../data/HCR/hcr.relations.follower.inner.tsv", sep="\t")
G = nx.from_pandas_edgelist(network, 'from', 'to')

# 32-dimensional node2vec embeddings from 200 walks of length 30 per node.
node2vec = Node2Vec(G, dimensions=32, walk_length=30, num_walks=200, workers=1)  
model = node2vec.fit(window=10, min_count=1, batch_words=4)  
# Vectors are looked up by str(node).
node_embeddings = {node: model.wv[str(node)] for node in list(G.nodes())}
print(len(node_embeddings.keys()))

In [None]:
import numpy as np
def get_embedding(node):
    """Return the embedding stored for ``node``, or NaN if it has none.

    Vectors are read from the notebook-level ``model`` and looked up under
    ``str(node)``. A missing key yields ``np.nan``.
    """
    key = str(node)
    try:
        vector = model.wv[key]
    except KeyError:
        return np.nan
    return vector
# Column for embeddings
# Look up one embedding per tweet by author id, then drop the tweets whose
# author has no embedding (get_embedding returned NaN for them).
tweets['extra_data'] = tweets['user.id'].apply(get_embedding)
tweets= tweets.dropna(subset=['extra_data'])

# Number of distinct users left.
print(len(tweets['user.id'].unique()))


In [None]:
# Persist the node2vec-augmented HCR tweets.
# NOTE(review): written to the working directory, not to '../models' like the MIND pickles.
tweets.to_pickle('hcr_node2vec_df.pkl')


## TADW

In [None]:
import pandas as pd

# Choose each user's polarity: the name of the column (among all columns after
# the first) holding the largest value in that user's row.
polarity = pd.read_csv("../data/HCR/hcr.user.polarity.hcr.tsv", sep="\t")
polarity["polarity"] = polarity.iloc[:, 1:].idxmax(axis=1)

# Restrict to users that have tweets and an embedding, then fold the
# 'irrelevant' and 'unsure' classes into 'neutral'. Written as one
# non-mutating chain so that nothing is assigned into a filtered slice
# (the in-place version is what triggers SettingWithCopyWarning).
# NOTE(review): `matching_users` comes from the HCR DeepWalk cell above.
polarity = (
    polarity.loc[polarity['user.id'].isin(matching_users), ['user.id', 'polarity']]
    .rename(columns={'polarity': 'user_polarity'})
    .assign(
        user_polarity=lambda frame: frame['user_polarity'].replace(
            {'irrelevant': 'neutral', 'unsure': 'neutral'}
        )
    )
)
polarity['user_polarity'].value_counts()

In [None]:
# Attach a polarity to both endpoints of every follower edge.
network = pd.read_csv("../data/HCR/hcr.relations.follower.inner.tsv", sep="\t")

# Two left joins against `polarity`: first for `from`, then for `to`. The
# helper key column 'user.id' is dropped after each join.
network2 = (
    network
    .merge(polarity, left_on='from', right_on='user.id', how='left')
    .rename(columns={'user_polarity': 'from_polarity'})
    .drop(columns=['user.id'])
    .merge(polarity, left_on='to', right_on='user.id', how='left')
    .rename(columns={'user_polarity': 'to_polarity'})
    .drop(columns=['user.id'])
)

# Stack both endpoint views into one (user.id, user_polarity) table, remove
# duplicates, and keep only users whose polarity is known.
from_side = network2[['from', 'from_polarity']].rename(columns={'from': 'user.id', 'from_polarity': 'user_polarity'})
to_side = network2[['to', 'to_polarity']].rename(columns={'to': 'user.id', 'to_polarity': 'user_polarity'})
unique_tuples = (
    pd.concat([from_side, to_side])
    .drop_duplicates()
    .dropna(subset=['user_polarity'])
)
unique_tuples

In [None]:
# Add the author's polarity to every labelled tweet; authors without a known
# polarity are treated as 'neutral'.
tweets = pd.read_csv("../data/HCR/hcr.tweets.polarity.tsv", sep="\t")
tweets = (
    tweets[['tweet.id', 'user.id', 'author.nickname', 'content', 'sentiment', 'target']]
    .dropna(subset=['sentiment'])
)
tweets = tweets.loc[tweets['sentiment'].isin(['positive', 'negative', 'neutral'])]

# Left join keeps every tweet; missing polarities become 'neutral'.
tweets = (
    pd.merge(tweets, unique_tuples, left_on='user.id', right_on='user.id', how='left')
    .assign(user_polarity=lambda frame: frame['user_polarity'].fillna('neutral'))
)

print(len(tweets['user.id'].unique()), len(tweets['tweet.id'].unique()))
tweets['sentiment'].unique()

In [None]:
import networkx as nx

# Undirected follower graph, built edge by edge from `network`.
G = nx.Graph()

for _, row in network.iterrows():
    G.add_edge(row['from'], row['to'])

# Assign a polarity to every node: the value found in `tweets` for that user,
# or 'neutral' when the user has no row there.
user_polarity_dict = tweets.set_index("user.id")["user_polarity"].to_dict()
node_polarity = {node: user_polarity_dict.get(node, 'neutral') for node in G.nodes()}
nx.set_node_attributes(G, node_polarity, name="user_polarity")

# One-hot encode the polarity attribute (negative / positive / neutral).
polarity_map = {'negative': [1, 0, 0], 'positive': [0, 1, 0], 'neutral': [0, 0, 1]}
features = {node: polarity_map[data['user_polarity']] for node, data in G.nodes(data=True)}

# Feature matrix, rows in G.nodes() order.
feature_matrix = [features[node] for node in G.nodes()]
feature_matrix


In [None]:
# TADW model (author's note: requires `pip install networkx==2.7`)
from tadw import *

# Fit TADW on the follower graph and the one-hot polarity features.
tadw = TADW(graph=G, features=feature_matrix, dim=80, lamb=0.2)
tadw.learn_embeddings()
embeddings = tadw.get_embeddings()

# Row i of `embeddings` is paired with the i-th node in G.nodes() order.
user_embeddings = {}
for i, node in enumerate(G.nodes()):
    user_embeddings[node] = embeddings[i]

# One embedding per tweet, by author id; authors outside the graph get None.
tweets['extra_data'] = tweets['user.id'].apply(lambda author_id: user_embeddings.get(author_id))
tweets

In [None]:
# Drop the tweets whose author received no TADW embedding.
tweets= tweets.dropna(subset=['extra_data'])

# Number of distinct users left.
print(len(tweets['user.id'].unique()))

In [None]:
#tweets.to_pickle('hcr_tadw_df.pkl')

# **DATASET3**

Tweets from the 2008 Presidential Debates (Obama–McCain)

Cite: David A. Shamma, Lyndon Kennedy, and Elizabeth F. Churchill. 2009. Tweet the debates: understanding community annotation of uncollected sources. In Proceedings of the first SIGMM workshop on Social media (WSM '09). Association for Computing Machinery, New York, NY, USA, 3–10. https://doi.org/10.1145/1631144.1631148

In [None]:
# Load the morality-annotated OMC tweets and report both label distributions.
tweets = pd.read_pickle("../data/OMC/final_omc_morality.pkl")
print(
    'OMC DATASET',
    'Moral labels',
    tweets['moral_label'].value_counts(),
    '------',
    'Sentiment labels',
    tweets['label'].value_counts(),
    sep='\n',
)

In [None]:
import pandas as pd

# Majority-vote annotations (author's note: 2678 annotated texts).
tweets = pd.read_csv("../data/OMC/shamma.content.tweets.polarity.majority.tsv", sep="\t")

# Recovered tweet texts (author's note: 2518, fewer than the annotations).
texts = pd.read_csv("../data/OMC/shamma.content.tweets.recovered.merged.tsv", sep="\t")[
    ['tweet.id', 'user.id', 'content', 'dataset.author.name']
]

# Left join texts -> annotations, then drop texts without an aggregated label
# (author's note: ties have ratings but no majority label; 2071 tweets and
# 791 users remain).
tweets = (
    pd.merge(texts, tweets, left_on='tweet.id', right_on='tweet.id', how='left')
    .dropna(subset=['majority_polarity'])
)
print(len(tweets['user.id'].unique()), len(tweets['tweet.id'].unique()))
print(tweets['majority_polarity'].value_counts())

In [None]:
# Follower relations between OMC users (columns `from` and `to`).
network = pd.read_csv("../data/OMC/shamma.relations.user.user.follower.inner.tsv", sep="\t")

# Author's note: the network covers 635 users; check for which of them we have texts.
# Prints the number of distinct source users and distinct target users.
print(len(network['from'].unique()), len(network['to'].unique()))

In [None]:

# Users that appear both in the follower graph and among the tweet authors.
network = pd.read_csv("../data/OMC/shamma.relations.user.user.follower.inner.tsv", sep="\t")
G = nx.from_pandas_edgelist(network, 'from', 'to')

nodos = set(G.nodes)
usuarios = set(tweets['user.id'].unique())
matching_users = nodos & usuarios

# Tweets written by those users (author's note: 604 users).
filtered_tweets = tweets.loc[tweets['user.id'].isin(matching_users)]

print(
    f"Total de nodos en el grafo: {len(nodos)}",
    f"Usuarios en común (matching_users): {len(matching_users)}",
    f"Filas en filtered_df: {len(filtered_tweets)}",
    sep="\n",
)


In [None]:
import matplotlib.pyplot as plt
import networkx as nx

# Independent copy of the subgraph induced by the users that also have tweets.
subG = G.subgraph(matching_users).copy()

# Seeded Fruchterman-Reingold layout.
pos = nx.fruchterman_reingold_layout(subG, seed=42)

plt.figure(figsize=(12, 12))

# Edges are drawn first, then the nodes.
nx.draw_networkx_edges(subG, pos, alpha=0.2, edge_color='gray', width=1)
nx.draw_networkx_nodes(subG, pos, node_size=80, node_color='skyblue', alpha=0.7)

# No node labels, to keep the figure clean.

plt.title("User network (Dataset 3) - Fruchterman-Reingold layout")
plt.axis('off')
plt.show()


## SVD

In [None]:
import networkx as nx
from sknetwork.embedding import SVD

# Follower graph and its dense adjacency matrix.
network = pd.read_csv("../data/OMC/shamma.relations.user.user.follower.inner.tsv", sep="\t")
G = nx.from_pandas_edgelist(network, 'from', 'to')
A = nx.adjacency_matrix(G).todense()

# 32-dimensional SVD embedding of the adjacency matrix.
svd = SVD(32)
embedding = svd.fit_transform(A)

# Pair row i of the embedding with the i-th node in G.nodes() order.
nodes = list(G.nodes())
df = pd.DataFrame({
    'node': nodes,
    'embedding': [embedding[position].tolist() for position, _ in enumerate(nodes)],
})


In [None]:
# Labelled OMC tweets: texts joined with their majority annotation, keeping
# only the rows that have an aggregated label.
tweets = pd.read_csv("../data/OMC/shamma.content.tweets.polarity.majority.tsv", sep="\t")

texts = pd.read_csv("../data/OMC/shamma.content.tweets.recovered.merged.tsv", sep="\t")[
    ['tweet.id', 'user.id', 'content', 'dataset.author.name']
]
tweets = (
    pd.merge(texts, tweets, left_on='tweet.id', right_on='tweet.id', how='left')
    .dropna(subset=['majority_polarity'])
)

# Inner join: keep only tweets whose author has an embedding in `df`.
df2 = (
    pd.merge(df, tweets, left_on='node', right_on='user.id', how='inner')
    .drop(columns=['node'])
    .rename(columns={'embedding': 'extra_data'})
)
len(df2['user.id'].unique())

In [None]:
import pandas as pd
# NOTE(review): this replaces the `df2` computed in the previous cell with a
# previously saved pickle; no visible cell writes 'omc_svd_df.pkl'.
df2 = pd.read_pickle('omc_svd_df.pkl')
df2

In [None]:
#visualizar red

#fig, ax = plt.subplots(figsize=(15, 9))
#ax.axis("off")
#plot_options = {"node_size": 10, "with_labels": False, "width": 0.15}
#nx.draw_networkx(G, pos=nx.random_layout(G), ax=ax, **plot_options)

#pos = nx.spring_layout(G, iterations=15, seed=1721)
#fig, ax = plt.subplots(figsize=(15, 9))
#ax.axis("off")
#nx.draw_networkx(G, pos=pos, ax=ax, **plot_options)


## Deepwalk

In [None]:
from karateclub import DeepWalk
import networkx as nx
import pandas as pd

# Labelled OMC tweets: texts joined with their majority annotation.
tweets = pd.read_csv("../data/OMC/shamma.content.tweets.polarity.majority.tsv", sep="\t")

texts = pd.read_csv("../data/OMC/shamma.content.tweets.recovered.merged.tsv", sep="\t")[
    ['tweet.id', 'user.id', 'content', 'dataset.author.name']
]
tweets = (
    pd.merge(texts, tweets, left_on='tweet.id', right_on='tweet.id', how='left')
    .dropna(subset=['majority_polarity'])
)
print(len(tweets['user.id'].unique()), len(tweets['tweet.id'].unique()))

# Follower graph, relabelled 0..n-1 with the mapping kept for the way back.
network = pd.read_csv("../data/OMC/shamma.relations.user.user.follower.inner.tsv", sep="\t")
G = nx.from_pandas_edgelist(network, 'from', 'to')
mapping = {node: index for index, node in enumerate(G.nodes())}
G = nx.relabel_nodes(G, mapping)

# 32-dimensional DeepWalk embedding, one row per relabelled node.
model = DeepWalk(dimensions=32, walk_length=30, workers=4)
model.fit(G)
embeddings = model.get_embedding()

# Translate row positions back to the original user ids.
reverse_mapping = {index: node for node, index in mapping.items()}
node_embeddings = {reverse_mapping[row]: vector for row, vector in enumerate(embeddings)}

In [None]:
# Attach the DeepWalk embedding of each author to their tweets.
# Inner join: tweets whose author has no embedding are dropped.
df = pd.DataFrame(list(node_embeddings.items()), columns=['user', 'extra_data'])
tweets = tweets.merge(df, how='inner', left_on='user.id', right_on='user')
tweets.drop(columns=['user'], inplace=True)
# Number of distinct users left.
print(len(tweets['user.id'].unique()))


In [None]:
#tweets.to_pickle('omc_deepwalk_df.pkl')

## Node2vec

In [None]:
import networkx as nx
from node2vec import Node2Vec

# Labelled OMC tweets: texts left-joined with their majority annotation,
# keeping only the rows that have an aggregated label.
tweets = pd.read_csv("../data/OMC/shamma.content.tweets.polarity.majority.tsv", sep="\t")

texts = pd.read_csv("../data/OMC/shamma.content.tweets.recovered.merged.tsv", sep="\t")
texts =texts[['tweet.id','user.id','content','dataset.author.name']]
tweets=pd.merge(texts,tweets, left_on='tweet.id', right_on='tweet.id', how='left')
tweets= tweets.dropna(subset=['majority_polarity'])
print(len(tweets['user.id'].unique()), len(tweets['tweet.id'].unique()))

# Undirected follower graph.
network = pd.read_csv("../data/OMC/shamma.relations.user.user.follower.inner.tsv", sep="\t")
G = nx.from_pandas_edgelist(network, 'from', 'to')

# 32-dimensional node2vec embeddings from 200 walks of length 30 per node.
node2vec = Node2Vec(G, dimensions=32, walk_length=30, num_walks=200, workers=1)  
model = node2vec.fit(window=10, min_count=1, batch_words=4)  
# Vectors are looked up by str(node).
node_embeddings = {node: model.wv[str(node)] for node in list(G.nodes())}


In [None]:
import numpy as np
def get_embedding(node):
    """Return the embedding stored for ``node``, or NaN if it has none.

    Vectors are read from the notebook-level ``model`` and looked up under
    ``str(node)``. A missing key yields ``np.nan``.
    """
    key = str(node)
    try:
        vector = model.wv[key]
    except KeyError:
        return np.nan
    return vector
# Column for embeddings
# Look up one embedding per tweet by author id, then drop the tweets whose
# author has no embedding (get_embedding returned NaN for them).
tweets['extra_data'] = tweets['user.id'].apply(get_embedding)
tweets= tweets.dropna(subset=['extra_data'])

# Number of distinct users left.
print(len(tweets['user.id'].unique()))


In [None]:
#tweets.to_pickle('omc_node2vec_df.pkl')

## TADW

In [None]:
import pandas as pd

# Per-user majority polarity, restricted to the users that are both in the
# follower graph and among the tweet authors (`matching_users`).
polarity = pd.read_csv("../data/OMC/shamma.users.polarity.majority.tsv", sep="\t")
polarity = (
    polarity.loc[polarity['user.id'].isin(matching_users)]
    .rename(columns={'polarity': 'user_polarity'})
)


In [None]:
# Attach a polarity to both endpoints of every follower edge.
network = pd.read_csv("../data/OMC/shamma.relations.user.user.follower.inner.tsv", sep="\t")

# Two left joins against `polarity`: first for `from`, then for `to`. The
# helper key column 'user.id' is dropped after each join.
network2 = (
    network
    .merge(polarity, left_on='from', right_on='user.id', how='left')
    .rename(columns={'user_polarity': 'from_polarity'})
    .drop(columns=['user.id'])
    .merge(polarity, left_on='to', right_on='user.id', how='left')
    .rename(columns={'user_polarity': 'to_polarity'})
    .drop(columns=['user.id'])
)

# Stack both endpoint views into one (user.id, user_polarity) table, remove
# duplicates, and keep only users whose polarity is known.
from_side = network2[['from', 'from_polarity']].rename(columns={'from': 'user.id', 'from_polarity': 'user_polarity'})
to_side = network2[['to', 'to_polarity']].rename(columns={'to': 'user.id', 'to_polarity': 'user_polarity'})
unique_tuples = (
    pd.concat([from_side, to_side])
    .drop_duplicates()
    .dropna(subset=['user_polarity'])
)

In [None]:
# Labelled tweets written by users we have network information for.
tweets = pd.read_csv("../data/OMC/shamma.content.tweets.polarity.majority.tsv", sep="\t")
texts = pd.read_csv("../data/OMC/shamma.content.tweets.recovered.merged.tsv", sep="\t")[
    ['tweet.id', 'user.id', 'content', 'dataset.author.name']
]
tweets = (
    pd.merge(texts, tweets, left_on='tweet.id', right_on='tweet.id', how='left')
    .dropna(subset=['majority_polarity'])
)
tweets = tweets.loc[tweets['user.id'].isin(matching_users)]

# Left join with the known user polarities; users without one become 'neutral'.
tweets = (
    pd.merge(tweets, unique_tuples, left_on='user.id', right_on='user.id', how='left')
    .assign(user_polarity=lambda frame: frame['user_polarity'].fillna('neutral'))
)
print(len(tweets['user.id'].unique()), len(tweets['tweet.id'].unique()))


In [None]:
network = pd.read_csv("../data/OMC/shamma.relations.user.user.follower.inner.tsv", sep="\t")

# Undirected follower graph; edges are inserted in file order.
G = nx.Graph()
G.add_edges_from((row['from'], row['to']) for _, row in network.iterrows())

# Tag every node with its user's polarity; nodes without a tweet row default to 'neutral'.
user_polarity_dict = tweets.set_index("user.id")["user_polarity"].to_dict()
node_polarity = {}
for node in G.nodes():
    node_polarity[node] = user_polarity_dict.get(node, 'neutral')
nx.set_node_attributes(G, node_polarity, name="user_polarity")

# One-hot encode the polarity of each node, in G.nodes() order.
polarity_map = {'neg': [1, 0, 0], 'pos': [0, 1, 0], 'neutral': [0, 0, 1]}
features = {node: polarity_map[node_polarity[node]] for node in G.nodes()}

feature_matrix = list(features.values())

feature_matrix[:5]

In [None]:
# TADW model: fit it on graph G together with the one-hot polarity feature matrix.
from tadw import *
tadw = TADW(graph=G, features=feature_matrix, dim=80, lamb=0.2)

tadw.learn_embeddings()

embeddings = tadw.get_embeddings()

# Pair each node with the embedding found at the same position of G.nodes()
# (assumes get_embeddings() is index-aligned with that node order — TODO confirm in the tadw module).
user_embeddings = {node: embeddings[i] for i, node in enumerate(G.nodes())}

# Authors that are not nodes of the graph receive None as their extra feature.
tweets['extra_data'] = tweets['user.id'].apply(lambda x: user_embeddings.get(x, None))
tweets

In [None]:
#tweets.to_pickle('omc_tadw_df.pkl')

# **Gráficas**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Shared seaborn look for both panels.
sns.set(style="whitegrid")


def _add_percent(frame):
    """Add a 'Percent' column: each Count as a share of its Dataset total."""
    dataset_totals = frame.groupby('Dataset')['Count'].transform('sum')
    frame['Percent'] = 100 * frame['Count'] / dataset_totals
    return frame


# Sentiment label counts per dataset.
sentiment_data = {
    'Dataset': ['RT-MIND', 'RT-MIND', 'HCR', 'HCR', 'HCR', 'OMC', 'OMC'],
    'Label': ['negative', 'positive', 'negative', 'positive', 'neutral', 'negative', 'positive'],
    'Count': [89, 70, 371, 149, 132, 1051, 716]
}
df_sentiment = _add_percent(pd.DataFrame(sentiment_data))

# Moral foundation label counts per dataset (NM = non-moral).
morality_data = {
    'Dataset': ['RT-MIND']*6 + ['HCR']*6 + ['OMC']*6,
    'Label': ['Care', 'Fairness', 'Authority', 'Loyalty', 'Purity', 'NM']*3,
    'Count': [11, 15, 26, 9, 1, 97, 87, 73, 63, 48, 8, 370, 116, 74, 116, 79, 13, 1392]
}
df_morality = _add_percent(pd.DataFrame(morality_data))

# One panel per label family, side by side.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
panels = [
    (df_sentiment, 'Normalized Sentiment Label Distribution (%)', 'Sentiment'),
    (df_morality, 'Normalized Moral Foundation Label Distribution (%)', 'Moral Label'),
]
for panel_ax, (panel_df, panel_title, legend_title) in zip(axes, panels):
    sns.barplot(data=panel_df, x='Dataset', y='Percent', hue='Label', ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel('Percentage')
    panel_ax.legend(title=legend_title)

plt.tight_layout()
# The same figure is exported twice: vector PDF and raster PNG.
for out_path in ('normalized_sentiment_morality.pdf', 'class_distribution.png'):
    plt.savefig(out_path, bbox_inches='tight')
plt.show()


In [None]:
# Stand-alone version of the moral-foundation panel, with percentages recomputed per dataset.
df_morality_pct = df_morality.assign(
    Percent=lambda frame: frame.groupby('Dataset')['Count'].transform(
        lambda counts: 100 * counts / counts.sum()
    )
)

fig_morality, ax_morality = plt.subplots(figsize=(8, 6))
sns.barplot(data=df_morality_pct, x='Dataset', y='Percent', hue='Label', ax=ax_morality)
ax_morality.set_title('Normalized Moral Foundation Label Distribution (%)')
ax_morality.legend(title='Moral Label')
ax_morality.set_ylabel('Percentage')
plt.tight_layout()
plt.savefig('morality_distribution_normalized.pdf', bbox_inches='tight')
plt.show()


## GRAFOS

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

fig, axs = plt.subplots(3, 1, figsize=(12, 36))


def _draw_user_graph(graph, layout, ax, color, title):
    """Render one network panel: coloured nodes, grey edges, a title and no axis frame."""
    nx.draw_networkx_nodes(graph, layout, node_size=20, node_color=color, alpha=0.7, ax=ax)
    nx.draw_networkx_edges(graph, layout, alpha=0.3, edge_color='gray', ax=ax)
    ax.set_title(title)
    ax.axis('off')


# Dataset 1: weighted retweet edges, with the polarity of the retweeting user merged in.
users = pd.read_csv('../data/MIND/users.csv')[["id_author", 'polarity']]
network1 = (
    pd.read_csv('../data/MIND/rt_network.csv')[['id', 'user', 'retweetedUser', 'weight']]
    .merge(users, left_on='user', right_on='id_author', how='left')
    .drop(columns='id_author')
)
G1 = nx.from_pandas_edgelist(network1, 'user', 'retweetedUser', edge_attr='weight')
pos1 = nx.spring_layout(G1, seed=42)
_draw_user_graph(G1, pos1, axs[0], 'blue', "Retweet Network (Dataset 1)")

# Dataset 2: HCR follower edges.
network2 = pd.read_csv("../data/HCR/hcr.relations.follower.inner.tsv", sep="\t")
G2 = nx.from_pandas_edgelist(network2, 'from', 'to')
pos2 = nx.spring_layout(G2, seed=42)
_draw_user_graph(G2, pos2, axs[1], 'green', "Follower Network (Dataset 2)")

# Dataset 3: OMC follower edges.
network3 = pd.read_csv("../data/OMC/shamma.relations.user.user.follower.inner.tsv", sep="\t")
G3 = nx.from_pandas_edgelist(network3, 'from', 'to')
pos3 = nx.spring_layout(G3, seed=42)
_draw_user_graph(G3, pos3, axs[2], 'red', "Follower Network (Dataset 3)")

plt.tight_layout()
plt.savefig('user_graphs.pdf', bbox_inches='tight')
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

fig, axs = plt.subplots(3, 1, figsize=(8, 18))  # more compact

# --- Load the three edge lists -------------------------------------------
# Dataset 1: weighted retweet edges, with the polarity of the retweeting user merged in.
users = pd.read_csv('../data/MIND/users.csv')
users = users[["id_author", 'polarity']]
network1 = pd.read_csv('../data/MIND/rt_network.csv')
network1 = network1[['id', 'user', 'retweetedUser', 'weight']]
network1 = network1.merge(users, left_on='user', right_on='id_author', how='left')
network1 = network1.drop(columns='id_author')
# Dataset 2 and Dataset 3: follower edges.
network2 = pd.read_csv("../data/HCR/hcr.relations.follower.inner.tsv", sep="\t")
network3 = pd.read_csv("../data/OMC/shamma.relations.user.user.follower.inner.tsv", sep="\t")

# --- Build graphs and deterministic layouts -------------------------------
G1 = nx.from_pandas_edgelist(network1, 'user', 'retweetedUser', edge_attr='weight')
G2 = nx.from_pandas_edgelist(network2, 'from', 'to')
G3 = nx.from_pandas_edgelist(network3, 'from', 'to')
pos1, pos2, pos3 = (nx.spring_layout(graph, seed=42) for graph in (G1, G2, G3))

# --- Draw one panel per dataset -------------------------------------------
compact_panels = [
    (G1, pos1, 'blue', "Retweet Network (Dataset 1)"),
    (G2, pos2, 'green', "Follower Network (Dataset 2)"),
    (G3, pos3, 'red', "Follower Network (Dataset 3)"),
]
for panel_ax, (graph, layout, color, title) in zip(axs, compact_panels):
    nx.draw_networkx_nodes(graph, layout, node_size=10, node_color=color, alpha=0.7, ax=panel_ax)
    nx.draw_networkx_edges(graph, layout, alpha=0.2, edge_color='gray', ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.axis('off')

plt.tight_layout()
plt.savefig('user_graphs.pdf', bbox_inches='tight')
plt.show()


# **Baselines HUGGING FACE**

In [None]:
import os
import re
import json
import warnings
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import random
import networkx as nx
#import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from datasets import Dataset, DatasetDict
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments, BertForSequenceClassification, RobertaForSequenceClassification, AutoTokenizer,AutoModelForSequenceClassification, AutoModel, RobertaForSequenceClassification

In [None]:
# Single source of truth for every random number generator used below.
seed = 42

# Seed Python, NumPy and PyTorch (CPU, current GPU, all GPUs) with the same value.
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed)

# Ask cuDNN for deterministic kernels and switch off its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
#CLEAN DATA

def cleaner1(tweet):
    """Normalise a raw tweet before tokenisation.

    Steps, in order: lowercase, drop a leading "rt" marker, drop standalone
    numbers, drop @mentions and "at_user" placeholders, drop picture links and
    URLs, drop the literal placeholder "url", and collapse whitespace.

    Args:
        tweet: Raw tweet text (str).

    Returns:
        The cleaned, lowercased text with single spaces between tokens.
    """
    tweet = tweet.lower()
    # Leading retweet marker.
    tweet = re.sub(r"^rt", "", tweet)
    # Standalone numbers. They are replaced by a space (not an empty string) so
    # the words on either side are not glued together.
    tweet = re.sub(r"\s[0-9]+\s", " ", tweet)
    # User mentions and anonymised mention placeholders.
    tweet = re.sub(r"@[^\s]+", "", tweet)
    tweet = re.sub(r"at_user", "", tweet)
    # Picture links and generic URLs (dots escaped so they only match a literal '.').
    tweet = re.sub(r"pic\.twitter\.com/[A-Za-z0-9]+", "", tweet)
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)
    # Anonymised URL placeholder.
    tweet = tweet.replace("url", "")
    # split/join trims both ends and collapses internal runs of whitespace.
    return " ".join(tweet.split())


    
#BINARY LABELS SENT
def binary_labels(df):
    """Encode binary sentiment labels as integers.

    Works on a copy, so the caller's frame keeps its string labels and the
    function can be called again on the same input (mapping an already encoded
    column would turn every label into <NA>).

    Args:
        df: DataFrame with a 'label' column holding 'neg' / 'pos'.

    Returns:
        (encoded_df, id2label, label2id) where encoded_df['label'] is a
        nullable Int64 column (values outside the mapping become <NA>).
    """
    df = df.copy()
    df['label'] = df['label'].map({'neg': 0, 'pos': 1}).astype('Int64')
    id2label = {0: "NEG", 1: "POS"}
    label2id = {"NEG": 0, "POS": 1}
    return df, id2label, label2id

def multi_labels(df):
    """Encode three-way sentiment labels as integers.

    Args:
        df: DataFrame with a 'label' column holding 'negative' / 'positive' /
            'neutral'. Other values are left untouched.

    Returns:
        (encoded_df, id2label, label2id); encoded_df is a new frame.
    """
    df = df.replace({"label": {"negative": 0, "positive": 1, "neutral": 2}})
    id2label = {0: "NEGATIVE", 1: "POSITIVE", 2: "NEUTRAL"}
    # Derived from id2label so the two mappings cannot drift apart
    # (the hand-written version had the key misspelled as "POSITIV").
    label2id = {name: idx for idx, name in id2label.items()}
    return df, id2label, label2id
    
#MULTICLASS LABELS MORAL
def label_multiclass6(df):
    """Encode moral-foundation labels into six integer classes.

    Virtue and vice of the same foundation share one class id
    (e.g. 'care' and 'harm' are both 1); 'nonmoral' / 'nomoral' are 0.

    Args:
        df: DataFrame with a lowercase 'label' column. Values outside the
            mapping are left untouched.

    Returns:
        (encoded_df, id2label, label2id). id2label has exactly one name per
        class id (the foundation name); label2id additionally accepts the vice
        names as aliases of the same id.
    """
    df = df.replace({'label': {'care': 1, 'harm': 1,
                               'fairness': 2, 'cheating': 2,
                               'loyalty': 3, 'betrayal': 3,
                               'authority': 4, 'subversion': 4,
                               'purity': 5, 'degradation': 5,
                               'nonmoral': 0, 'nomoral': 0}})

    # One name per id. The previous literal repeated every key (1: "CARE", 1: "HARM", ...),
    # so Python silently kept only the last entry of each pair.
    id2label = {0: "NONMORAL", 1: "CARE", 2: "FAIRNESS", 3: "LOYALTY", 4: "AUTHORITY", 5: "PURITY"}
    label2id = {"NONMORAL": 0,
                "CARE": 1, "HARM": 1,
                "FAIRNESS": 2, "CHEATING": 2,
                "LOYALTY": 3, "BETRAYAL": 3,
                "AUTHORITY": 4, "SUBVERSION": 4,
                "PURITY": 5, "DEGRADATION": 5}

    return df, id2label, label2id

    
# METRICS
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision / recall / F1.

    Args:
        eval_pred: Pair (scores, labels); the predicted class of each row is
            the argmax of `scores` along axis 1.

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    macro = precision_recall_fscore_support(gold, predicted, average='macro')
    macro_precision, macro_recall, macro_f1 = macro[0], macro[1], macro[2]
    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1': macro_f1,
        'precision': macro_precision,
        'recall': macro_recall,
    }

# TOKENIZER
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the notebook-level `tokenizer`.

    Truncation is enabled; the tokenizer is read from the global namespace at
    call time, so it must be defined before any dataset is mapped.
    """
    return tokenizer(examples["text"], truncation=True)

## BERT/ROBERTA BASELINES HUGGING FACE


In [None]:
# OMC tweets with moral annotations: clean the text and train on the moral label.
tweets = pd.read_pickle("../data/OMC/final_omc_morality.pkl")
tweets = tweets.assign(text=tweets['text'].map(cleaner1))
tweets = tweets.assign(label=tweets['moral_label'])

tweets['label'].value_counts()

In [None]:
# Classic text classification: pick the label encoding for this run.
#df, id2label, label2id = binary_labels(tweets)
#df, id2label, label2id = multi_labels(tweets)
df, id2label, label2id = label_multiclass6(tweets)

# Optional: hold out two rows of label 5 to force them into the test split.
#label_5_rows = df[df['label'] == 5]
#index_data = list(label_5_rows.index[0:2])
#selected_rows = df.loc[index_data]
#df = df.drop(index_data)
#selected_rows
training_args = TrainingArguments(
    output_dir='/model/',
    seed=42,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy='no',
    push_to_hub=False,
)

#-----split data-----
# 80/20 train+val / test, then 80/20 train / val, both with a fixed random_state.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
#test_df = pd.concat([test_df, selected_rows]).reset_index(drop=True)
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)
datasets = DatasetDict({
    split_name: Dataset.from_pandas(split_frame)
    for split_name, split_frame in (('train', train_df), ('val', val_df), ('test', test_df))
})
test_df.label.unique()

In [None]:
#----model--------
# One checkpoint name drives BOTH the tokenizer and the model. Previously the
# RoBERTa model was immediately overwritten by a BERT model while the RoBERTa
# tokenizer was kept, so token ids did not match the model's vocabulary.
# Change this single string (e.g. to "bert-base-uncased") to switch baseline.
baseline_checkpoint = "FacebookAI/roberta-base"

tokenizer = AutoTokenizer.from_pretrained(baseline_checkpoint)
# tokenize_function reads the global `tokenizer`, so it must be set before mapping.
tokenized_datasets = datasets.map(tokenize_function, batched=True)

model = AutoModelForSequenceClassification.from_pretrained(
    baseline_checkpoint,
    num_labels=len(id2label),  # follows whichever label encoding was selected
    id2label=id2label,
    label2id=label2id
)

# trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [None]:
#train and test
# Training errors are allowed to propagate: catching and printing them let the
# cell go on to evaluate (and later save results for) a model that never trained.
trainer.train()

# Evaluate on the held-out test split and build a per-class report.
predictions = trainer.predict(tokenized_datasets["test"])
predicted_class_ids = predictions.predictions.argmax(axis=1)
actual_labels = tokenized_datasets["test"]["label"]
results = classification_report(actual_labels, predicted_class_ids, digits=5, output_dict=True)
results

In [None]:
results_file = '../data/OMC/F1_results/roberta-base-moral'
experiment= "baseline"

# Create the results folder on first use so a finished training run is not lost
# to a FileNotFoundError when appending.
os.makedirs(os.path.dirname(results_file), exist_ok=True)

# Append, so successive experiments accumulate in the same file.
with open(results_file, "a") as f:
    f.write(f"\nExperimento: {experiment}\n")
    f.write("\nDataset OMC: \n")
    f.write(json.dumps(results, indent=2))

print("Training complete. Results saved in", results_file)

results

## Llama + LoRA BASELINE HUGGING FACE

In [None]:
# Reload the OMC morality tweets for the Llama run: cleaned text, moral label as target.
tweets = pd.read_pickle("../data/OMC/final_omc_morality.pkl").assign(
    text=lambda frame: frame['text'].map(cleaner1),
    label=lambda frame: frame['moral_label'],
)
tweets['label'].value_counts()

In [None]:
import peft
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    # https://github.com/huggingface/peft/issues/96#issuecomment-1460080427
    TrainerCallback, TrainerState, TrainerControl, 
    LlamaForSequenceClassification,
    LlamaForSequenceClassification,
    EarlyStoppingCallback
)
import torch
from peft import LoraConfig, TaskType, PeftModel
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

# This section needs a GPU: stop right here if CUDA is unavailable.
device = torch.device("cuda")
assert torch.cuda.is_available()

In [None]:
from huggingface_hub import login

# SECURITY: two Hugging Face access tokens used to be pasted here in
# commented-out code. They have been removed from the notebook; treat both as
# compromised and revoke them in the Hugging Face account settings.
# Never hard-code a token. To authenticate, uncomment the interactive prompt
# below, or provide the token from outside the notebook (e.g. an environment
# variable read at runtime).
#login()

In [None]:
model_name = "meta-llama/Llama-3.2-1B"
#--------- data -----------
# Label encoding for this run (alternatives kept below).
df, id2label, label2id = label_multiclass6(tweets)
#df, id2label, label2id= multi_labels(tweets)
#df, id2label, label2id = binary_labels(tweets)

# Optional: hold out two rows of label 5 to force them into the test split.
#label_5_rows = df[df['label'] == 5]
#index_data = list(label_5_rows.index[0:2])
#selected_rows = df.loc[index_data]
#df = df.drop(index_data)
#selected_rows

# 80/20 train+val / test, then 80/20 train / val, both with a fixed random_state.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)
#test_df = pd.concat([test_df, selected_rows]).reset_index(drop=True)

split_frames = (('train', train_df), ('val', val_df), ('test', test_df))
datasets = DatasetDict({split_name: Dataset.from_pandas(frame) for split_name, frame in split_frames})
tokenizer = AutoTokenizer.from_pretrained(model_name, truncation=True)
test_df.label.unique()

In [None]:
# --------- model ------------
# 8-bit loading via bitsandbytes.
# NOTE(review): bnb_4bit_compute_dtype is a 4-bit option; presumably it has no effect
# together with load_in_8bit — confirm.
quantization_config = BitsAndBytesConfig(load_in_8bit=True, bnb_4bit_compute_dtype=torch.float16)

# LoRA adapter settings for sequence classification.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="SEQ_CLS",
)

# The tokenizer is loaded again here, replacing the one created in the data cell.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Provisional pad token; its id is overwritten below with the model's EOS id.
tokenizer.pad_token = "[PAD]"


# NOTE(review): unlike the other baselines, id2label / label2id are not passed here.
model = LlamaForSequenceClassification.from_pretrained(model_name,
    num_labels=6,
    quantization_config=quantization_config,
    low_cpu_mem_usage = True
)
# Use EOS as the padding id on the model side ...
model.config.pad_token_id = model.config.eos_token_id

# ... and make the tokenizer pad with the same id.
tokenizer.pad_token_id = model.config.pad_token_id
# tokenize_function reads the global `tokenizer`, so mapping must happen after the setup above.
tokenized_datasets = datasets.map(tokenize_function, batched=True)
# NOTE(review): verify that the classification head stays trainable when the adapter
# is attached through add_adapter — not visible from this notebook.
model.add_adapter(peft_config, adapter_name="adapter_1")

In [None]:
# Class distribution of the held-out test split.
test_df.label.value_counts()

In [None]:
# NOTE(review): these two names are assigned but not passed to the tokenizer or the
# Trainer anywhere in this cell — confirm whether they are still needed.
truncation = True
max_length = 2000


# Training configuration: batch size 1, evaluation and a checkpoint after every epoch.
training_args = TrainingArguments(
    output_dir = "out/",
    learning_rate=2e-4,
    num_train_epochs=10,
    per_device_train_batch_size = 1,
    per_device_eval_batch_size = 1,
    weight_decay=0.01,
    eval_strategy = "epoch",
    push_to_hub=False,
    save_strategy='epoch',
    #save_safetensors=True,
    #load_best_model_at_end = True,
    #report_to="none",
)

# Train on the 'train' split, evaluate on 'val' with compute_metrics.
trainer = Trainer(
    model=model,  #
    args=training_args,  
    train_dataset=tokenized_datasets["train"],  
    eval_dataset=tokenized_datasets["val"],  
    tokenizer=tokenizer,  
    compute_metrics=compute_metrics,  
   
)

In [None]:
#train and test

# Training errors are allowed to propagate (as in the DeepSeek section): catching
# and printing them let the cell evaluate, and later save results for, a model
# whose training did not complete.
trainer.train()

# Evaluate on the held-out test split and build a per-class report.
predictions = trainer.predict(tokenized_datasets["test"])
predicted_class_ids = predictions.predictions.argmax(axis=1)
actual_labels = tokenized_datasets["test"]["label"]
results = classification_report(actual_labels, predicted_class_ids, digits=5, output_dict=True)
results

In [None]:

results_file = '../data/OMC/F1_results/llama-3.2-1b-moral'
experiment= "baseline"

# Create the results folder on first use so a finished training run is not lost
# to a FileNotFoundError when appending.
os.makedirs(os.path.dirname(results_file), exist_ok=True)

# Append, so successive experiments accumulate in the same file.
with open(results_file, "a") as f:
    f.write(f"\nExperimento: {experiment}\n")
    f.write("\nDataset OMC: \n")
    f.write(json.dumps(results, indent=2))

print("Training complete. Results saved in", results_file)

results

## DEEPSEEK BASELINE HUGGING FACE

In [None]:
import peft
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    # https://github.com/huggingface/peft/issues/96#issuecomment-1460080427
    TrainerCallback, TrainerState, TrainerControl, 
    LlamaForSequenceClassification,
    LlamaForSequenceClassification,
    EarlyStoppingCallback
)
import torch
from peft import LoraConfig, TaskType, PeftModel
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

# This section needs a GPU: stop right here if CUDA is unavailable.
device = torch.device("cuda")
assert torch.cuda.is_available()

In [None]:
# MIND tweets: clean the text and keep the sentiment label already stored in 'label'.
tweets = pd.read_pickle("../data/MIND/final_tweets_morality.pkl")
tweets = tweets.assign(text=tweets['text'].map(cleaner1))
#tweets['label']=tweets['moral_label']
tweets['label'].value_counts()

In [None]:
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

#--------- data -----------
# Label encoding for this run (alternatives kept as comments).
#df, id2label, label2id= label_multiclass6(tweets)
#df, id2label, label2id= multi_labels(tweets)
df, id2label, label2id = binary_labels(tweets)

# Optional: hold out two rows of label 5 to force them into the test split.
#label_5_rows = df[df['label'] == 5]
#index_data = list(label_5_rows.index[0:2])
#selected_rows = df.loc[index_data]
#df = df.drop(index_data)
#selected_rows

# 80/20 train+val / test, then 80/20 train / val, both with a fixed random_state.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)
#test_df = pd.concat([test_df, selected_rows]).reset_index(drop=True)

split_frames = (('train', train_df), ('val', val_df), ('test', test_df))
datasets = DatasetDict({split_name: Dataset.from_pandas(frame) for split_name, frame in split_frames})
test_df.label.unique()

In [None]:
from transformers import Qwen2ForSequenceClassification
# 8-bit loading via bitsandbytes.
# NOTE(review): bnb_4bit_compute_dtype is a 4-bit option; presumably it has no effect
# together with load_in_8bit — confirm.
quantization_config = BitsAndBytesConfig(load_in_8bit=True, bnb_4bit_compute_dtype=torch.float16)

# LoRA adapter settings for sequence classification.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="SEQ_CLS",
)

#----model--------
tokenizer = AutoTokenizer.from_pretrained(model_name, truncation=True)
# Provisional pad token; its id is overwritten below with the model's EOS id.
tokenizer.pad_token = "[PAD]"


# Binary classifier (num_labels=2) with the label names produced by the data cell.
model = Qwen2ForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=2, 
    id2label=id2label, 
    label2id=label2id,
    quantization_config=quantization_config,
    low_cpu_mem_usage = True
)

# Use EOS as the padding id on the model side ...
model.config.pad_token_id = model.config.eos_token_id

# ... and make the tokenizer pad with the same id.
tokenizer.pad_token_id = model.config.pad_token_id
# tokenize_function reads the global `tokenizer`, so mapping must happen after the setup above.
tokenized_datasets = datasets.map(tokenize_function, batched=True)
# NOTE(review): verify that the classification head stays trainable when the adapter
# is attached through add_adapter — not visible from this notebook.
model.add_adapter(peft_config, adapter_name="adapter_1")

# NOTE(review): these two names are assigned but not passed to the tokenizer or the
# Trainer anywhere in this cell — confirm whether they are still needed.
truncation = True
max_length = 2000



# Training configuration: batch size 8, evaluation and a checkpoint after every epoch.
training_args = TrainingArguments(
    output_dir = "out/",
    learning_rate=2e-4,
    num_train_epochs=10,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    weight_decay=0.01,
    eval_strategy = "epoch",
    push_to_hub=False,
    save_strategy='epoch',
    #save_safetensors=True,
    #load_best_model_at_end = True,
    #report_to="none",
)

# trainer: train on the 'train' split, evaluate on 'val' with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
# Display the model structure as the cell output.
model

In [None]:
# Fine-tune; any exception propagates and stops the notebook at this cell.
trainer.train()


In [None]:
# Test-set evaluation (training itself runs in the previous cell).

#try:
#    trainer.train()
#except Exception as e:
#    print(e)

test_split = tokenized_datasets["test"]
predictions = trainer.predict(test_split)
# Predicted class = index of the highest score in each row.
predicted_class_ids = predictions.predictions.argmax(axis=1)
actual_labels = test_split["label"]
results = classification_report(
    actual_labels,
    predicted_class_ids,
    digits=5,
    output_dict=True,
)
results

In [None]:
#save results
import json
import os

results_file = '../data/MIND/F1_results/DeepSeek-Qwen'
experiment= "baseline"

# Create the results folder on first use so a finished training run is not lost
# to a FileNotFoundError when appending.
os.makedirs(os.path.dirname(results_file), exist_ok=True)

# Append, so successive experiments accumulate in the same file.
with open(results_file, "a") as f:
    f.write(f"\nExperimento: {experiment}\n")
    f.write("\nDataset POZZI: \n")
    f.write(json.dumps(results, indent=2))

print("Training complete. Results saved in", results_file)
