# CSV to TTN
## Input: data/les_mis_dataset.csv (semicolon-separated CSV)
## Output: Temporal text network (written to data/ttn_nets/ttn.txt and data/ttn_nets/ttn.graphml)

In [5]:
import uunet.multinet as ml
import pandas as pd
import re
from transformers import pipeline

In [6]:
def to_df(dic):
    '''
    converts a dict into a pandas DataFrame

    Thin wrapper around pd.DataFrame.from_dict with its default arguments.
    '''
    frame = pd.DataFrame.from_dict(dic)
    return frame

def initialise_ttn():
    '''
    initialises empty ttn

    Creates an empty uunet network with the two layers "persons" and
    "messages", marks the persons/messages inter-layer edges as directed and
    declares the vertex attributes of the "messages" layer.

    Returns the network object created by ml.empty().
    '''
    ttn = ml.empty()
    ml.add_layers(ttn, ["persons","messages"])

    # edges between the two layers are directed
    directionality = {"layer1": ["persons"], "layer2": ["messages"], "dir": [True]}
    ml.set_directed(ttn, directionality)

    # vertex attributes of the messages layer, grouped by attribute type
    message_attributes = {
        "string": ["text", "addressed", "not_addressed","sentiment-label"],
        "numeric": ["start","end","sentiment-score"],
    }
    for attribute_type, names in message_attributes.items():
        ml.add_attributes(ttn, names, target = "vertex", layer = "messages", type=attribute_type)

    # No attributes are declared on the person<->message edges: the original
    # author noted that doing so raises
    # "RuntimeError: attributes on inter-layer edges are not available in this
    # version of the library".
    return ttn

def get_addressed_and_not(row):
    '''
    returns the addressed and the not-addressed recipients of a row

    row: mapping (e.g. a DataFrame row) with the keys 'addressed' and
         'all_recipients'. A value that is not a string (such as the NaN
         pandas uses for an empty cell) is treated as missing.

    Returns a tuple (addressed, not_addressed):
      addressed     -- row['addressed'] unchanged, or '-' when missing
      not_addressed -- comma-joined entries of row['all_recipients'] that are
                       not named in row['addressed'], or '-' when there are
                       none or 'all_recipients' is missing
    '''
    raw_addressed = row['addressed']
    if isinstance(raw_addressed, str):
        addressed = raw_addressed
        # Compare whole names, not substrings: testing `name in addressed`
        # against the raw string would treat "Jean" as addressed whenever
        # "Jean Valjean" is, and would depend on spacing after the commas.
        # NOTE(review): assumes 'addressed' is comma-separated like
        # 'all_recipients' -- confirm against the dataset.
        addressed_names = {name.strip() for name in raw_addressed.split(',')}
    else:
        addressed = '-'
        addressed_names = set()

    raw_recipients = row['all_recipients']
    if not isinstance(raw_recipients, str):
        return addressed, '-'

    # Blank entries (e.g. from a trailing comma) are dropped; the remaining
    # entries keep their original text so callers see the same spelling.
    remaining = [
        recipient
        for recipient in raw_recipients.split(',')
        if recipient.strip() and recipient.strip() not in addressed_names
    ]
    not_addressed = ','.join(remaining) if remaining else '-'

    return addressed, not_addressed

def fill_ttn(df,ttn):
    '''
    adds data to ttn

    df:  DataFrame whose rows provide the columns 'speaker', 'addressed',
         'all_recipients', 'start', 'end' and 'text'. 'speaker' and
         'all_recipients' are comma-separated name lists; a non-string
         'all_recipients' value means the row has no recipients.
    ttn: network to fill in place (see initialise_ttn); nothing is returned.

    For every row, one message vertex is created per speaker. Messages are
    named with a running counter ("0", "1", ...). Each message gets the row's
    start/end, addressed/not_addressed, text and sentiment attributes, an edge
    speaker -> message, and one edge message -> recipient per recipient.

    Relies on the module-level `sentiment_pipeline` being defined before the
    call.
    '''
    message_index = 0
    for _, row in df.iterrows():
        speakers = row['speaker'].split(',')

        # Everything up to the speaker loop depends only on the row, so it is
        # computed once per row instead of once per speaker. In particular the
        # sentiment model is no longer re-run for each co-speaker of a text.
        addressed, not_addressed = get_addressed_and_not(row)
        addressed = addressed.replace(" ", "")# remove space because of bug in uunet
        not_addressed = not_addressed.replace(" ", "")# remove space because of bug in uunet

        # remove quotation marks because of bug with uunet
        text = row['text'].replace('"', '')

        # calc sentiment
        sentiment = sentiment_pipeline([text])[0]
        label = sentiment['label']
        score = sentiment['score']

        # recipients stay None when the cell is empty (non-string)
        recipients = row["all_recipients"]
        if isinstance(recipients, str):
            # remove space because of bug in uunet
            recipients = [name.replace(" ", "") for name in recipients.split(',')]
        else:
            recipients = None

        for speaker in speakers:
            speaker = speaker.replace(" ","")# remove space because of bug in uunet

            # add speaker vertex
            speaker_vertice = {"actor": [speaker], "layer": ['persons']}
            ml.add_vertices(ttn, speaker_vertice)

            # add message vertex and its attributes
            message = str(message_index)
            message_index += 1
            message_vertice = {"actor": [message], "layer": ['messages']}
            ml.add_vertices(ttn, message_vertice)
            ml.set_values(ttn, "start", vertices = message_vertice, values = [row['start']])
            ml.set_values(ttn, "end", vertices = message_vertice, values = [row['end']])
            ml.set_values(ttn, "addressed", vertices = message_vertice, values = [addressed])
            ml.set_values(ttn, "not_addressed", vertices = message_vertice, values = [not_addressed])
            ml.set_values(ttn, "text", vertices = message_vertice, values = [text])
            ml.set_values(ttn, "sentiment-label", vertices = message_vertice, values = [label])
            ml.set_values(ttn, "sentiment-score", vertices = message_vertice, values = [score])

            # add speaker-message edge
            edge = {"from_actor": [speaker], "from_layer": ['persons'], "to_actor": [message], "to_layer": ['messages']}
            ml.add_edges(ttn, edge)

            # add recipients and message-recipient edges
            if recipients is not None:
                count = len(recipients)
                recipients_vertices = {"actor": recipients, "layer": ['persons']*count}
                ml.add_vertices(ttn, recipients_vertices)
                edges = {"from_actor": [message]*count, "from_layer": ['messages']*count,
                         "to_actor": recipients, "to_layer": ['persons']*count}
                ml.add_edges(ttn, edges)
            

    
def test_msg(ttn,message):
    '''
    prints message data (from, to, and attributes)

    ttn:     network to inspect
    message: actor name of the message vertex (fill_ttn names messages with
             the string form of a running counter, e.g. '800')

    Prints the actors with an edge into the message, the message itself and
    the actors it has an edge to, followed by the message's text, start, end,
    addressed and not_addressed values.
    '''
    # speaker --> message --> receiver, read off the edge list
    edges = pd.DataFrame.from_dict(ml.edges(ttn))
    incoming = edges['to_actor'] == message
    outgoing = edges['from_actor'] == message
    speaker = list(edges.loc[incoming, 'from_actor'])
    reciever = list(edges.loc[outgoing, 'to_actor'])
    print('message:')
    print(f'{speaker} --> {message} --> {reciever}')

    # attributes of the message vertex, one printed result per attribute
    print(f'\n{message} attributes:')
    for attribute in ("text", "start", "end", "addressed", "not_addressed"):
        vertex = {'actor': [message], 'layer':['messages']}
        print(ml.get_values(ttn, attribute, vertices=vertex))

In [9]:
# main
# Builds the temporal text network from the CSV dataset and writes it to disk.

# get sentiment model
# The same checkpoint is used for both the model and its tokenizer.
# `sentiment_pipeline` is read as a global by fill_ttn, so it must be defined
# before fill_ttn is called.
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"
sentiment_pipeline = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)

# load data (semicolon-separated CSV)
df = pd.read_csv("data/les_mis_dataset.csv", sep=";")

# initialise and fill
# fill_ttn modifies ttn in place and runs the sentiment model on the row texts.
ttn = initialise_ttn()
fill_ttn(df,ttn)

# test (disabled manual check of a single message and of the edge list)
# print('***** ttn_combined *****\n')
# test_msg(ttn,'800')
# display(pd.DataFrame.from_dict(ml.edges(ttn)))
# print('\n\n\n')

# save ttn
# The network is written twice: in uunet's "multilayer" format and as GraphML.
# NOTE(review): nothing here creates data/ttn_nets/ -- presumably it must
# already exist; confirm.
save_path = 'data/ttn_nets/ttn'
ml.write(n = ttn,file = save_path + '.txt', format = "multilayer")
ml.write(n = ttn,file = save_path + '.graphml', format = "graphml")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Summary statistics of the network, shown as a DataFrame (one row per layer
# in the output below).
dict_ = ml.summary(ttn)
# NOTE(review): this rebinds `df`, the name the main cell uses for the loaded
# CSV data; re-run the main cell before using `df` as the dataset again.
df = pd.DataFrame.from_dict(dict_)
# bare expression so the notebook displays the table
df

Unnamed: 0,layer,n,m,dir,nc,slc,dens,cc,apl,dia
0,messages,1064,0,False,1064,1,0,0,0,0
1,persons,124,0,False,124,1,0,0,0,0
