In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
# Load datasets
db1 = pd.read_csv('coem_authors_enriched.csv')
# db2 = pd.read_csv('embeddings2.csv')
# db3 = pd.read_csv('embeddings3.csv')

In [3]:
db1.columns

Index(['UUID', 'Author', 'LastName', 'Name', 'Birth Year', 'Death Year',
       'Author_name', 'Summary', 'Wiki_URL', 'cats', 'linked_authors',
       'country', 'genera'],
      dtype='object')

In [4]:
authors = db1[['Author_name', 'linked_authors']]
authors.head()

Unnamed: 0,Author_name,linked_authors
0,Manuel Acuña,"Francisco Sosa Escalante,José Martí"
1,,
2,Delmira Agustini,Rubén Darío
3,Anna Ajmátova,"Aleksandr Blok,Giacomo Leopardi,Joseph Brodsky..."
4,,


In [5]:
# prompt: Convert all nan on db1['linked_authors'] and db1['Author_name'] to empty strings to all the column is strings

db1['linked_authors'] = db1['linked_authors'].fillna('')
db1['Author_name'] = db1['Author_name'].fillna('')


In [6]:
# prompt: A list of unique countries in db1

unique_countries = db1['country'].unique().tolist()
print(unique_countries)

# Get index of country and default 0
get_country_index = lambda country: unique_countries.index(country) if country in unique_countries else 0
country_index = get_country_index('fsdfd')
print(country_index)


['Mexico', 'Uruguay', 'Rusia', 'Arabia', 'Al-Ándalus', 'España', 'Inglaterra', 'Italia', 'Puerto Rico', 'Portugal', 'Francia', nan, 'Guatemala', 'Grecia', 'Chile', 'Cuba', 'Irlanda', 'Venezuela', 'Argentina', 'Alemania', 'Estados Unidos', 'Republica Dominicana', 'Paraguay', 'Nicaragua', 'Colombia', 'Ecuador', 'Romano', 'Griego', 'Peru', 'Dinamarca', 'Argentino', 'El Salvador', 'Vietnam', 'Boliviano', 'Finlandia', 'Serbia', 'Costa Rica', 'Italiano', 'Austria', 'Brasil', 'Panamá', 'Honduras', 'Austríaco', 'Persa', 'Nigeria', 'India', 'Polonia', 'Japon', 'Gales', 'Palestina', 'Rumania', 'China', 'Belgica', 'África', 'Oriente', 'Estadounidense', 'Noruega', 'Suiza', 'Cubano', 'Bolivia', 'Suecia', 'Escocia', 'Líbano', 'Puertorriqueño', 'Sudáfrica', 'Prusia', 'Checoslovaquia', 'Reino Unido', 'Alemán', 'Inglesa', 'Egipto', 'Nueva Zelandia', 'Canadá', 'Trinidad y Tobago', 'Hungria', 'Chileno', 'Nueva', 'República', 'Spain', 'Estados', 'Santa', 'Sierra', 'Mónaco', 'Cabo', 'Marruecos', 'Republica

In [7]:
db1.columns

Index(['UUID', 'Author', 'LastName', 'Name', 'Birth Year', 'Death Year',
       'Author_name', 'Summary', 'Wiki_URL', 'cats', 'linked_authors',
       'country', 'genera'],
      dtype='object')

## Creating Linked Json

In [14]:
# prompt: Create a JSON with db1['Author_name'] and db['linked_authors'] fitting an object {nodes, links} where linked authors are separated by comma and iterated on

import json

nodes = []
links = []

for i, row in db1.iterrows():
    author_name = row['Author_name']
    linked_authors = row['linked_authors'].split(',')

    # if nan: continue
    if author_name == 'Nan' or author_name == '':
        continue

    # if author name already in node
    if author_name in [node['id'] for node in nodes]:
        continue

    """
    ['UUID', 'Author', 'LastName', 'Name', 'Birth Year', 'Death Year',
        'Author_name', 'Summary', 'Wiki_URL', 'cats', 'linked_authors',
        'country', 'genera']
    """

    # Create a node for the author
    node = {
        'id': author_name,
        'group': get_country_index(row['country']),
        'country': str(row['country']),
        'birth_year': str(row['Birth Year']),
        'death_year': str(row['Death Year']),
        'linked_authors': len(linked_authors)
    }
    nodes.append(node)

    # Create links for each linked author
    for linked_author in linked_authors:
        link = {
            'source': author_name,
            'target': linked_author,
            'value': 1
        }
        links.append(link)

# Create the JSON object
data = {
    'nodes': nodes,
    'links': links
}

# Save the JSON object to a file
with open('authorLinks.json', 'w') as outfile:
    json.dump(data, outfile,ensure_ascii=False)


In [15]:
for i in range(10):
    print(data['nodes'][i])

for i in range(10):
    print(data['links'][i])

{'id': 'Manuel Acuña', 'group': 0, 'country': 'Mexico', 'genera': 'Romanticismo', 'birth_year': '1849', 'death_year': '1873', 'linked_authors': 2}
{'id': 'Delmira Agustini', 'group': 1, 'country': 'Uruguay', 'genera': 'Modernismo', 'birth_year': '1886', 'death_year': '1914', 'linked_authors': 1}
{'id': 'Anna Ajmátova', 'group': 2, 'country': 'Rusia', 'genera': 'Romanticismo', 'birth_year': '1889', 'death_year': '1966', 'linked_authors': 6}
{'id': 'Rafael Alberti', 'group': 5, 'country': 'España', 'genera': 'Romanticismo', 'birth_year': '1902', 'death_year': '1999', 'linked_authors': 16}
{'id': 'Richard Aldington', 'group': 6, 'country': 'Inglaterra', 'genera': 'nan', 'birth_year': '1892', 'death_year': '1962', 'linked_authors': 1}
{'id': 'Vicente Aleixandre', 'group': 5, 'country': 'España', 'genera': 'nan', 'birth_year': '1898', 'death_year': '1984', 'linked_authors': 65}
{'id': 'Dante Alighieri', 'group': 7, 'country': 'Italia', 'genera': 'Romanticismo', 'birth_year': '1265', 'death_

In [10]:
# prompt: Go through the data and create a dataReduced only with the targets that exist in the nodes

dataReduced = {'nodes': [node for node in data['nodes']], 'links': []}

# Add links to dataReduced if their target exists in nodes
for link in data['links']:
    if link['target'] in [n['id'] for n in nodes]:
        dataReduced['links'].append(link)

# Print the reduced data
print(dataReduced)


{'nodes': [{'id': 'Manuel Acuña', 'group': 0, 'country': 'Mexico', 'genera': 'Romanticismo', 'name': 'Manuel', 'birth_year': '1849', 'death_year': '1873', 'linked_authors': 2}, {'id': 'Delmira Agustini', 'group': 1, 'country': 'Uruguay', 'genera': 'Modernismo', 'name': 'Delmira', 'birth_year': '1886', 'death_year': '1914', 'linked_authors': 1}, {'id': 'Anna Ajmátova', 'group': 2, 'country': 'Rusia', 'genera': 'Romanticismo', 'name': 'Anna', 'birth_year': '1889', 'death_year': '1966', 'linked_authors': 6}, {'id': 'Rafael Alberti', 'group': 5, 'country': 'España', 'genera': 'Romanticismo', 'name': 'Rafael', 'birth_year': '1902', 'death_year': '1999', 'linked_authors': 16}, {'id': 'Richard Aldington', 'group': 6, 'country': 'Inglaterra', 'genera': 'nan', 'name': 'Richard', 'birth_year': '1892', 'death_year': '1962', 'linked_authors': 1}, {'id': 'Vicente Aleixandre', 'group': 5, 'country': 'España', 'genera': 'nan', 'name': 'Vicente', 'birth_year': '1898', 'death_year': '1984', 'linked_aut

In [11]:
print(len(dataReduced['links']))
print(len(data['links']))


5171
5587


In [12]:
# Save the JSON object to a file
with open('authorLinksSmaller.json', 'w') as outfile:
    json.dump(dataReduced, outfile,ensure_ascii=False)

## Story Embeddings

In [None]:
db3.head(1)

Unnamed: 0,dim1,dim2,dim3,vector_id,uuid_story,author_uuid,story_name,link,length,words,reading_time_min,coordinates
0,1628848076,-8153986931,8296456337,04d4cd4b-8f57-4910-8cff-cfd34ca6bf46,04d4cd4b-8f57-4910-8cff-cfd34ca6bf46,9d50c449-ce8b-43f1-a0c1-53be4177dc93,Las vocales de colores,https://ciudadseva.com/texto/las-vocales-de-co...,1765,305,1525,"[16.28848076, -8.153986931, 8.296456337]"


In [None]:
# prompt: convert db3 cols dim1, dim2, dim3 into a new one with float coordinates. They are currently strings like "16,0566546", so replace , with . and eval
df = pd.DataFrame()

df['story_name'] = db3['story_name']
df['x'] = [ float(d.replace(',', '.')) for d in db3['dim1'] ]
df['y'] = [ float(d.replace(',', '.')) for d in db3['dim2'] ]
df['z'] = [ float(d.replace(',', '.')) for d in db3['dim3'] ]
df.head()

Unnamed: 0,story_name,x,y,z,size
0,Las vocales de colores,16.288481,-8.153987,8.296456,0.1
1,Cuánto se divertían,16.784241,-6.143554,10.079655,0.1
2,Todo lo contrario,19.959745,-9.638548,8.536081,0.1
3,La carne,18.058767,-8.713723,10.635745,0.1
4,La rana gritona y el león,19.603294,-7.778275,12.218028,0.1


In [None]:
import plotly.express as px

# Big figure plotly
fig = px.scatter_3d(df, x='x', y='y', z='z',text='story_name')
fig.update_traces(mode = 'markers')
fig.show()

In [None]:
fig = px.scatter(df,'x','y',text='story_name')
fig.update_traces(mode = 'markers')