In [1]:
import os
import time
import json
import pickle
import ast
import pandas as pd

import openai

from utils import get_completion, read_social_listening_data, join_text_batch, save_pandas_object, clean_text
from credentials import OPENAI_API_KEY

# Register the API key imported from the local `credentials` module on the
# openai package (module-level attribute assignment).
openai.api_key = OPENAI_API_KEY

In [2]:
# Run configuration: dataset name, column names, batch sizes and topic count
cliente = "bancolombia_080623"
column_name, id_column = "text", "ID"
topics_batch_size, sentiment_batch_size = 30, 50
num_topicos = 3

In [3]:
# Load the data for the configured client and drop rows whose text is repeated
df = read_social_listening_data(cliente)
print("Número de filas inicial:", df.shape[0])

# In-place on purpose: `df` stays the same object. Note that the index is not
# reset, so it keeps gaps where duplicates were removed.
df.drop_duplicates(subset=[column_name], inplace=True)
print("Número de filas después de eliminar duplicados:", df.shape[0])

Ruta destino...../data/bancolombia_080623
Número de filas inicial: 1200
Número de filas después de eliminar duplicados: 1018


In [4]:
# Make sure the frame has an identifier column; when it is missing, number the
# rows 0..len(df)-1 in their current order.
if id_column not in df.columns:
    print("Se crea una columna ID")
    df[id_column] = range(len(df))
else:
    print(f"La columna ID {(id_column)} ya existe")

df.head()

Se crea una columna ID


Unnamed: 0,created_at,text,public_metrics,context_annotations,entities,referenced_tweets,ID
0,2023-06-01T19:00:10.000Z,#MundialSub20 | ¡GOOOOL de Uruguay! A los 65' ...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","[{'domain': {'id': '11', 'name': 'Sport', 'des...","{'annotations': [{'start': 1, 'end': 12, 'prob...","[{'type': 'replied_to', 'id': '166434107126079...",0
1,2023-06-01T19:00:18.000Z,#MundialSub20 \n\nGoooool de Uruguay\n\nAnders...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",,"{'annotations': [{'start': 1, 'end': 12, 'prob...",,1
2,2023-06-01T19:00:26.000Z,Que golazo para @Uruguay! Vamos!!\n#MundialSub20,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","[{'domain': {'id': '11', 'name': 'Sport', 'des...","{'annotations': [{'start': 35, 'end': 46, 'pro...",,2
3,2023-06-01T19:00:45.000Z,Gol carajo !!!! Vamo los guachos !!! #uruguays...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",,"{'annotations': [{'start': 38, 'end': 49, 'pro...",,3
4,2023-06-01T19:00:49.000Z,RT @marcadorec: Quién es Christian García Caja...,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",,"{'annotations': [{'start': 25, 'end': 46, 'pro...","[{'type': 'retweeted', 'id': '1664322399800811...",4


In [6]:
# Clean the text column. The column is taken from the `column_name` setting
# (instead of a hard-coded "text") so this cell agrees with the de-duplication
# and topic cells, which already use `column_name`.
df[column_name] = clean_text(df, column_name)

### Análisis de tópicos

In [None]:
# Start from an empty accumulator; the topics loop appends one entry per batch.
response = list()

In [None]:
%%time
# Ask the model for topics, one batch of `topics_batch_size` rows at a time.
for start in range(0, len(df), topics_batch_size):
    stop = start + topics_batch_size
    batch_rows = df[start:stop]
    str_text_batch = join_text_batch(batch_rows[column_name].tolist())

    prompt = f"""
        Determine máximo {num_topicos} tópicos para \
        cada una de las frases a continuación: \

        {str_text_batch}

        Cada tópico debe ser de máximo tres palabras.
        El resultado debe ser un JSON con cada frase y su lista de tópicos.

        """

    response.append(get_completion(prompt))

    # Progress report plus a pause. `start` advances in steps of
    # `topics_batch_size`, so with a step of 30 this only fires when the
    # offset is a multiple of 300.
    if not start % 100:
        print("Iteración:", start)
        time.sleep(10)

In [None]:
# Persist the raw topic responses in the working directory.
with open("topic_list.pickle", "wb") as file:
    file.write(pickle.dumps(response))

In [None]:
# Read the pickled topic responses back. The file is the one written by this
# notebook; do not point this at pickles from an untrusted source.
with open("topic_list.pickle", "rb") as file:
    loaded_list = pickle.loads(file.read())

In [None]:
# Replace the phrase keys of every parsed batch with the matching tweet IDs.
#
# Batches listed in `known_bad_batches` are skipped on purpose. Batch 25 was
# excluded by hand for this dataset.
# TODO(review): the original note says this exclusion must be re-verified,
# because it will not hold for every run.
known_bad_batches = {25}
unparsed_batches = []  # batch positions whose response could not be used

new_dict = {}

for batch_idx, raw_response in enumerate(response):
    if batch_idx in known_bad_batches:
        continue

    try:
        # The response may span several lines; join them before parsing.
        topics_by_phrase = json.loads("".join(raw_response.splitlines()))
    except json.JSONDecodeError as err:
        # Report and skip instead of aborting the whole run on one bad batch.
        print(f"Lote {batch_idx}: JSON inválido, se omite ({err})")
        unparsed_batches.append(batch_idx)
        continue

    if not isinstance(topics_by_phrase, dict):
        print(f"Lote {batch_idx}: la respuesta no es un objeto JSON, se omite")
        unparsed_batches.append(batch_idx)
        continue

    # Rows of this batch, by position, exactly as they were sent to the model.
    ini = batch_idx * topics_batch_size
    fin = ini + topics_batch_size
    batch_ids = df[ini:fin][id_column].tolist()

    # Entries are matched to IDs by position. A count mismatch means the
    # pairing may be off, so make it visible.
    if len(topics_by_phrase) != len(batch_ids):
        print(
            f"Lote {batch_idx}: {len(topics_by_phrase)} frases en la respuesta, "
            f"{len(batch_ids)} IDs esperados"
        )

    # zip stops at the shorter side, so extra entries returned by the model can
    # no longer raise IndexError. The inner index no longer reuses the outer
    # loop variable either.
    new_dict.update(zip(batch_ids, topics_by_phrase.values()))

In [None]:
# Count how many topics each ID received
element_lengths = [len(topics) for topics in new_dict.values()]

print("Print max: ", max(element_lengths))
print("Print min: ", min(element_lengths))

In [None]:
# Boolean mask, one flag per entry of `element_lengths`: True when that ID has
# fewer than three topics.
cual = [n_topics < 3 for n_topics in element_lengths]

# Series of the flagged dict keys. The surviving index labels are the
# positions of those keys inside `new_dict`, not the keys themselves.
shorter_keys = pd.Series(new_dict.keys())[cual]
print("Número de llaves con menos de 3 tópicos:", len(shorter_keys))

In [None]:
# Pad every topic list with "" up to a common length so that `new_dict` can be
# turned into a rectangular DataFrame.
#
# The previous version decided what to pad with `key in shorter_keys`. On a
# pandas Series the `in` operator checks the index labels (here: positions in
# the dict), not the stored keys. Positions and IDs stop matching as soon as
# one batch is missing from `new_dict`, so some short lists were never padded.
# Those lists then had to be patched by hand (IDs 801, 802 and 804). Padding
# by each list's own length makes the manual patches unnecessary, so they are
# removed here; keeping them would over-pad those three IDs.
longest = max(len(topics) for topics in new_dict.values())
target_len = max(num_topicos, longest)

for key, topics in new_dict.items():
    missing = target_len - len(topics)
    if missing > 0:
        # Re-assigning an existing key is safe while iterating over the dict.
        new_dict[key] = topics + [""] * missing

# Recompute the number of topics per ID after padding
element_lengths = [len(topics) for topics in new_dict.values()]

print("Print max: ", max(element_lengths))
print("Print min: ", min(element_lengths))

# Every list must now have the same length.
assert len(set(element_lengths)) <= 1, "Las listas de tópicos no tienen el mismo tamaño"

In [None]:
# Final check of the number of topics per ID
element_lengths = list(map(len, new_dict.values()))

longest, shortest = max(element_lengths), min(element_lengths)
print("Print max: ", longest)
print("Print min: ", shortest)

In [None]:
# Build the topics table (one row per ID) and attach each tweet's creation date.
#
# Column names are derived from `num_topicos`. This also fixes the misspelt
# first column ("Topicp_1" -> "Topico_1").
# NOTE(review): anything that reads the exported file by the old misspelt name
# must be updated.
topic_columns = [f"Topico_{k}" for k in range(1, num_topicos + 1)]

# The dict maps ID -> list of topics, so the frame is built with IDs as columns
# and then transposed to get one row per ID.
topics_df = pd.DataFrame(new_dict).transpose().reset_index()
topics_df.columns = [id_column] + topic_columns

# Explicit join key instead of relying on whichever column names happen to be
# shared by both frames.
topics_df = topics_df.merge(df[[id_column, "created_at"]], on=id_column)
topics_df.head()

In [None]:
# Add the public metrics of each tweet.
# `public_metrics` holds Python-literal dict strings; parse them into dicts.
public_metrics = df["public_metrics"].map(ast.literal_eval)

# Keep df's own index on the expanded frame. Without it the new frame gets a
# fresh 0..n-1 index, and assigning `df[id_column]` (which aligns on index
# labels) would attach IDs to the wrong rows or leave NaN, because df's index
# has gaps after `drop_duplicates`.
public_metrics_df = pd.DataFrame(public_metrics.tolist(), index=df.index)
public_metrics_df[id_column] = df[id_column]

topics_df = topics_df.merge(public_metrics_df, on=id_column)

In [None]:
# Save the processed DataFrame. The subfolder comes from the `cliente` setting
# so the output location follows the dataset selected in the configuration
# cell instead of a duplicated hard-coded name.
save_pandas_object(
    topics_df,
    root_path="../artifacts",
    subfolder=cliente,
    name="tweet_social_listening_analysis.csv",
)

### Análisis de sentimiento

In [None]:
# Start from an empty accumulator for the sentiment responses. This rebinds
# `response`, so whatever the name held before is no longer reachable through it.
response = list()

In [None]:
%%time
# Ask the model for the sentiment of each phrase, one batch of
# `sentiment_batch_size` rows at a time.
for i in range(0, len(df), sentiment_batch_size):
    # Read the text through `column_name`, like the topics loop does, instead
    # of the hard-coded `.text` attribute.
    text_batch = df[i : (i + sentiment_batch_size)][column_name].tolist()
    str_text_batch = join_text_batch(text_batch)

    prompt = f"""
        Determine el sentimiento para \
        cada una de las frases a continuación: \

        {str_text_batch}

        El resultado debe ser un JSON con cada frase y su sentimiento.

        """

    response_i = get_completion(prompt)
    response.append(response_i)

    # Progress report plus a pause whenever the row offset is a multiple of 100.
    if i % 100 == 0:
        print("Iteración:", i)
        time.sleep(15)

In [None]:
# Persist the raw sentiment responses in the working directory.
with open("sentiment_list.pickle", "wb") as file:
    file.write(pickle.dumps(response))

In [None]:
# Read the pickled sentiment responses back. The file is the one written by
# this notebook; do not point this at pickles from an untrusted source.
with open("sentiment_list.pickle", "rb") as file:
    loaded_list = pickle.loads(file.read())