# Modelo que determina la polaridad de una frase

In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Función para interactuar con el usuario de manera mejorada
def interactuar_con_usuario():
    # Instrucciones claras para el usuario
    print("Bienvenido a la aplicación de análisis de sentimientos.")
    print("Por favor, ingresa una frase para analizar su sentimiento.")

    # Solicitar al usuario que ingrese una frase
    while True:
        try:
            text = input("Tu entrada: ")
            if not text:
                raise ValueError("Por favor, ingresa una frase válida.")
            break
        except ValueError as e:
            print(e)

    text = preprocess(text)

    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    # Print labels and scores
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    for i in range(scores.shape[0]):
        l = config.id2label[ranking[i]]
        s = scores[ranking[i]]
        print(f"{i+1}) {l} {np.round(float(s), 4)}")

# Ejecutar la interacción con el usuario
interactuar_con_usuario()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Bienvenido a la aplicación de análisis de sentimientos.
Por favor, ingresa una frase para analizar su sentimiento.
Tu entrada: Today is hard
1) negative 0.6613
2) neutral 0.273
3) positive 0.0657


# Extracción de comentarios de Reddit

In [2]:
import praw
import pandas as pd

reddit = praw.Reddit(client_id="I80D4CWWdnBqVU_WEiZKjQ", 
                     client_secret="pkLPDsbb8sn34LjFpP8gT-_GyQhqCA",
                     username="Andres_Molano_Ostos", 
                     password="pamito197", 
                     user_agent="Scrapper 1.0 by /u/Andres_Molano_Ostos")

# Subreddit y límite de comentarios
subreddit = reddit.subreddit('Travel')
limit = 1

data = []
for submission in subreddit.top(limit=limit):
    submission.comments.replace_more(limit=1) # Para asegurarse de que se carguen todos los comentarios
    for comment in submission.comments.list():
        data.append({
            'title': submission.title,
            'score': submission.score,
            'created_utc': submission.created_utc,
            'num_comments': submission.num_comments,
            'url': submission.url,
            'comment_id': comment.id,
            'comment_body': comment.body,
            'comment_score': comment.score,
            'comment_created_utc': comment.created_utc
        })

df = pd.DataFrame(data)

In [3]:
df

Unnamed: 0,title,score,created_utc,num_comments,url,comment_id,comment_body,comment_score,comment_created_utc
0,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gpgnvj1,Thread locked due to the high number of rule-b...,1,1.614721e+09
1,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gpem2p4,Did you have a tour guide with you at all time...,826,1.614686e+09
2,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gpem0pi,Looks like perfectly normal photographs from t...,5684,1.614686e+09
3,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gpeo6jj,There's a Samsung AC condenser unit.,347,1.614687e+09
4,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gperfs6,Only the leaders suffer from obesity..,383,1.614690e+09
...,...,...,...,...,...,...,...,...,...
581,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gpghgbh,His profile says he is active in r/australianp...,11,1.614718e+09
582,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gpgan8y,Honestly I did wonder about that.,7,1.614715e+09
583,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gpgl10v,yes they use it for plants as well.,8,1.614720e+09
584,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gpg06pw,I misread.,6,1.614711e+09


# Combinación

In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

# ... (Código para cargar el modelo y funciones de procesamiento de texto)

# Ajustar la longitud máxima de la secuencia de entrada
max_sequence_length = 128  # Puedes ajustar esto según tus necesidades

def obtener_porcentajes_sentimiento(texto):
    texto = preprocess(texto)
    encoded_input = tokenizer(texto, return_tensors='pt', max_length=max_sequence_length, truncation=True, padding=True)
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    return scores

# Aplicar la función a cada fila del DataFrame y crear las columnas correspondientes
df['sentimiento_scores'] = df['comment_body'].apply(obtener_porcentajes_sentimiento)

# Dividir los scores en columnas separadas
df[['sentimiento_positivo', 'sentimiento_neutral', 'sentimiento_negativo']] = pd.DataFrame(df['sentimiento_scores'].tolist(), index=df.index)

# Eliminar la columna 'sentimiento_scores' si ya no se necesita
df = df.drop(columns=['sentimiento_scores'])


In [6]:
df

Unnamed: 0,title,score,created_utc,num_comments,url,comment_id,comment_body,comment_score,comment_created_utc,sentimiento_positivo,sentimiento_neutral,sentimiento_negativo
0,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gpgnvj1,Thread locked due to the high number of rule-b...,1,1.614721e+09,0.378245,0.599691,0.022064
1,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gpem2p4,Did you have a tour guide with you at all time...,826,1.614686e+09,0.013115,0.912336,0.074549
2,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gpem0pi,Looks like perfectly normal photographs from t...,5684,1.614686e+09,0.027856,0.431779,0.540365
3,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gpeo6jj,There's a Samsung AC condenser unit.,347,1.614687e+09,0.021405,0.763809,0.214785
4,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gperfs6,Only the leaders suffer from obesity..,383,1.614690e+09,0.799163,0.188452,0.012386
...,...,...,...,...,...,...,...,...,...,...,...,...
581,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gpghgbh,His profile says he is active in r/australianp...,11,1.614718e+09,0.042842,0.885059,0.072099
582,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gpgan8y,Honestly I did wonder about that.,7,1.614715e+09,0.195447,0.728520,0.076032
583,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gpgl10v,yes they use it for plants as well.,8,1.614720e+09,0.008736,0.735500,0.255765
584,"I visited North Korea recently, these are some...",56804,1.614684e+09,2768,https://www.reddit.com/gallery/lvzqh4,gpg06pw,I misread.,6,1.614711e+09,0.388594,0.569510,0.041896


# Prompt

In [31]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Cargar el modelo preentrenado y el tokenizador de GPT-2
modelo = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizador = GPT2Tokenizer.from_pretrained("gpt2")

# Definir un prompt
prompt = "Love is love"

# Tokenizar el prompt
input_ids = tokenizador.encode(prompt, return_tensors="pt")

# Generar una secuencia de texto completando el prompt
output = modelo.generate(input_ids, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2)

# Decodificar la secuencia generada
texto_generado = tokenizador.decode(output[0], skip_special_tokens=True)

print(texto_generado)



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Love is love.

I'm not sure if I'm going to be able to say this, but I do know that I love you. I know you're a wonderful person. You're the best person I've ever known. And I want to thank you for everything you've done for me. It's been a long time coming. But I can't wait to see you again.


# Combinación

In [11]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
from scipy.special import softmax
import pandas as pd

# Código para cargar el modelo de análisis de sentimiento
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Código para cargar el modelo GPT-2 y el tokenizador
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Ajustar la longitud máxima de la secuencia de entrada
max_sequence_length = 128

# Función para obtener los porcentajes de análisis de sentimiento
def obtener_porcentajes_sentimiento(texto):
    texto = preprocess(texto)
    encoded_input = tokenizer(texto, return_tensors='pt', max_length=max_sequence_length, truncation=True, padding=True)
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    return scores

# Función para generar un resumen de comentarios
def generar_resumen_comentarios(df, tipo_sentimiento):
    # Ordenar el DataFrame por comment_score en orden descendente y tomar los 10 comentarios con mayor puntaje
    comentarios_seleccionados = df.sort_values(by='comment_score', ascending=False).head(1)
    comentarios_seleccionados = comentarios_seleccionados['comment_body'].tolist()
    
    resumen = ""
    
    for comentario in comentarios_seleccionados:
        resumen += comentario + "\n"
    
    return resumen

# ... (Código para cargar los comentarios en el DataFrame df)

# Solicitar al usuario el tipo de resumen que desea
print("¿Qué tipo de resumen de comentarios deseas?")
print("1. Resumen de comentarios positivos")
print("2. Resumen de comentarios negativos")
print("3. Resumen de comentarios neutros")

while True:
    try:
        opcion = int(input("Ingresa el número de la opción: "))
        if opcion not in [1, 2, 3]:
            raise ValueError("Por favor, ingresa una opción válida.")
        break
    except ValueError as e:
        print(e)

if opcion == 1:
    tipo_sentimiento = 'sentimiento_positivo'
elif opcion == 2:
    tipo_sentimiento = 'sentimiento_negativo'
else:
    tipo_sentimiento = 'sentimiento_neutral'

# Generar el resumen de los 10 comentarios con mayor puntaje
resumen = generar_resumen_comentarios(df, tipo_sentimiento)

# Generar un resumen con GPT-2
prompt = "A continuación se muestra un resumen de comentarios:"
input_ids = gpt2_tokenizer.encode(prompt + "\n" + resumen, return_tensors="pt")
output = gpt2_model.generate(input_ids, max_length=150, num_return_sequences=1, no_repeat_ngram_size=2)
resumen_gpt2 = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)

# Imprimir el resumen generado por GPT-2
print("\nResumen generado por GPT-2:")
print(resumen_gpt2)



Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


¿Qué tipo de resumen de comentarios deseas?
1. Resumen de comentarios positivos
2. Resumen de comentarios negativos
3. Resumen de comentarios neutros
Ingresa el número de la opción: 1


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Resumen generado por GPT-2:
A continuación se muestra un resumen de comentarios:
Looks like perfectly normal photographs from the 70s.
I'm not sure if this is a coincidence or not, but I'm sure it's a good thing. I think it is the first time I've seen a photograph of a man in a suit. It's not like I was expecting it to be a very good photograph. The man is wearing a white suit, and he's wearing the same clothes as the other men. He's in his suit and his hair is in the middle of the suit; he looks like he is going to have a lot of fun. And I thought it was a great photograph, because it shows the man's face.
