# Preparação

In [None]:
import numpy as np

%pip install -U tensorflow tensorflow_hub sentence-transformers openai



In [None]:
# Mount Google Drive at /content/drive; every project path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root of the project on the mounted shared drive, plus the folders that hold the texts of interest.
project_folder = "/content/drive/Shareddrives/Projeto de Estatística/"

article_folder = f"{project_folder}TXTs Artigos (Completos)/"
abstract_folder = f"{project_folder}TXTs Abstracts/"

# One sub-folder per abstract source: the original abstracts and the three generated sets.
ab_OG_folder = f"{abstract_folder}ORIGINAL/"

ab_Bard_folder = f"{abstract_folder}BARD/"
ab_Bing_folder = f"{abstract_folder}BING/"
ab_GPT_folder = f"{abstract_folder}CHATGPT/"

# File names carry a three-digit, zero-padded article number: "001" ... "150".
article_ids = [str(number).zfill(3) for number in range(1, 151)]

# Full paths of every file of interest; position k in each list refers to article k+1.
article_files = [f"{article_folder}No_Abstract{article_id}.txt" for article_id in article_ids]
ab_OG_files = [f"{ab_OG_folder}Abstract_OG{article_id}.txt" for article_id in article_ids]

ab_Bard_files = [f"{ab_Bard_folder}Abstract_BARD{article_id}.txt" for article_id in article_ids]
ab_Bing_files = [f"{ab_Bing_folder}Abstract_BING{article_id}.txt" for article_id in article_ids]
ab_GPT_files = [f"{ab_GPT_folder}Abstract_GPT{article_id}.txt" for article_id in article_ids]

In [None]:
def _read_text(path):
    """Return the whole content of the text file at ``path``.

    The file is opened in a ``with`` block so the handle is released even when
    reading fails, and the encoding is explicit so the result does not depend
    on the platform default.

    Args:
      path: path of the text file to read.

    Returns:
      The file content as a single string.

    Raises:
      OSError: if the file cannot be opened.
      UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(path, encoding="utf-8") as text_file:
        return text_file.read()


# One text per article, in the same order as the path lists:
# position k of every list below belongs to article k+1.
OG_article_texts = [_read_text(path) for path in article_files]
OG_abstract_texts = [_read_text(path) for path in ab_OG_files]
Bard_texts = [_read_text(path) for path in ab_Bard_files]
Bing_texts = [_read_text(path) for path in ab_Bing_files]
GPT_texts = [_read_text(path) for path in ab_GPT_files]

# Pré-processamento

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import spacy
# Load spaCy's "en_core_web_sm" pipeline.
# NOTE(review): `nlp` is not referenced by any other cell visible in this notebook — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

def cos_sim(sentence1_emb, sentence2_emb):
    """
    Row-wise cosine similarity between two aligned sets of sentence embeddings.

    Only the aligned pairs are computed (row k of the first input against
    row k of the second), instead of building the full pairwise matrix and
    discarding everything but its diagonal.

    Args:
      sentence1_emb: dense array-like of shape (n, d) with the embeddings of
        the first sentences; a single 1-D vector is treated as one row.
      sentence2_emb: dense array-like of shape (m, d) with the embeddings of
        the second sentences; a single 1-D vector is treated as one row.

    Returns:
      1-D NumPy array with one similarity per aligned pair.

      If sentence1_emb=[a,b,c] and sentence2_emb=[x,y,z]
      the result is [cosine_similarity(a,x), cosine_similarity(b,y), cosine_similarity(c,z)].

      When the inputs have a different number of rows, only the first
      min(n, m) pairs are compared. A pair containing an all-zero vector
      gets similarity 0.

    Raises:
      ValueError: if the two inputs do not have the same embedding dimension.
    """
    emb1 = np.atleast_2d(np.asarray(sentence1_emb))
    emb2 = np.atleast_2d(np.asarray(sentence2_emb))

    # Keep only the rows that have a partner in the other input.
    n_pairs = min(len(emb1), len(emb2))
    emb1, emb2 = emb1[:n_pairs], emb2[:n_pairs]

    # Dot product of each aligned pair of rows.
    dots = np.einsum("ij,ij->i", emb1, emb2)
    norms = np.linalg.norm(emb1, axis=1) * np.linalg.norm(emb2, axis=1)

    # A zero vector has a zero dot product with anything; dividing by 1
    # instead of 0 yields similarity 0 rather than NaN.
    norms[norms == 0] = 1
    return dots / norms

# Comparação

[Semantic Textual Similarity](https://towardsdatascience.com/semantic-textual-similarity-83b3ca4a840e)

O Universal Sentence Encoder (USE) foi escolhido por ser o modelo mais preciso capaz de lidar com textos longos.


In [None]:
import tensorflow as tf
import tensorflow_hub as hub

# Configure every GPU that TensorFlow detects before the model is loaded.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    # Turn on TensorFlow's memory-growth option for this GPU to control GPU memory usage.
    tf.config.experimental.set_memory_growth(gpu, True)

# Load the pre-trained Universal Sentence Encoder (version 4) from TensorFlow Hub.
embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

In [None]:
# Abstract collections in a fixed order; the position doubles as the source id
# used by the cells below:
#   0 = OG (original)
#   1 = Bard
#   2 = Bing
#   3 = GPT
abstracts_list = [
    OG_abstract_texts,
    Bard_texts,
    Bing_texts,
    GPT_texts,
]

# Encode the full articles once, then each abstract collection in the order above.
article_embeddings = embed(OG_article_texts)
abstracts_embeddings = list(map(embed, abstracts_list))

In [None]:
# One article-vs-abstract cosine-similarity matrix per abstract source,
# in the same order as `abstracts_embeddings`.
similarities = [
    cosine_similarity(article_embeddings, source_embeddings)
    for source_embeddings in abstracts_embeddings
]

# `similarities` is a list of 4 similarity matrices, one per abstract source.
# similarities[i][j][j] is the similarity between abstract j+1 of source i
# and its corresponding article j+1.

# Salvando os Resultados

In [None]:
import os

# Folder under the project root that receives the result files.
results_folder = project_folder + "Análise/Resultados"

# exist_ok=True creates the folder (and any missing parents) when needed and is
# a no-op when it is already there, without a separate check-then-create step.
os.makedirs(results_folder, exist_ok=True)

# Trailing separator so file names can be appended by plain concatenation.
results_folder += '/'

In [None]:
def save_similarities(matrix, filename):
    """Write the diagonal of a similarity matrix to a text file, one value per line.

    Entry [j][j] of the matrix is the similarity between article j+1 and its
    own abstract, so the diagonal holds exactly the article/abstract pairs of
    interest. Each value is written with 6 decimal places and a decimal comma
    instead of a decimal point.

    Args:
      matrix: 2-D similarity matrix (articles x abstracts).
      filename: path of the text file to create or overwrite.

    Raises:
      OSError: if the file cannot be opened for writing.
    """
    # The `with` block closes the file even if formatting or writing fails.
    with open(filename, "w", encoding="utf-8") as out_file:
        for similarity in np.diag(matrix):
            # No thousands separator in the format spec: it would also be a
            # comma and would clash with the decimal comma produced below.
            value = "{:.6f}".format(similarity).replace(".", ",")
            out_file.write(value + "\n")


# Position of each abstract source in `similarities` (0 = OG, 1 = Bard,
# 2 = Bing, 3 = GPT) paired with the suffix of its result file.
for source_index, source_name in enumerate(["OG", "BARD", "BING", "GPT"]):
    save_similarities(
        similarities[source_index],
        results_folder + "Comparações_" + source_name + ".txt",
    )