In [2]:
import os
import re
import pandas as pd

# Folder containing the .tex files
folder_path = "../data/output_tex"

# Regex patterns, one per LaTeX environment of interest; each captures the
# environment body (everything between \begin{...} and \end{...}).
patterns = {
    "theorem": r"\\begin\{theorem\}(.*?)\\end\{theorem\}",
    "definition": r"\\begin\{definition\}(.*?)\\end\{definition\}",
    "proof": r"\\begin\{proof\}(.*?)\\end\{proof\}",
    "proposition": r"\\begin\{proposition\}(.*?)\\end\{proposition\}",
    "lemma": r"\\begin\{lemma\}(.*?)\\end\{lemma\}",
    "example": r"\\begin\{example\}(.*?)\\end\{example\}",
}

# Accumulates one (type, content, article_id) tuple per extracted block
extracted_blocks = []

# os.listdir returns entries in arbitrary order; sort them so the row order of
# df_blocks (and any seeded sampling done on it later) is reproducible.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".tex"):
        continue

    filepath = os.path.join(folder_path, filename)
    with open(filepath, "r", encoding="utf-8") as f:
        latex_text = f.read()

    article_id = os.path.splitext(filename)[0]  # file name without the .tex extension

    for label, pattern in patterns.items():
        # DOTALL lets "." span newlines, since environment bodies are multi-line
        matches = re.findall(pattern, latex_text, re.DOTALL)
        for match in matches:
            clean_text = match.strip()
            if len(clean_text) > 20:  # skip trivial fragments
                extracted_blocks.append((label, clean_text, article_id))

# One row per extracted block: its environment type, its text and its source article
df_blocks = pd.DataFrame(extracted_blocks, columns=["type", "content", "article_id"])


In [3]:
# Report the (rows, columns) shape of the extracted blocks and preview the first rows.
print(df_blocks.shape)
df_blocks.head()

(1763, 3)


Unnamed: 0,type,content,article_id
0,theorem,\label{thm_lzero}\nThe approximate peak time f...,2502.01037
1,theorem,\label{thm_linfty}\nThe approximate peak time ...,2502.01037
2,definition,\label{def_lzero}\nWe define approximate peak ...,2502.01037
3,definition,\label{def_linfty}\nAssume \eqref{ellcondition...,2502.01037
4,proof,We first study the asymptotic behavior of the ...,2502.01037


In [4]:
import pandas as pd
import re

def clean_latex(text):
    r"""Strip structural LaTeX commands from a block and flatten it to one line.

    Every ``\label{...}``, then every ``\begin{...}``, then every ``\end{...}``
    command is deleted (in that order). Each newline is then replaced by a
    single space and leading/trailing whitespace is trimmed.

    Args:
        text: The LaTeX source of one block.

    Returns:
        The cleaned, single-line string.
    """
    # Applied sequentially: removing a \label first can expose a \begin/\end match.
    markup_patterns = (
        r'\\label\{.*?\}',
        r'\\begin\{.*?\}',
        r'\\end\{.*?\}',
    )
    for markup in markup_patterns:
        text = re.sub(markup, '', text)

    # Join the lines with a space so multi-line text becomes a single line
    flattened = ' '.join(text.split('\n'))
    return flattened.strip()

#df_sample["clean_content"] = df_sample["content"].apply(clean_latex)


In [5]:
from itertools import combinations
import pandas as pd

def generar_parejas(df, column):
    """Build positive pairs from blocks sharing both article and type.

    The frame is grouped by ("article_id", "type"); within each group every
    unordered pair of values taken from ``column`` becomes one row labelled 1.

    Args:
        df: DataFrame with "article_id" and "type" columns plus ``column``.
        column: Name of the column holding the text to pair up.

    Returns:
        DataFrame with columns "text_a", "text_b" and "label" (always 1).
    """
    grouped = df.groupby(["article_id", "type"])
    positive_rows = [
        (first, second, 1)
        for _, group in grouped
        for first, second in combinations(group[column], 2)
    ]
    return pd.DataFrame(positive_rows, columns=["text_a", "text_b", "label"])


In [6]:
# Overwrite the "content" column in place with its clean_latex-processed text,
# then preview the result.
df_blocks["content"] = df_blocks["content"].apply(clean_latex)
df_blocks.head()

Unnamed: 0,type,content,article_id
0,theorem,The approximate peak time for a small fluoresc...,2502.01037
1,theorem,The approximate peak time for a large fluoresc...,2502.01037
2,definition,We define approximate peak time $t^{p}_0$ by t...,2502.01037
3,definition,Assume \eqref{ellcondition1} and \eqref{ellcon...,2502.01037
4,proof,We first study the asymptotic behavior of the ...,2502.01037


In [7]:
# Persist the cleaned blocks to CSV without the DataFrame index column.
df_blocks.to_csv("df_blocks.csv", index=False)


In [28]:
# Build the positive (label 1) pairs from the "content" column and report their shape.
parelles_positives =  generar_parejas(df_blocks, column = "content")
print(parelles_positives.shape)


(11091, 3)


In [29]:
from itertools import product
import pandas as pd
import random

def generar_parejas_negativas_fortes(df, num_positives, column, ratio=2, seed=42):
    """Sample "strong" negative pairs: different article AND different type.

    All ordered pairs (x, y) are enumerated where x and y come from different
    articles and from blocks of different types, each labelled 0. The
    candidates are shuffled with a seeded generator and the first
    ``int(num_positives * ratio)`` are kept (or all of them if there are fewer).

    Args:
        df: DataFrame with "article_id" and "type" columns plus ``column``.
        num_positives: Number of positive pairs the negatives are balanced against.
        column: Name of the column holding the text to pair up.
        ratio: Negatives to keep per positive; may be fractional, the resulting
            count is truncated to an integer.
        seed: Seed for the shuffle. A private ``random.Random`` instance is used,
            so the global ``random`` state is left untouched.

    Returns:
        DataFrame with columns "text_a", "text_b" and "label" (always 0).
    """
    # Private generator: same shuffle sequence as seeding the module-level RNG
    # with `seed`, but without the global side effect.
    rng = random.Random(seed)

    # Index the texts once as article -> {type -> [texts]} instead of
    # re-filtering the frame for every article pair. sort=False keeps
    # first-appearance order, so candidates are enumerated in the same order
    # as iterating over .unique() values.
    texts_by_article = {}
    for (article_id, block_type), group in df.groupby(["article_id", "type"], sort=False):
        texts_by_article.setdefault(article_id, {})[block_type] = list(group[column])

    # Enumerate every candidate negative pair.
    # NOTE: this materialises the full candidate list in memory before sampling.
    negatives = []
    for a1, types1 in texts_by_article.items():
        for a2, types2 in texts_by_article.items():
            if a1 == a2:
                continue
            for t1, texts1 in types1.items():
                for t2, texts2 in types2.items():
                    if t1 == t2:
                        continue
                    negatives.extend((x, y, 0) for x, y in product(texts1, texts2))

    # Shuffle, then truncate according to the requested proportion. int() keeps
    # the slice bound valid when `ratio` is a float.
    rng.shuffle(negatives)
    num_negatives = min(len(negatives), int(num_positives * ratio))
    sampled_negatives = negatives[:num_negatives]

    return pd.DataFrame(sampled_negatives, columns=["text_a", "text_b", "label"])


In [30]:
# Sample two negatives per positive pair and report the resulting shape.
parelles_negatives = generar_parejas_negativas_fortes(df_blocks, num_positives=len(parelles_positives), ratio=2, column = "content")  # strong negatives: different type and different article
print(parelles_negatives.shape)

(22182, 3)


In [31]:
# Stack positive and negative pairs into a single frame with a fresh 0..n-1 index.
df_parelles = pd.concat([parelles_positives, parelles_negatives], ignore_index=True)
print(df_parelles.shape)

(33273, 3)


In [None]:
# Write the combined pairs to CSV.
# NOTE(review): unlike the df_blocks export, index=False is not passed here, so the
# row index is written as an extra unnamed first column — confirm this is intended.
df_parelles.to_csv("parelles.csv")

In [33]:
# Preview the first rows of df_blocks again.
df_blocks.head()

Unnamed: 0,type,content,article_id
0,theorem,The approximate peak time for a small fluoresc...,2502.01037
1,theorem,The approximate peak time for a large fluoresc...,2502.01037
2,definition,We define approximate peak time $t^{p}_0$ by t...,2502.01037
3,definition,Assume \eqref{ellcondition1} and \eqref{ellcon...,2502.01037
4,proof,We first study the asymptotic behavior of the ...,2502.01037
