IMPORTAR TRANSCRIPCIONES

In [None]:
import os


# IPython `%run` magic: executes the script at this path from within the notebook.
# Presumably this generates the transcription files read by later cells — verify.
%run /home/jovyan/work/scripts/crear_transcripciones.py

# print("Archivos en el directorio actual:")


LIMPIEZA DE DATOS

In [None]:
import os
import re
from bs4 import BeautifulSoup
from num2words import num2words

def clean_text(text):
    """Reduce raw transcript text to plain Spanish words separated by single spaces.

    Processing order:
      1. Strip HTML markup with BeautifulSoup, keeping only the text content.
      2. Remove bracketed ``[...]``, starred ``*...*`` and parenthesised
         ``(...)`` spans (non-greedy; ``.`` does not cross newlines).
      3. Spell out every run of digits as Spanish words via ``num2words``.
      4. Drop every character that is not an ASCII letter, an accented Spanish
         letter (including ``ü``/``ñ``) or whitespace.
      5. Collapse whitespace runs to one space and trim both ends.

    Args:
        text: Raw text, possibly containing HTML.

    Returns:
        The cleaned text as a single line.
    """
    text = BeautifulSoup(text, "html.parser").get_text()

    # Annotation-like spans are removed in the same order as before.
    for pattern in (r'\[.*?\]', r'\*.*?\*', r'\(.*?\)'):
        text = re.sub(pattern, '', text)

    # Pad each spelled-out number with spaces. Without the padding, numbers
    # that are separated only by punctuation ("10:30", "3.5", "2023-2024") or
    # that touch letters ("abc123") end up glued together once punctuation is
    # stripped below. The extra spaces are collapsed by the final step.
    text = re.sub(
        r'\d+',
        lambda m: f" {num2words(int(m.group()), lang='es')} ",
        text,
    )

    text = re.sub(r'[^A-Za-záéíóúüñÁÉÍÓÚÜÑ\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Cleaning stage: read every grammar-corrected .txt file, normalise it with
# clean_text(), and write the result under the same file name.
input_directory = '/home/jovyan/work/data/correccion_gramatical'
output_directory = '/home/jovyan/work/data/cleaned'

# exist_ok=True replaces the check-then-create pattern, which can fail if the
# directory appears between the check and the creation.
os.makedirs(output_directory, exist_ok=True)

for filename in os.listdir(input_directory):
    if not filename.endswith('.txt'):
        continue

    # Close the input file before processing so no handle stays open while
    # the text is being cleaned and written.
    with open(os.path.join(input_directory, filename), 'r', encoding='utf-8') as file:
        text = file.read()

    cleaned_text = clean_text(text)

    with open(os.path.join(output_directory, filename), 'w', encoding='utf-8') as out_file:
        out_file.write(cleaned_text)


CORRECCIÓN ORTOGRÁFICA Y GRAMATICAL

In [None]:
import os
import requests
import time
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Backend switch: True selects the local GPT-2 path, False the LanguageTool API path.
GPT = False

input_directory = '/home/jovyan/work/data/transcripciones'
output_directory = '/home/jovyan/work/data/correccion_gramatical'

# exist_ok=True replaces the check-then-create pattern, which can fail if the
# directory appears between the check and the creation.
os.makedirs(output_directory, exist_ok=True)

def limit_text(text, max_length=250):
    """Return at most the first *max_length* characters of *text*.

    Args:
        text: The string to truncate.
        max_length: Maximum number of characters to keep. ``None`` means
            "no limit". Negative values are treated as 0; a plain slice would
            instead drop characters from the end, which is not a length limit.

    Returns:
        The (possibly shortened) prefix of *text*.
    """
    if max_length is None:
        return text
    return text[:max(0, max_length)]

# GPT text-correction path: only set up when the GPT flag is enabled.
if GPT:
    # Load the pretrained 'gpt2-large' model and its matching tokenizer once,
    # so correct_text_gpt() can reuse them for every file.
    model_name = 'gpt2-large'
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    def correct_text_gpt(text):
        """Ask the loaded GPT-2 model to continue a "Correct this sentence" prompt.

        The prompt is the fixed English instruction followed by *text*. The
        first generated sequence (capped at 500 tokens) is decoded with
        special tokens removed and returned unchanged.

        NOTE(review): the decoded sequence likely still begins with the prompt
        itself — confirm whether callers expect it to be stripped.
        """
        prompt = "Correct this sentence: " + text
        prompt_ids = tokenizer.encode(prompt, return_tensors="pt")
        generated = model.generate(prompt_ids, max_length=500, num_return_sequences=1)
        first_sequence = generated[0]
        return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Función  corregir texto LanguageTool
else:
    def correct_text_api(text, language='es', retries=3, backoff=2, timeout=30):
        """POST *text* to the public LanguageTool ``/v2/check`` endpoint.

        Args:
            text: Text to check.
            language: Language code sent to the API.
            retries: Total number of attempts before giving up.
            backoff: Base of the exponential wait; the pause after failed
                attempt ``k`` (0-based) is ``backoff ** k`` seconds.
            timeout: Seconds to wait for the server on each attempt. Without
                it, a stalled connection would block the whole run forever.

        Returns:
            The decoded JSON response. ``None`` if ``retries`` is 0 or less,
            because no request is attempted.

        Raises:
            requests.RequestException: re-raised from the last attempt once
                all retries are exhausted.
        """
        url = 'https://api.languagetool.org/v2/check'
        payload = {
            'text': text,
            'language': language,
        }

        for attempt in range(retries):
            try:
                response = requests.post(url, data=payload, timeout=timeout)
                response.raise_for_status()
                return response.json()
            except requests.RequestException as e:
                print(f"Error en la solicitud a LanguageTool API: {e}. Intento {attempt + 1} de {retries}.")
                if attempt < retries - 1:
                    # Exponential backoff before the next attempt.
                    time.sleep(backoff ** attempt)
                else:
                    raise

    # aplicar correcciones manteniendo la estructura del texto
    def apply_corrections(text, matches):
        """Apply the first suggested replacement of each match to *text*.

        Args:
            text: The text that was checked.
            matches: Response dict with a ``'matches'`` list. Each match has
                ``'offset'``, ``'length'`` and a ``'replacements'`` list of
                ``{'value': ...}`` dicts. A missing ``'matches'`` key is
                treated as "no corrections".

        Returns:
            The text with the replacements applied. Matches without any
            replacement are ignored. When two matches overlap, only the one
            applied first (the right-most, or the earliest listed for equal
            offsets) is kept: applying both would splice the second
            replacement over already-modified text and corrupt it.

        Offsets are used directly as Python string indices.
        """
        corrections = [
            (m['offset'], m['offset'] + m['length'], m['replacements'][0]['value'])
            for m in matches.get('matches', [])
            if m.get('replacements')
        ]

        # Work right-to-left so offsets of pending corrections stay valid.
        corrections.sort(key=lambda c: c[0], reverse=True)

        # Left edge of the most recently applied correction; anything ending
        # beyond it overlaps text that has already been rewritten.
        applied_start = float('inf')
        for start, end, replacement in corrections:
            if end > applied_start:
                continue
            text = text[:start] + replacement + text[end:]
            applied_start = start

        return text

# Correction stage: truncate each transcript with limit_text(), correct it with
# the selected backend, and write the result under the same file name.
# Failures are reported per file so one bad file does not stop the run.
for filename in os.listdir(input_directory):
    if not filename.endswith('.txt'):
        continue

    source_path = os.path.join(input_directory, filename)
    with open(source_path, 'r', encoding='utf-8') as file:
        text = limit_text(file.read())

    target_path = os.path.join(output_directory, filename)
    try:
        if GPT:
            corrected_text = correct_text_gpt(text)
        else:
            matches = correct_text_api(text)
            corrected_text = apply_corrections(text, matches)

        with open(target_path, 'w', encoding='utf-8') as out_file:
            out_file.write(corrected_text)
        print(f"Archivo '{filename}' corregido y guardado.")
    except Exception as e:
        print(f"No se pudo corregir el archivo '{filename}': {e}")

print("Corrección gramatical completada.")


TOKENIZACIÓN

In [None]:
import os
import spacy

# spaCy pipeline "es_core_news_sm", loaded once and shared by tokenize().
nlp = spacy.load("es_core_news_sm")


def tokenize(text):
    """Return the text of every token spaCy produces for *text*, in order."""
    return [tok.text for tok in nlp(text)]

# Tokenization stage: tokenize every .txt file and write the tokens joined by
# single spaces under the same file name.
# Alternative input that was used previously: '/home/jovyan/work/data/corrected'
# NOTE(review): no cell visible here writes to 'normalized' — confirm that an
# earlier step produces this directory.
input_directory = '/home/jovyan/work/data/normalized'
output_directory = '/home/jovyan/work/data/tokenized'

# exist_ok=True replaces the check-then-create pattern, which can fail if the
# directory appears between the check and the creation.
os.makedirs(output_directory, exist_ok=True)

for filename in os.listdir(input_directory):
    if not filename.endswith('.txt'):
        continue

    # Close the input file before running the tokenizer.
    with open(os.path.join(input_directory, filename), 'r', encoding='utf-8') as file:
        text = file.read()

    tokens = tokenize(text)

    with open(os.path.join(output_directory, filename), 'w', encoding='utf-8') as out_file:
        out_file.write(' '.join(tokens))


ANÁLISIS DE RESULTADOS

COMPARACIÓN TEXTOS ORIGINALES VS CORREGIDOS

In [None]:
import os

def read_chars_from_n_to_m(file_path, n, m):
    """Return characters ``n`` (inclusive) to ``m`` (exclusive) of a UTF-8 text file.

    The whole file is read and then sliced, so out-of-range bounds yield a
    shorter (possibly empty) string rather than an error.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    window = slice(n, m)
    return contents[window]

def compare_texts(original, corrected):
    """Print the original and corrected excerpts, followed by a 50-character rule."""
    separator = "=" * 50
    report_lines = (
        f"Original: {original}",
        f"Corrected: {corrected}",
        separator,
    )
    print(*report_lines, sep="\n")

def compare_files(original_directory, corrected_directory, start_char, end_char):
    """Print matching excerpts of same-named .txt files from two directories.

    For every ``.txt`` file in *original_directory*, the characters
    ``start_char`` to ``end_char`` of that file and of the same-named file in
    *corrected_directory* are printed with compare_texts(). If the counterpart
    file does not exist, a notice is printed instead.
    """
    for file_name in os.listdir(original_directory):
        if not file_name.endswith('.txt'):
            continue

        corrected_path = os.path.join(corrected_directory, file_name)
        if not os.path.exists(corrected_path):
            print(f"Archivo corregido no encontrado para: {file_name}")
            continue

        original_path = os.path.join(original_directory, file_name)
        # The original excerpt is read first, then the corrected one.
        excerpts = [
            read_chars_from_n_to_m(path, start_char, end_char)
            for path in (original_path, corrected_path)
        ]
        compare_texts(*excerpts)

# Compare the raw transcripts with the files written by the tokenization stage.
original_directory = '/home/jovyan/work/data/transcripciones'
corrected_directory = '/home/jovyan/work/data/tokenized'

# Character window (start inclusive, end exclusive) sampled from each file.
start_char, end_char = 100, 350

compare_files(
    original_directory,
    corrected_directory,
    start_char,
    end_char,
)
