In [4]:
import os
import pandas as pd
import re

In [None]:
# Read and load the raw result files from ./miscellaneous

folder_path = "./miscellaneous"
# os.listdir() returns entries in arbitrary order; sort them so the order of
# `dataframes` is reproducible from one run to the next. Entries that are not
# regular files are skipped so that open() below cannot fail on a directory.
files = sorted(
    f for f in os.listdir(folder_path)
    if f.startswith("results_raw") and os.path.isfile(os.path.join(folder_path, f))
)

dataframes = []  # one dict per file: {"filename": <name>, "content": <list of raw lines>}

for file in files:
    file_path = os.path.join(folder_path, file)
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()  # raw lines, trailing newlines included
    # Store content for later processing
    dataframes.append({"filename": file, "content": lines})

In [6]:
# Split each file's lines into its HEADER and BODY sections
for df in dataframes:
    content = df["content"]
    # Compare stripped lines so that a marker still matches when it carries
    # trailing whitespace or is the last line of a file with no final newline.
    stripped = [line.strip() for line in content]
    try:
        header_start = stripped.index("HEADER")
        # BODY has to come after HEADER, so start searching just past it
        body_start = stripped.index("BODY", header_start + 1)
    except ValueError as exc:
        raise ValueError(
            f"{df.get('filename', '<unknown>')}: expected a HEADER line followed by a BODY line"
        ) from exc

    # Slice between the markers and drop blank lines (lines are already stripped)
    header_data = [line for line in stripped[header_start + 1 : body_start] if line]
    body_data = [line for line in stripped[body_start + 1 :] if line]  # from BODY to end of file

    df["header"] = header_data
    df["body"] = body_data

# NOTE(review): this prints every line of every loaded file; consider showing
# only a short summary per file once the parsing has been verified.
print(dataframes)

[{'filename': 'results_raw_lottery_id_177_lista_del_sorteo_extraordinario_no._390.txt', 'content': ['HEADER\n', 'LISTA DEL SORTEO EXTRAORDINARIO NO. 390\n', '|PRIMER PREMIO 16780 ||| SEGUNDO PREMIO 46129 ||| TERCER PREMIO 77885|\n', 'REINTEGROS 0,9 ,5\n', 'FECHA DEL SORTEO: 08/06/2024 --- FECHA DE CADUCIDAD: 09/12/2024\n', 'VER LISTA DE COMBINACIONES\n', '\n', 'BODY\n', 'CENTENARES\n', '00044 P .... 600.00\n', '00080 DT .... 600.00\n', '00129 TT .... 1,000.00\n', '00180 DT .... 600.00\n', '00228 P .... 800.00\n', '00280 DT .... 600.00\n', '00380 DT .... 600.00\n', '00422 P .... 800.00\n', '00480 DT .... 600.00\n', '00580 DT .... 600.00\n', '00680 DT .... 600.00\n', '00780 TT .... 1,200.00\n', '00813 P .... 600.00\n', '00870 PR .... 950.00\n', '00876 P .... 15,000.00\n', 'VENDIDO POR MANUEL ROLANDO QUINILLA CHIVALAM, DE ANTIGUA GUATEMALA, SACATEPÉQUEZ\n', '00879 PR .... 950.00\n', '00880 DT .... 600.00\n', '00885 TT .... 900.00\n', '00980 DT .... 600.00\n', '\n', '00MIL\n', '01080 DT ..

In [9]:
# Process HEADER: build one record of draw metadata per loaded file
def _search_header(pattern, text, filename):
    """Return the match of `pattern` in `text`, or raise ValueError naming the file."""
    match = re.search(pattern, text)
    if match is None:
        raise ValueError(f"{filename}: header does not match pattern {pattern!r}")
    return match


sorteos_data = []
for df in dataframes:
    header = df["header"]
    filename = df.get("filename", "<unknown>")
    if not header:
        raise ValueError(f"{filename}: header section is empty")

    # Join once; every field except the draw number is searched in the whole header
    header_text = " ".join(header)

    # The dot is escaped so that only a literal "NO. <digits>" matches
    numero_sorteo = _search_header(r"NO\. (\d+)", header[0], filename).group(1)
    fecha_sorteo = _search_header(r"FECHA DEL SORTEO: ([\d/]+)", header_text, filename).group(1)
    fecha_caducidad = _search_header(r"FECHA DE CADUCIDAD: ([\d/]+)", header_text, filename).group(1)
    premios = _search_header(
        r"PRIMER PREMIO (\d+) \|\|\| SEGUNDO PREMIO (\d+) \|\|\| TERCER PREMIO (\d+)",
        header_text,
        filename,
    )
    primer_premio, segundo_premio, tercer_premio = premios.groups()
    # The captured list may contain stray spaces (e.g. "0,9 ,5"); remove them
    reintegros = _search_header(r"REINTEGROS ([\d, ]+)", header_text, filename).group(1).replace(" ", "")

    sorteos_data.append({
        "numero_sorteo": numero_sorteo,
        "fecha_sorteo": fecha_sorteo,
        "fecha_caducidad": fecha_caducidad,
        "primer_premio": primer_premio,
        "segundo_premio": segundo_premio,
        "tercer_premio": tercer_premio,
        "reintegros": reintegros
    })

# All columns are kept as strings exactly as captured from the header text
sorteos_df = pd.DataFrame(sorteos_data)
sorteos_df

Unnamed: 0,numero_sorteo,fecha_sorteo,fecha_caducidad,primer_premio,segundo_premio,tercer_premio,reintegros
0,390,08/06/2024,09/12/2024,16780,46129,77885,95
1,3047,15/06/2024,16/12/2024,45100,74061,53248,18
2,3048,22/06/2024,23/12/2024,74569,47785,20555,955
3,3049,29/06/2024,30/12/2024,8241,29870,78927,107
4,3050,06/07/2024,06/01/2025,72487,10469,36580,790
5,3051,13/07/2024,13/01/2025,967,32102,41229,729
6,391,21/07/2024,21/01/2025,1095,96032,12592,522
7,3052,27/07/2024,27/01/2025,27844,54823,6733,433
