In [19]:
#reader
from typing import Dict, List, Optional

def read_file(path: str) -> List[str]:
    """
    Read a text file and return its lines.

    Lines are returned exactly as stored, including their trailing newline
    characters. A missing file is reported on stdout and treated as empty.

    :param path: Path to the input file
    :type path: str
    :return: List of lines read from the file, or an empty list if the file
        does not exist
    :rtype: List[str]
    """
    try:
        # Decode explicitly as UTF-8 so accented names are read the same way
        # on every platform instead of depending on the locale default.
        with open(path, 'r', encoding='utf-8') as file:
            return file.readlines()
    except FileNotFoundError:
        print('[ERROR] File not found', path)
        return []

In [20]:
# Smoke-test the reader on the sample input file and display the raw lines.
read_file('data/data.txt')

['Juan,25,Lima\n',
 'Maria,17,Arequipa\n',
 'Carlos,30,\n',
 'Ana,abc,Trujillo\n',
 'Luis,45,Cusco\n',
 'Sofia,22,Lima\n',
 'Pedro,19,Piura\n',
 'Lucia,18,Chiclayo\n',
 'Miguel,52,Lima\n',
 'Rosa,27,Arequipa\n',
 'Jorge,15,Cusco\n',
 'Elena,33,Trujillo\n',
 'Diego,29,Lima\n',
 'Valeria,21,\n',
 'Andres,40,Piura\n',
 'Patricia,abc,Lima\n',
 'Ricardo,38,Arequipa\n',
 'Camila,23,Chiclayo\n',
 'Fernando,50,Lima\n',
 'Gabriela,20,Cusco']

In [28]:
#validator
def validate_line(line: str) -> bool:
    """
    Check whether a raw input line is a well-formed, acceptable record.

    Expected format: name,age,city

    Conditions:
    - Must have exactly 3 comma-separated values
    - Age must be a whole number (surrounding spaces are tolerated)
    - Age must be >= 18
    - City must not be empty

    :param line: Raw line as read from the input file
    :type line: str
    :return: True when every condition holds, False otherwise
    :rtype: bool
    """
    # e.g. 'Gabriela,20,Cusco' -> ['Gabriela', '20', 'Cusco']
    parts = line.strip().split(',')

    if len(parts) != 3:
        return False
    _name, age, city = parts

    # int() tolerates surrounding whitespace, so the validator should too.
    age = age.strip()

    # isdecimal() instead of isdigit(): isdigit() also accepts characters
    # such as '²' that int() cannot parse, which would raise ValueError below.
    if not age.isdecimal():
        return False

    if int(age) < 18:
        return False

    if not city.strip():
        return False

    return True

In [29]:
# transformer
from typing import Dict
def clean_text(text: str) -> str:
    """
    Normalize a text field.

    Leading and trailing whitespace is removed and the result is title-cased.

    :param text: Raw field value
    :type text: str
    :return: Trimmed, title-cased text
    :rtype: str
    """
    trimmed = text.strip()
    return trimmed.title()

def transform_line(line: str) -> Optional[Dict[str, object]]:
    """
    Transform a raw 'name,age,city' line into a structured dictionary.

    The name is upper-cased, the age is converted to int and the city is
    cleaned with clean_text.

    :param line: Raw line in the form name,age,city
    :type line: str
    :return: Dictionary with keys "name", "age" and "city", or None when the
        line cannot be parsed (the offending line is printed)
    :rtype: Optional[Dict[str, object]]
    """
    try:
        name, age, city = line.strip().split(',')
        return {
            "name": clean_text(name).upper(),
            "age": int(age),
            "city": clean_text(city),
        }
    except ValueError:
        # ValueError covers both failure modes of a malformed line: a wrong
        # number of fields (unpacking) and a non-numeric age (int()).
        print('[ERROR]: ', line)
        return None

In [30]:
# Quick check of transform_line on a single well-formed record.
transform_line('Gabriela,20,Cusco')

{'name': 'GABRIELA', 'age': 20, 'city': 'Cusco'}

In [31]:
#writer

from typing import List, Dict

def write_file(data: List[Dict[str, object]], path: str) -> None:
    """
    Write cleaned records to a text file.

    The file starts with a "Name, Age, City" header line, followed by one
    line per record with the fields separated by ", ". An existing file at
    ``path`` is overwritten.

    :param data: Records with the keys 'name', 'age' and 'city'
    :type data: List[Dict[str, object]]
    :param path: Path of the output file
    :type path: str
    :return: None
    """
    # Encode explicitly as UTF-8 so accented names are written the same way
    # on every platform instead of depending on the locale default.
    with open(path, 'w', encoding='utf-8') as file:
        file.write("Name, Age, City\n")
        file.writelines(
            f"{item['name']}, {item['age']}, {item['city']}\n" for item in data
        )

In [32]:
# Use a self-contained sample record so this cell runs on a fresh kernel;
# the pipeline result is only assigned to `data` in a later cell.
sample_data = [{'name': 'GABRIELA', 'age': 20, 'city': 'Cusco'}]
write_file(sample_data, 'data/data_clean.txt')

In [33]:
# main
def process_data(path: str) -> List[Dict[str, object]]:
    """
    Orchestrate the ETL process: Read -> Validate -> Transform.

    Every line of the input file is validated; invalid lines are reported on
    stdout with their 1-based line number and skipped. Valid lines are
    transformed and collected.

    :param path: Path to the input file
    :type path: str
    :return: Transformed records for all valid lines
    :rtype: List[Dict[str, object]]
    """
    clean_data = []

    for idx, line in enumerate(read_file(path), start=1):
        if not validate_line(line):
            # Drop the line terminator so the report does not print a blank
            # line after every invalid entry.
            print(f"[INVALID LINE {idx}] {line.rstrip(chr(13) + chr(10))}")
            continue

        person = transform_line(line)
        # transform_line yields a falsy result when it cannot parse the line.
        if person:
            clean_data.append(person)

    return clean_data


In [34]:
# Run the read -> validate -> transform pipeline on the sample file and show
# the records that passed validation.
data = process_data('data/data.txt')
print(data)

[INVALID LINE 2] Maria,17,Arequipa

[INVALID LINE 3] Carlos,30,

[INVALID LINE 4] Ana,abc,Trujillo

[INVALID LINE 11] Jorge,15,Cusco

[INVALID LINE 14] Valeria,21,

[INVALID LINE 16] Patricia,abc,Lima

[{'name': 'JUAN', 'age': 25, 'city': 'Lima'}, {'name': 'LUIS', 'age': 45, 'city': 'Cusco'}, {'name': 'SOFIA', 'age': 22, 'city': 'Lima'}, {'name': 'PEDRO', 'age': 19, 'city': 'Piura'}, {'name': 'LUCIA', 'age': 18, 'city': 'Chiclayo'}, {'name': 'MIGUEL', 'age': 52, 'city': 'Lima'}, {'name': 'ROSA', 'age': 27, 'city': 'Arequipa'}, {'name': 'ELENA', 'age': 33, 'city': 'Trujillo'}, {'name': 'DIEGO', 'age': 29, 'city': 'Lima'}, {'name': 'ANDRES', 'age': 40, 'city': 'Piura'}, {'name': 'RICARDO', 'age': 38, 'city': 'Arequipa'}, {'name': 'CAMILA', 'age': 23, 'city': 'Chiclayo'}, {'name': 'FERNANDO', 'age': 50, 'city': 'Lima'}, {'name': 'GABRIELA', 'age': 20, 'city': 'Cusco'}]


In [35]:
# Persist the records returned by process_data to the output file.
write_file(data, 'data/data_clean.txt')