# IMPORTS

In [5]:
import os
import pandas as pd
import numpy as np
import re
import unicodedata
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

# CONFIGURAÇÕES

In [None]:
# Folder, relative to the working directory, that holds the training CSV files.
BASE_FOLDER_TRAIN = "treino"

# Training CSVs, one entry per dataset; each is joined to BASE_FOLDER_TRAIN
# and processed independently.
FILES = [
    f"train_{dataset}.csv"
    for dataset in ("literal_dinamico", "complexo_simples", "arcaico_moderno")
]

# Switches read by preprocess_operations; each key enables or disables
# one text-cleaning step.
preprocess_params = dict(
    lowercase=True,
    normalize_unicode=False,
    remove_extra_whitespace=True,
    remove_punct=False,
)

# PRÉ-PROCESSAMENTO

In [10]:
def preprocess_operations(text, params):
    """Apply the enabled text-cleaning steps to ``text`` and return the result.

    Steps run in a fixed order, each one gated by a key in ``params``:

    1. ``normalize_unicode`` (on when the key is absent): NFKC normalisation.
    2. ``lowercase`` (on when absent): lower-case the text.
    3. ``remove_punct`` (off when absent): replace every character that is
       neither a word character nor whitespace with a space.
    4. ``remove_extra_whitespace`` (on when absent): collapse whitespace runs
       into a single space and strip both ends.

    Parameters
    ----------
    text : object
        Value to clean. Anything that is not a ``str`` yields ``""``.
    params : dict
        Mapping of step name to a truthy/falsy switch.

    Returns
    -------
    str
        The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # (option key, default when the key is missing, transformation).
    # Order matters: punctuation removal introduces spaces that the final
    # whitespace step is expected to collapse.
    pipeline = (
        ("normalize_unicode", True, lambda s: unicodedata.normalize("NFKC", s)),
        ("lowercase", True, lambda s: s.lower()),
        ("remove_punct", False, lambda s: re.sub(r"[^\w\s]", " ", s)),
        ("remove_extra_whitespace", True, lambda s: re.sub(r"\s+", " ", s).strip()),
    )

    cleaned = text
    for option, default, transform in pipeline:
        if params.get(option, default):
            cleaned = transform(cleaned)
    return cleaned

def preprocess_data(
    path,
    params=None,
    test_size=0.15,
    random_state=10,
    sep=";",
    col_text="text",
    col_label="style",
):
    """Load a labelled CSV, clean its text column and split it into train/test.

    With only ``path`` supplied the function behaves exactly as before: it
    reads a ``;``-separated file, keeps the ``text`` and ``style`` columns,
    cleans the text with the module-level ``preprocess_params`` and performs a
    stratified 85/15 split seeded with 10.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    params : dict, optional
        Options forwarded to ``preprocess_operations``. ``None`` means "use the
        module-level ``preprocess_params``", looked up at call time.
    test_size : float or int, default 0.15
        Forwarded to ``train_test_split``.
    random_state : int, default 10
        Seed used for both the initial shuffle and the train/test split.
    sep : str, default ";"
        Field separator passed to ``pandas.read_csv``.
    col_text, col_label : str
        Names of the text column and of the label column.

    Returns
    -------
    tuple or None
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` arrays hold the
        preprocessed strings and the ``y`` arrays hold the integer codes
        produced by ``LabelEncoder``. ``None`` when ``path`` does not exist
        (a warning is printed instead of raising).

    Raises
    ------
    KeyError
        If ``col_text`` or ``col_label`` is not a column of the file.
    ValueError
        If no rows remain after dropping missing values, or if
        ``train_test_split`` rejects the data (e.g. stratification fails).
    """
    if not os.path.exists(path):
        # Deliberate best-effort: callers skip missing files instead of failing.
        print(f"Aviso: {path} não encontrado.")
        return None

    if params is None:
        params = preprocess_params

    df = pd.read_csv(path, sep=sep)

    # Fail with an explicit message rather than pandas' generic indexing error.
    missing = [col for col in (col_text, col_label) if col not in df.columns]
    if missing:
        raise KeyError(
            f"{path}: missing column(s) {missing}; found {list(df.columns)}"
        )

    df = df[[col_text, col_label]].dropna()
    if df.empty:
        raise ValueError(f"{path}: no rows left after dropping missing values")

    # The shuffle is kept (train_test_split shuffles again) so that the
    # resulting split stays identical to the one produced historically.
    df = shuffle(df, random_state=random_state).reset_index(drop=True)

    df["text_preproc"] = df[col_text].apply(
        lambda raw: preprocess_operations(raw, params)
    )

    # NOTE(review): the fitted encoder is not returned, so callers cannot call
    # inverse_transform on predictions. Per the scikit-learn docs the codes
    # follow the sorted order of the distinct labels; confirm before relying
    # on that to decode.
    le = LabelEncoder()
    y = le.fit_transform(df[col_label])
    X = df["text_preproc"].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [11]:
# Maps each CSV file name to its train/test split; files that could not be
# loaded are simply absent from the mapping.
datasets = {}

for file_name in FILES:
    path = os.path.join(BASE_FOLDER_TRAIN, file_name)
    print(f"\nProcessando: {file_name}")
    result = preprocess_data(path)

    # preprocess_data returns None when the file is missing: skip it.
    if result is None:
        continue

    X_train, X_test, y_train, y_test = result
    datasets[file_name] = dict(
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )


Processando: train_literal_dinamico.csv

Processando: train_complexo_simples.csv

Processando: train_arcaico_moderno.csv
