In [1]:


# -*- coding: utf-8 -*-

import os
import re

import chardet
import joblib
import mrmr  # Pure-Python mRMR selection :contentReference[oaicite:1]{index=1}
import nlpaug.augmenter.word as naw  # Contextual augmentation :contentReference[oaicite:0]{index=0}
import pandas as pd
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [2]:
# import nltk
# nltk.download('wordnet')

In [3]:




# =============================================================================
# 1. Robust File Loader (encoding fallbacks)
# =============================================================================
def read_text_file(path: str) -> str:
    """Try multiple encodings, then chardet, always return str."""
    for enc in ('utf-8', 'utf-8-sig', 'latin-1', 'cp1252'):
        try:
            with open(path, 'r', encoding=enc) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
    raw = open(path, 'rb').read()
    guess = chardet.detect(raw)
    enc = guess.get('encoding') or 'latin-1'
    try:
        return raw.decode(enc, errors='replace')
    except (LookupError, UnicodeDecodeError):
        return raw.decode('latin-1', errors='replace')


# Load the eight-topic Vietnamese corpus
base_dir = './Train_Full'
records = []
for topic in os.listdir(base_dir):
    topic_path = os.path.join(base_dir, topic)
    if os.path.isdir(topic_path):
        for fname in os.listdir(topic_path):
            if fname.lower().endswith('.txt'):
                txt = read_text_file(os.path.join(topic_path, fname))
                if txt:
                    records.append({'text': txt, 'label': topic})

df = pd.DataFrame(records)
print(f"Loaded {len(df)} documents across {df['label'].nunique()} topics.")


# =============================================================================
# 2. Preprocessing & Tokenization (named functions)
# =============================================================================
# Load Vietnamese stopwords
with open('vietnamese-stopwords.txt', encoding='utf-8') as f:
    VI_STOPWORDS = set(w.strip() for w in f if w.strip())

# Regexes for noise removal
EMOJI_RE    = re.compile("[\U0001F600-\U0001F64F"
                         "\U0001F300-\U0001F5FF"
                         "\U0001F680-\U0001F6FF"
                         "\U0001F1E0-\U0001F1FF]+", flags=re.UNICODE)
REPEAT_PUNC = re.compile(r'([!?.,])\1+')
CONTROL_RE  = re.compile(r'[\x00-\x1F\x7F]+')


def clean_text(text: str) -> str:
    """Unicode normalize, lowercase, strip URLs/emails/digits/punc, remove emojis/control."""
    text = unicodedata.normalize('NFC', text).lower()
    text = re.sub(r'http\S+|www\.\S+|\S+@\S+|\d+', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = EMOJI_RE.sub(' ', text)
    text = REPEAT_PUNC.sub(r'\1', text)
    text = CONTROL_RE.sub(' ', text)
    return re.sub(r'\s+', ' ', text).strip()


def tokenize(text: str) -> list[str]:
    """Split on whitespace and remove Vietnamese stopwords."""
    toks = text.split()
    return [t for t in toks if t not in VI_STOPWORDS]


# Apply cleaning + tokenization
df['clean']  = df['text'].apply(clean_text)
df['tokens'] = df['clean'].apply(tokenize)


# =============================================================================
# 3. Encode Labels & Train/Test Split
# =============================================================================
le = LabelEncoder()
df['label_id'] = le.fit_transform(df['label'])
print("Label→ID mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

X = df['tokens']
y = df['label_id']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")


# =============================================================================
# 4. Contextual Synonym Augmentation (multilingual BERT)
# =============================================================================
syn_aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-multilingual-cased',  # Vietnamese support :contentReference[oaicite:2]{index=2}
    action='substitute',  # token substitution :contentReference[oaicite:3]{index=3}
    top_k=5,              # candidate pool :contentReference[oaicite:4]{index=4}
    aug_p=0.3,            # 30% tokens per sentence :contentReference[oaicite:5]{index=5}
    aug_min=1,            # at least 1 token :contentReference[oaicite:6]{index=6}
    aug_max=3,            # at most 3 tokens :contentReference[oaicite:7]{index=7}
    device='cuda'          # or 'cuda' :contentReference[oaicite:8]{index=8}
)

aug_texts, aug_labels = [], []
for toks, lbl in zip(X_train, y_train):
    out = syn_aug.augment(' '.join(toks), n=1)  # returns list of strings
    for sent in out:
        aug_texts.append(sent.split())
        aug_labels.append(lbl)

X_train = list(X_train) + aug_texts
y_train = list(y_train) + aug_labels
print("After augmentation, train size:", len(X_train))


# =============================================================================
# 5. TF–IDF & mRMR Feature Selection
# =============================================================================
# 5.1 Named identity functions for vectorizer
def identity_preprocessor(x): return x
def identity_tokenizer(x):    return x

vectorizer = TfidfVectorizer(
    preprocessor=identity_preprocessor,
    tokenizer=identity_tokenizer,
    token_pattern=None,
    ngram_range=(1,2),
    max_df=0.9,
    min_df=5
)

X_train_tfidf = vectorizer.fit_transform([' '.join(t) for t in X_train])
X_test_tfidf  = vectorizer.transform([' '.join(t) for t in X_test])
print("TF–IDF features:", X_train_tfidf.shape[1])  #

# 5.2 mRMR via pure-Python mrmr_selection
feat_df = pd.DataFrame(
    X_train_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out()
)
selected = mrmr.mrmr_classif(
    X=feat_df,
    y=pd.Series(y_train, name='target'),
    K=2000
)
idx = [vectorizer.vocabulary_[f] for f in selected]
X_train_sel = X_train_tfidf[:, idx]
X_test_sel  = X_test_tfidf[:, idx]
print("mRMR selected features:", len(selected))  # :contentReference[oaicite:11]{index=11}


# =============================================================================
# 6. Pipeline Assembly & Training
# =============================================================================
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        preprocessor=clean_text,
        tokenizer=tokenize,
        token_pattern=None,
        ngram_range=(1,2),
        max_df=0.9,
        min_df=5
    )),
    ('clf', LogisticRegression(max_iter=1000))
])

pipeline.fit([' '.join(t) for t in X_train], y_train)


# =============================================================================
# 7. Save Pipeline & Artifacts (no lambdas!)
# =============================================================================
os.makedirs('output', exist_ok=True)

# Only save the pipeline (it contains our named functions)
joblib.dump(pipeline, 'output/vi_text_pipeline.joblib')
print("✅ Pipeline saved to output/vi_text_pipeline.joblib")

# Save selected feature names (plain list)
joblib.dump(selected, 'output/mrmr_selected_features.joblib')
print("✅ mRMR features saved to output/mrmr_selected_features.joblib")


# =============================================================================
# 8. Evaluation
# =============================================================================
y_pred = pipeline.predict([' '.join(t) for t in X_test])
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=le.classes_))



Loaded 42744 documents across 8 topics.
Label→ID mapping: {'Chinh tri Xa hoi': np.int64(0), 'Doi song': np.int64(1), 'Kinh doanh': np.int64(2), 'Phap luat': np.int64(3), 'Suc khoe': np.int64(4), 'The gioi': np.int64(5), 'The thao': np.int64(6), 'Van hoa': np.int64(7)}
Train size: 34195, Test size: 8549
After augmentation, train size: 68390
TF–IDF features: 154


100%|██████████| 154/154 [00:40<00:00,  3.81it/s]


mRMR selected features: 154
✅ Pipeline saved to output/vi_text_pipeline.joblib
✅ mRMR features saved to output/mrmr_selected_features.joblib
Test Accuracy: 0.898935548017312
                  precision    recall  f1-score   support

Chinh tri Xa hoi       0.83      0.84      0.83      1314
        Doi song       0.85      0.84      0.84       839
      Kinh doanh       0.88      0.89      0.88       855
       Phap luat       0.92      0.91      0.91      1331
        Suc khoe       0.92      0.91      0.92       883
        The gioi       0.92      0.93      0.93      1143
        The thao       0.98      0.96      0.97      1134
         Van hoa       0.89      0.90      0.90      1050

        accuracy                           0.90      8549
       macro avg       0.90      0.90      0.90      8549
    weighted avg       0.90      0.90      0.90      8549

