---

# Libraries

In [22]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Kept at its original position relative to the imports that follow.
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MultiLabelBinarizer

from preprocessing.clean import vn_text_clean
from preprocessing.tokenize import vn_word_tokenize
from preprocessing.remove_stopwords import remove_stopwords
from utils import parse_label, matrix_labels

---

# Read files

In [23]:
# Training split: load the raw CSV, then rewrite every header as
# trimmed, lower-case snake_case.
df_train = pd.read_csv("../data/raw/train.csv")
df_train.columns = [header.strip().lower().replace(' ', '_') for header in df_train.columns]

In [24]:
# Validation split: load the raw CSV, then rewrite every header as
# trimmed, lower-case snake_case.
df_val = pd.read_csv("../data/raw/val.csv")
df_val.columns = [header.strip().lower().replace(' ', '_') for header in df_val.columns]

In [25]:
# Keep only the text and its label; `.copy()` detaches the result from the
# full frame so later column assignments do not touch a view.
df_train = df_train.loc[:, ["comment", "label"]].copy()
df_val = df_val.loc[:, ["comment", "label"]].copy()

In [26]:
# Print row count, per-column non-null counts and dtypes for the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1403 entries, 0 to 1402
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  1403 non-null   object
 1   label    1403 non-null   object
dtypes: object(2)
memory usage: 22.0+ KB


In [27]:
# Print row count, per-column non-null counts and dtypes for the validation frame.
df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  500 non-null    object
 1   label    500 non-null    object
dtypes: object(2)
memory usage: 7.9+ KB


---

# Clean text

In [28]:
# Run the project's text cleaner over every comment in both splits.
df_train["comment"] = df_train["comment"].apply(vn_text_clean)
df_val["comment"] = df_val["comment"].apply(vn_text_clean)

In [29]:
# Spot-check: first training comment after cleaning.
df_train.iat[0, 0]

'đuôi dạng coupe nhìn đẹp hẳn'

In [30]:
# Spot-check: first validation comment after cleaning.
df_val.iat[0, 0]

'nhìn cái mông ok rồi đấy'

---

# Tokenize

In [31]:
# Tokenize every comment with the project tokenizer, selecting its
# "underthesea" method explicitly for each call.
df_train["comment"] = df_train["comment"].apply(
    lambda text: vn_word_tokenize(text, method="underthesea")
)
df_val["comment"] = df_val["comment"].apply(
    lambda text: vn_word_tokenize(text, method="underthesea")
)

In [32]:
# Spot-check: first training comment after tokenization.
df_train.iat[0, 0]

'đuôi dạng coupe nhìn đẹp hẳn'

In [33]:
# Spot-check: first validation comment after tokenization.
df_val.iat[0, 0]

'nhìn cái mông ok rồi đấy'

---

# Remove stopwords

In [34]:
# Snapshot the current comments in a helper column; the stopword step reads
# it back for rows that would otherwise end up empty.
df_train = df_train.assign(comment_backup=df_train["comment"])
df_val = df_val.assign(comment_backup=df_val["comment"])

In [35]:
def remove_stopwords_wrapper(df: pd.DataFrame, remover=None) -> pd.DataFrame:
    """Remove stopwords from ``df["comment"]`` without leaving any row blank.

    Rows whose text is missing, empty or whitespace-only after stopword
    removal are restored from ``df["comment_backup"]`` when that column
    exists, otherwise from the pre-removal ``df["comment"]`` value. The
    number of restored rows is printed.

    Args:
        df: Frame with a ``comment`` column and, optionally, a
            ``comment_backup`` column.
        remover: Callable applied to each comment. Defaults to the
            ``remove_stopwords`` helper imported by this notebook.

    Returns:
        A new DataFrame carrying the updated ``comment`` column; ``df``
        itself is not modified.
    """
    if remover is None:
        remover = remove_stopwords

    stripped = df["comment"].apply(remover)

    # NaN/None, "" and whitespace-only strings all count as "nothing left".
    is_blank = stripped.isna() | stripped.astype(str).str.strip().eq("")

    print(is_blank.sum())

    # Prefer the explicit backup column; without it, the untouched input text
    # is the only pre-removal version available.
    if "comment_backup" in df.columns:
        restore_from = df["comment_backup"]
    else:
        restore_from = df["comment"]

    # `assign` builds a new frame, so the caller's DataFrame stays as it was.
    return df.assign(comment=stripped.mask(is_blank, restore_from))

In [36]:
# Apply the stopword step to the training split first, then validation
# (each call prints how many rows needed the fallback).
df_train, df_val = [remove_stopwords_wrapper(frame) for frame in (df_train, df_val)]

6
0


In [37]:
# Spot-check: first training comment after stopword removal.
df_train.iat[0, 0]

'đuôi dạng coupe đẹp hẳn'

In [38]:
# Spot-check: first validation comment after stopword removal.
df_val.iat[0, 0]

'mông ok đấy'

In [39]:
# Remove backup columns
# `columns=` already names the axis, so no `axis` argument is needed.
# Rebinding instead of `inplace=True`, together with `errors="ignore"`, keeps
# this cell re-runnable after the helper column is already gone.
df_train = df_train.drop(columns="comment_backup", errors="ignore")
df_val = df_val.drop(columns="comment_backup", errors="ignore")

---

# Save files

In [40]:
# Final check of the training frame's columns and non-null counts before it is written out.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1403 entries, 0 to 1402
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  1403 non-null   object
 1   label    1403 non-null   object
dtypes: object(2)
memory usage: 22.0+ KB


In [41]:
# Final check of the validation frame's columns and non-null counts before it is written out.
df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  500 non-null    object
 1   label    500 non-null    object
dtypes: object(2)
memory usage: 7.9+ KB


In [42]:
# Create the output folder first: `to_csv` raises OSError when the parent
# directory does not exist (e.g. on a fresh checkout).
os.makedirs("../data/processed", exist_ok=True)

# Write both processed splits without the positional index column.
df_train.to_csv("../data/processed/train.csv", index=False)
df_val.to_csv("../data/processed/val.csv", index=False)