---

# Libraries

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MultiLabelBinarizer

from preprocessing.clean import vn_text_clean
from preprocessing.tokenize import vn_word_tokenize
from preprocessing.remove_stopwords import remove_stopwords

[nltk_data] Downloading package punkt to C:\Users\DANG
[nltk_data]     CUONG\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


---

# Read files

In [2]:
def _read_split(csv_path: str) -> pd.DataFrame:
    """Load one dataset split from CSV and tidy its header.

    Column names are stripped of surrounding whitespace, lower-cased, and
    have spaces replaced by underscores.
    """
    split = pd.read_csv(csv_path)
    split.columns = split.columns.str.strip().str.lower().str.replace(' ', '_')
    return split


# Train
df_train = _read_split("../data/raw/train.csv")
# Validation
df_val = _read_split("../data/raw/val.csv")
# Test
df_test = _read_split("../data/raw/test.csv")

In [3]:
# Keep only the text and its label; .copy() detaches each result from the raw frame
KEEP_COLUMNS = ["comment", "label"]
df_train, df_val, df_test = (
    split[KEEP_COLUMNS].copy() for split in (df_train, df_val, df_test)
)

In [4]:
# Row count, dtypes and non-null counts of the training split after column selection
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3592 entries, 0 to 3591
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  3592 non-null   object
 1   label    3592 non-null   object
dtypes: object(2)
memory usage: 56.2+ KB


In [5]:
# Row count, dtypes and non-null counts of the validation split after column selection
df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 770 entries, 0 to 769
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  770 non-null    object
 1   label    770 non-null    object
dtypes: object(2)
memory usage: 12.2+ KB


In [6]:
# Row count, dtypes and non-null counts of the test split after column selection
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 770 entries, 0 to 769
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  770 non-null    object
 1   label    770 non-null    object
dtypes: object(2)
memory usage: 12.2+ KB


---

# Clean text

In [7]:
# Run the project's vn_text_clean over every comment in each split
for split_df in (df_train, df_val, df_test):
    split_df["comment"] = split_df["comment"].apply(vn_text_clean)

In [8]:
# First training comment after cleaning
df_train["comment"].iloc[0]

'giá nó còn quá cao so với thu nhập người việt hy vọng tương lại hãng giảm xún cho bản thấp là đẹp'

In [9]:
# First validation comment after cleaning
df_val["comment"].iloc[0]

'chạy ngoài đường thấy chiếc byd đẹp thật'

In [10]:
# First test comment after cleaning
df_test["comment"].iloc[0]

'ngoại thất mới nhìn hiện đại hơn hẳn'

---

# Tokenize

In [11]:
# Word-segment every comment; method="underthesea" is forwarded to vn_word_tokenize by Series.apply
for split_df in (df_train, df_val, df_test):
    split_df["comment"] = split_df["comment"].apply(vn_word_tokenize, method="underthesea")

In [12]:
# First training comment after tokenization
df_train["comment"].iloc[0]

'giá nó còn quá cao so với thu_nhập người việt hy_vọng tương_lại hãng giảm xún cho bản thấp là đẹp'

In [13]:
# First validation comment after tokenization
df_val["comment"].iloc[0]

'chạy ngoài đường thấy chiếc byd đẹp thật'

In [14]:
# First test comment after tokenization
df_test["comment"].iloc[0]

'ngoại thất mới nhìn hiện_đại hơn hẳn'

---

# Remove stopwords

In [15]:
# Back up the tokenized text so it can be restored where stopword removal
# leaves a comment empty
for split_df in (df_train, df_val, df_test):
    split_df["text_backup"] = split_df["comment"].copy()

In [16]:
def remove_stopwords_wrapper(df: pd.DataFrame) -> pd.DataFrame:
    """Strip stopwords from ``df["comment"]``, restoring rows that end up empty.

    ``remove_stopwords`` is applied to every comment. Any result that is
    missing, empty, or whitespace-only is replaced by the matching
    ``text_backup`` value; when the frame has no ``text_backup`` column, the
    comment as it was before stopword removal is used instead. The number of
    rows that needed the fallback is printed.

    Args:
        df: Frame with a ``comment`` column and, optionally, a ``text_backup``
            column holding the text to fall back to.

    Returns:
        The same ``df`` object, with its ``comment`` column overwritten.
    """
    original = df["comment"]
    post = original.apply(remove_stopwords)

    # Treat "" and whitespace-only strings like missing values: in every case
    # stopword removal left nothing usable. Non-string results are left as-is
    # here and picked up by isnull()/fillna() below if they are NaN/None.
    is_blank = post.map(
        lambda text: isinstance(text, str) and not text.strip()
    ).astype(bool)
    post = post.mask(is_blank)

    print(post.isnull().sum())

    # Prefer the explicit backup column; otherwise fall back to the text as it
    # was before stopword removal so the function works on its own.
    backup = df["text_backup"] if "text_backup" in df.columns else original
    df["comment"] = post.fillna(backup)

    return df

In [17]:
# Process train, val, test in that order; each call prints how many rows fell back to the backup text
df_train, df_val, df_test = (
    remove_stopwords_wrapper(split) for split in (df_train, df_val, df_test)
)

9
3
1


In [18]:
# First training comment after stopword removal
df_train["comment"].iloc[0]

'giá thu_nhập việt hy_vọng tương_lại hãng xún đẹp'

In [19]:
# First validation comment after stopword removal
df_val["comment"].iloc[0]

'chạy đường byd đẹp'

In [20]:
# First test comment after stopword removal
df_test["comment"].iloc[0]

'ngoại thất hiện_đại hẳn'

In [21]:
# Remove backup columns.
# `columns=` already selects the axis, so `axis=1` is dropped as redundant.
# errors="ignore" keeps this cell re-runnable once the column is gone, and
# rebinding (instead of inplace=True) makes each line a plain assignment.
df_train = df_train.drop(columns="text_backup", errors="ignore")
df_val = df_val.drop(columns="text_backup", errors="ignore")
df_test = df_test.drop(columns="text_backup", errors="ignore")

---

# Save files

In [22]:
# Final check of the training split before saving: columns, dtypes and non-null counts
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3592 entries, 0 to 3591
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  3592 non-null   object
 1   label    3592 non-null   object
dtypes: object(2)
memory usage: 56.2+ KB


In [23]:
# Final check of the validation split before saving: columns, dtypes and non-null counts
df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 770 entries, 0 to 769
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  770 non-null    object
 1   label    770 non-null    object
dtypes: object(2)
memory usage: 12.2+ KB


In [24]:
# Final check of the test split before saving: columns, dtypes and non-null counts
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 770 entries, 0 to 769
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  770 non-null    object
 1   label    770 non-null    object
dtypes: object(2)
memory usage: 12.2+ KB


In [25]:
# Write each processed split to disk without the pandas index column
processed_outputs = {
    "../data/processed/train.csv": df_train,
    "../data/processed/val.csv": df_val,
    "../data/processed/test.csv": df_test,
}
for out_path, split_df in processed_outputs.items():
    split_df.to_csv(out_path, index=False)