# Data Preprocessing for Sentiment Analysis

This notebook processes raw data for both Vietnamese and English sentiment analysis.

In [16]:
import sys
import os

# Add project root to path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

from src.config import Config
from src.data.data_loader import DataLoader
from src.data.preprocessor import DataPreprocessor

In [17]:
# Initialize configuration and objects
config = Config()
data_loader = DataLoader(config)

# Create processed data directory if it doesn't exist
processed_dir = os.path.join(config.DATA_DIR, 'processed')
os.makedirs(processed_dir, exist_ok=True)

## Process Vietnamese Data

In [18]:
# Load and process Vietnamese data
vi_preprocessor = DataPreprocessor('vi', config)

# Load raw Vietnamese data
vi_data = data_loader.load_data('vi')
print(f"Loaded Vietnamese data shape: {vi_data.shape}")

# Preprocess Vietnamese data
vi_processed = vi_preprocessor.preprocess(vi_data)
print(f"Processed Vietnamese data shape: {vi_processed.shape}")

# Save processed Vietnamese data
output_path = os.path.join(processed_dir, 'vi_processed_data.csv')
vi_preprocessor.save_processed_data(vi_processed, output_path)

Loaded Vietnamese data shape: (5000, 2)
2024-11-11 02:44:38,109 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:38,109 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:38,109 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:38,109 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:38,109 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:38,109 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:38,109 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:38,109 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:38,109 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:38,109 - src.data.preprocessor - INFO - Preprocessing vi data...
Processed Vietnamese data shape: (5000, 3)
2024-11-11 02:44:41,226 - src.data.preprocessor - INFO - Saved processed data to c:\Users\tamaism

## Process English Data

In [19]:
import nltk

# Download the 'punkt' resource
nltk.download('punkt')
nltk.download('punkt_tab')

# Load and process English data
en_preprocessor = DataPreprocessor('en', config)

# Load raw English data
en_data = data_loader.load_data('en')
print(f"Loaded English data shape: {en_data.shape}")

# Preprocess English data
en_processed = en_preprocessor.preprocess(en_data)
print(f"Processed English data shape: {en_processed.shape}")

# Save processed English data
output_path = os.path.join(processed_dir, 'en_processed_data.csv')
en_preprocessor.save_processed_data(en_processed, output_path)

Loaded English data shape: (15, 2)
2024-11-11 02:44:41,250 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 02:44:41,250 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 02:44:41,250 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 02:44:41,250 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 02:44:41,250 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 02:44:41,250 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 02:44:41,250 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 02:44:41,250 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 02:44:41,250 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 02:44:41,250 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 02:44:41,250 - src.data.preprocessor - INFO - Preprocessing en data...
Processed English data shape: (15, 3)
2024-11-11 02:44:41,272 -

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tamaisme\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\tamaisme\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Data Quality Check

In [20]:
# Check processed data
def check_processed_data(df, language):
    print(f"\nQuality check for {language} data:")
    print(f"Number of samples: {len(df)}")
    print(f"Number of empty texts: {df['cleaned_text'].isna().sum()}")
    print(f"Label distribution:\n{df['label'].value_counts()}")
    print("\nSample processed texts:")
    print(df[['text', 'cleaned_text']].head())

check_processed_data(vi_processed, 'Vietnamese')
check_processed_data(en_processed, 'English')


Quality check for Vietnamese data:
Number of samples: 5000
Number of empty texts: 0
Label distribution:
Series([], Name: count, dtype: int64)

Sample processed texts:
                                              text  \
0      ✨ tốt hơn mong đợi, thấy rất worth!!!!?!!?!   
1  đét 🥰 tốt hơn mong đợi, thấy rất worth...!!!!!!   
2           quá tệ, không được tốt, z chán thật sự   
3           😢 Chất lượng , cần cải thiện thêm 😕...   
4                   đét Hơii thất vọng về , 😕..>.<   

                          cleaned_text  
0      tốt hơn mong đợi thấy rất worth  
1  đét tốt hơn mong đợi thấy rất worth  
2      quá tệ không tốt z chán thật sự  
3        chất lượng cần cải thiện thêm  
4                đét hơii thất vọng về  

Quality check for English data:
Number of samples: 15
Number of empty texts: 0
Label distribution:
label
1    7
0    6
2    2
Name: count, dtype: int64

Sample processed texts:
                                      text                        cleaned_text
0   

## Process Manual Data

In [21]:
# Load and process manual data for both languages
for lang in ['vi', 'en']:
    # Initialize preprocessor
    preprocessor = DataPreprocessor(lang, config)
    
    # Load manual data
    manual_data = data_loader.load_manual_data(lang)
    print(f"\nLoaded {lang} manual data shape: {manual_data.shape}")
    
    # Preprocess manual data
    manual_processed = preprocessor.preprocess(manual_data)
    print(f"Processed {lang} manual data shape: {manual_processed.shape}")
    
    # Save processed manual data
    output_path = os.path.join(processed_dir, f'{lang}_manual_processed.csv')
    preprocessor.save_processed_data(manual_processed, output_path)
    
    # Quality check
    check_processed_data(manual_processed, f'{lang.upper()} Manual')


Loaded vi manual data shape: (0, 2)
2024-11-11 02:44:41,327 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:41,327 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:41,327 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:41,327 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:41,327 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:41,327 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:41,327 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:41,327 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:41,327 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:41,327 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:41,327 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 02:44:41,327 - src.data.preprocessor - INFO - Prep