# Data Preprocessing for Sentiment Analysis

This notebook processes raw data for both Vietnamese and English sentiment analysis.

In [1]:
import sys
import os

# Add project root to path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

from src.config import Config
from src.data.data_loader import DataLoader
from src.data.preprocessor import DataPreprocessor

In [2]:
# Initialize configuration and objects
config = Config()
data_loader = DataLoader(config)

# Create processed data directory if it doesn't exist
processed_dir = os.path.join(config.DATA_DIR, 'processed')
os.makedirs(processed_dir, exist_ok=True)

## Process Vietnamese Data

In [3]:
# Load and process Vietnamese data
vi_preprocessor = DataPreprocessor('vi', config)

# Load raw Vietnamese data
vi_data = data_loader.load_data('vi')
print(f"Loaded Vietnamese data shape: {vi_data.shape}")

# Preprocess Vietnamese data
vi_processed = vi_preprocessor.preprocess(vi_data)
print(f"Processed Vietnamese data shape: {vi_processed.shape}")

# Save processed Vietnamese data
output_path = os.path.join(processed_dir, 'vi_processed_data.csv')
vi_preprocessor.save_processed_data(vi_processed, output_path)

2024-11-11 05:39:23,069 - src.data.data_loader - INFO - Valid samples after validation: 9722
Loaded Vietnamese data shape: (9722, 2)
2024-11-11 05:39:23,073 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 05:39:28,827 - src.data.preprocessor - INFO - Preprocessed 9673 valid samples
Processed Vietnamese data shape: (9673, 3)
2024-11-11 05:39:28,859 - src.data.preprocessor - INFO - Saved processed data to c:\Users\tamaisme\Desktop\Projects\PYTHON\PROJECT\sentiment_analysis\data\processed\vi_processed_data.csv


## Process English Data

In [4]:
import nltk

# Download the 'punkt' resource
nltk.download('punkt')
nltk.download('punkt_tab')

# Load and process English data
en_preprocessor = DataPreprocessor('en', config)

# Load raw English data
en_data = data_loader.load_data('en')
print(f"Loaded English data shape: {en_data.shape}")

# Preprocess English data
en_processed = en_preprocessor.preprocess(en_data)
print(f"Processed English data shape: {en_processed.shape}")

# Save processed English data
output_path = os.path.join(processed_dir, 'en_processed_data.csv')
en_preprocessor.save_processed_data(en_processed, output_path)

2024-11-11 05:39:29,134 - src.data.data_loader - INFO - Valid samples after validation: 9787
Loaded English data shape: (9787, 2)
2024-11-11 05:39:29,136 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 05:39:29,136 - src.data.preprocessor - INFO - Preprocessing en data...


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tamaisme\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\tamaisme\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


2024-11-11 05:39:31,431 - src.data.preprocessor - INFO - Preprocessed 9710 valid samples
2024-11-11 05:39:31,431 - src.data.preprocessor - INFO - Preprocessed 9710 valid samples
Processed English data shape: (9710, 3)
2024-11-11 05:39:31,462 - src.data.preprocessor - INFO - Saved processed data to c:\Users\tamaisme\Desktop\Projects\PYTHON\PROJECT\sentiment_analysis\data\processed\en_processed_data.csv
2024-11-11 05:39:31,462 - src.data.preprocessor - INFO - Saved processed data to c:\Users\tamaisme\Desktop\Projects\PYTHON\PROJECT\sentiment_analysis\data\processed\en_processed_data.csv


## Data Quality Check

In [5]:
# Check processed data
def check_processed_data(df, language):
    print(f"\nQuality check for {language} data:")
    print(f"Number of samples: {len(df)}")
    print(f"Number of empty texts: {df['cleaned_text'].isna().sum()}")
    print(f"Label distribution:\n{df['label'].value_counts()}")
    print("\nSample processed texts:")
    print(df[['text', 'cleaned_text']].head())

check_processed_data(vi_processed, 'Vietnamese')
check_processed_data(en_processed, 'English')


Quality check for Vietnamese data:
Number of samples: 9673
Number of empty texts: 0
Label distribution:
label
2    4176
1    2925
0    2572
Name: count, dtype: int64

Sample processed texts:
                                                text  \
0           Mình rất sướngg rớt nước miếng với...!!!   
1    Ra rạp coi, Kịch bản , thấy rat worth?! 👎.. hem   
2  Quán chuyên nghiệp chu đáo, ấm cúng, không có ...   
3               không có gì đặc biệt, dùng được, 💭?!   
4  Xem trên Netflix, Đánggg xem nha mọi người, sẽ...   

                                        cleaned_text  
0                 mình rất sướngg rớt nước miếng với  
1             ra rạp coi kịch bản thấy rat worth hem  
2  quán chuyên nghiệp chu đáo ấm cúng không gì để...  
3                             không gì đặc biệt dùng  
4  xem trên netflix đánggg xem nha mọi người sẽ q...  

Quality check for English data:
Number of samples: 9710
Number of empty texts: 0
Label distribution:
label
2    4219
1    2844
0    2647
Nam

## Process Manual Data

In [6]:
# Load and process manual data for both languages
for lang in ['vi', 'en']:
    # Initialize preprocessor
    preprocessor = DataPreprocessor(lang, config)
    
    # Load manual data
    manual_data = data_loader.load_manual_data(lang)
    print(f"\nLoaded {lang} manual data shape: {manual_data.shape}")
    
    # Preprocess manual data
    manual_processed = preprocessor.preprocess(manual_data)
    print(f"Processed {lang} manual data shape: {manual_processed.shape}")
    
    # Save processed manual data
    output_path = os.path.join(processed_dir, f'{lang}_manual_processed.csv')
    preprocessor.save_processed_data(manual_processed, output_path)
    
    # Quality check
    check_processed_data(manual_processed, f'{lang.upper()} Manual')


Loaded vi manual data shape: (0, 2)
2024-11-11 05:39:31,518 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 05:39:31,518 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 05:39:31,518 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 05:39:31,520 - src.data.preprocessor - ERROR - Preprocessing error: Empty input data
2024-11-11 05:39:31,520 - src.data.preprocessor - ERROR - Preprocessing error: Empty input data
2024-11-11 05:39:31,520 - src.data.preprocessor - ERROR - Preprocessing error: Empty input data
Processed vi manual data shape: (0, 3)
2024-11-11 05:39:31,526 - src.data.preprocessor - INFO - Saved processed data to c:\Users\tamaisme\Desktop\Projects\PYTHON\PROJECT\sentiment_analysis\data\processed\vi_manual_processed.csv
2024-11-11 05:39:31,526 - src.data.preprocessor - INFO - Saved processed data to c:\Users\tamaisme\Desktop\Projects\PYTHON\PROJECT\sentiment_analysis\data\processed\vi_manual_processed.csv
2024-11-11 0