# Data Preprocessing for Sentiment Analysis

This notebook processes raw data for both Vietnamese and English sentiment analysis.

In [7]:
import sys
import os

# Add project root to path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

from src.config import Config
from src.data.data_loader import DataLoader
from src.data.preprocessor import DataPreprocessor

In [8]:
# Initialize configuration and objects
config = Config()
data_loader = DataLoader(config)

# Create processed data directory if it doesn't exist
processed_dir = os.path.join(config.DATA_DIR, 'processed')
os.makedirs(processed_dir, exist_ok=True)

## Process Vietnamese Data

In [9]:
# Load and process Vietnamese data
vi_preprocessor = DataPreprocessor('vi', config)

# Load raw Vietnamese data
vi_data = data_loader.load_data('vi')
print(f"Loaded Vietnamese data shape: {vi_data.shape}")

# Preprocess Vietnamese data
vi_processed = vi_preprocessor.preprocess(vi_data)
print(f"Processed Vietnamese data shape: {vi_processed.shape}")

# Save processed Vietnamese data
output_path = os.path.join(processed_dir, 'vi_processed_data.csv')
vi_preprocessor.save_processed_data(vi_processed, output_path)

2024-11-11 04:37:56,660 - src.data.data_loader - INFO - Valid samples after validation: 10000
2024-11-11 04:37:56,660 - src.data.data_loader - INFO - Valid samples after validation: 10000
Loaded Vietnamese data shape: (10000, 2)
2024-11-11 04:37:56,663 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 04:37:56,663 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 04:37:56,663 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 04:37:56,663 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 04:37:56,663 - src.data.preprocessor - INFO - Preprocessing vi data...


2024-11-11 04:38:02,421 - src.data.preprocessor - INFO - Preprocessed 9958 valid samples
2024-11-11 04:38:02,421 - src.data.preprocessor - INFO - Preprocessed 9958 valid samples
2024-11-11 04:38:02,421 - src.data.preprocessor - INFO - Preprocessed 9958 valid samples
2024-11-11 04:38:02,421 - src.data.preprocessor - INFO - Preprocessed 9958 valid samples
2024-11-11 04:38:02,421 - src.data.preprocessor - INFO - Preprocessed 9958 valid samples
Processed Vietnamese data shape: (9958, 3)
2024-11-11 04:38:02,455 - src.data.preprocessor - INFO - Saved processed data to c:\Users\tamaisme\Desktop\Projects\PYTHON\PROJECT\sentiment_analysis\data\processed\vi_processed_data.csv
2024-11-11 04:38:02,455 - src.data.preprocessor - INFO - Saved processed data to c:\Users\tamaisme\Desktop\Projects\PYTHON\PROJECT\sentiment_analysis\data\processed\vi_processed_data.csv
2024-11-11 04:38:02,455 - src.data.preprocessor - INFO - Saved processed data to c:\Users\tamaisme\Desktop\Projects\PYTHON\PROJECT\sentime

## Process English Data

In [10]:
import nltk

# Download the 'punkt' resource
nltk.download('punkt')
nltk.download('punkt_tab')

# Load and process English data
en_preprocessor = DataPreprocessor('en', config)

# Load raw English data
en_data = data_loader.load_data('en')
print(f"Loaded English data shape: {en_data.shape}")

# Preprocess English data
en_processed = en_preprocessor.preprocess(en_data)
print(f"Processed English data shape: {en_processed.shape}")

# Save processed English data
output_path = os.path.join(processed_dir, 'en_processed_data.csv')
en_preprocessor.save_processed_data(en_processed, output_path)

2024-11-11 04:38:02,472 - src.data.data_loader - ERROR - Failed to load data: [Errno 2] No such file or directory: 'c:\\Users\\tamaisme\\Desktop\\Projects\\PYTHON\\PROJECT\\sentiment_analysis\\data\\raw\\en_social_media.csv'
2024-11-11 04:38:02,472 - src.data.data_loader - ERROR - Failed to load data: [Errno 2] No such file or directory: 'c:\\Users\\tamaisme\\Desktop\\Projects\\PYTHON\\PROJECT\\sentiment_analysis\\data\\raw\\en_social_media.csv'
Loaded English data shape: (0, 2)
2024-11-11 04:38:02,476 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 04:38:02,476 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 04:38:02,476 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 04:38:02,476 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 04:38:02,476 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 04:38:02,476 - src.data.preprocessor - INFO - Preprocessing en data...
2024-11-11 04:38:02,481 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tamaisme\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\tamaisme\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Data Quality Check

In [11]:
# Check processed data
def check_processed_data(df, language):
    print(f"\nQuality check for {language} data:")
    print(f"Number of samples: {len(df)}")
    print(f"Number of empty texts: {df['cleaned_text'].isna().sum()}")
    print(f"Label distribution:\n{df['label'].value_counts()}")
    print("\nSample processed texts:")
    print(df[['text', 'cleaned_text']].head())

check_processed_data(vi_processed, 'Vietnamese')
check_processed_data(en_processed, 'English')


Quality check for Vietnamese data:
Number of samples: 9958
Number of empty texts: 0
Label distribution:
label
2    4217
1    3043
0    2698
Name: count, dtype: int64

Sample processed texts:
                                                text  \
0                                    khong được tốt,   
1  Dùng được 1 tuần, tot hơn mong đợi, giá hơi ca...   
2    tot hơn mong đợi, đáng đồng tiền!!!...!!!...!!!   
3   xuất sắc ưng cái bụng, sản phẩm tốt quá,.. dk v~   
4  Mới mua, 😢 😞 Không đáng giá tiền,.. #review#pr...   

                                        cleaned_text  
0                                          khong tốt  
1  dùng 1 tuần tot hơn mong đợi giá hơi caoooo cự...  
2                    tot hơn mong đợi đáng đồng tiền  
3        xuất sắc ưng cái bụng sản phẩm tốt quá dk v  
4   mới mua không đáng giá tiền reviewproduct_review  

Quality check for English data:
Number of samples: 0
Number of empty texts: 0
Label distribution:
Series([], Name: count, dtype: int64)

Sam

## Process Manual Data

In [12]:
# Load and process manual data for both languages
for lang in ['vi', 'en']:
    # Initialize preprocessor
    preprocessor = DataPreprocessor(lang, config)
    
    # Load manual data
    manual_data = data_loader.load_manual_data(lang)
    print(f"\nLoaded {lang} manual data shape: {manual_data.shape}")
    
    # Preprocess manual data
    manual_processed = preprocessor.preprocess(manual_data)
    print(f"Processed {lang} manual data shape: {manual_processed.shape}")
    
    # Save processed manual data
    output_path = os.path.join(processed_dir, f'{lang}_manual_processed.csv')
    preprocessor.save_processed_data(manual_processed, output_path)
    
    # Quality check
    check_processed_data(manual_processed, f'{lang.upper()} Manual')


Loaded vi manual data shape: (0, 2)
2024-11-11 04:38:02,537 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 04:38:02,537 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 04:38:02,537 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 04:38:02,537 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 04:38:02,537 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 04:38:02,537 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 04:38:02,537 - src.data.preprocessor - INFO - Preprocessing vi data...
2024-11-11 04:38:02,542 - src.data.preprocessor - ERROR - Preprocessing error: Empty input data
2024-11-11 04:38:02,542 - src.data.preprocessor - ERROR - Preprocessing error: Empty input data
2024-11-11 04:38:02,542 - src.data.preprocessor - ERROR - Preprocessing error: Empty input data
2024-11-11 04:38:02,542 - src.data.preprocessor - ERROR - Preprocessing error: Empty input data
2024-