# Data Preprocessing for Sentiment Analysis

This notebook focuses on the data cleaning and preprocessing steps for the sentiment analysis of Malay and English tweets. The goal is to prepare the dataset for further analysis and model training.

In [None]:
import pandas as pd
import re

# Read the raw tweet dataset from disk into a DataFrame
data_path = '../data/raw/semisupervised-bert-xlnet.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows as a quick sanity check
df.head(n=5)

In [None]:
def clean_text(text):
    """Normalize a raw tweet into lowercase ASCII letters and whitespace.

    Steps, in order: strip URLs (``http...`` / ``www...``), ``@mentions``
    and ``#hashtags``; drop every character that is not an ASCII letter or
    whitespace (digits, punctuation, emoji, accented letters); lowercase.
    Whitespace is not collapsed, so removed tokens can leave runs of spaces.

    Args:
        text: The raw tweet. Non-string values (e.g. ``None`` or the float
            ``NaN`` that pandas uses for missing cells) are treated as
            empty text.

    Returns:
        The cleaned string; ``''`` for non-string input.
    """
    # Missing cells reach us as float NaN / None; re.sub would raise
    # TypeError on them, aborting the whole Series.apply call.
    if not isinstance(text, str):
        return ''
    # Remove links
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags (the tag word is dropped too, not just the '#')
    text = re.sub(r'#\w+', '', text)
    # Remove everything except ASCII letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    return text.lower()

# Build the cleaned column by running every tweet through clean_text
df['cleaned_text'] = df['text'].map(clean_text)

# Show raw and cleaned text side by side for a quick visual check
df[['text', 'cleaned_text']].head()

In [None]:
# Save the cleaned dataset to the processed directory
import os

processed_data_path = '../data/processed/cleaned_data.csv'
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists first (a no-op when it is already there).
os.makedirs(os.path.dirname(processed_data_path), exist_ok=True)
df.to_csv(processed_data_path, index=False)
print('Cleaned data saved to:', processed_data_path)