# Data Preprocessing for Sentiment Analysis

This notebook focuses on the data cleaning and preprocessing steps for the sentiment analysis of Malay and English tweets. The goal is to prepare the dataset for further analysis and model training.

In [1]:
import os
import re

import pandas as pd

# Read the raw CSV into a DataFrame
data_path = '../data/raw/semisupervised-bert-xlnet.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the top five rows
df.head(n=5)

Unnamed: 0,text,label,prob
0,@LionelModric10 Sabah sabah yorma donkey,Neutral,0.999414
1,Continue to reach out . . SEKUT in aja laah Na...,Neutral,0.994295
2,ada suprise untuk #danishnaufal_14 tq pada pem...,Positive,0.999538
3,aku kerja keras gila jimat jimat nak beli apa ...,Positive,0.999405
4,@farhanalv ajak makan ah ke penang bistro wkwkw,Neutral,0.999462


In [2]:
def clean_text(text):
    """Reduce a raw tweet to lowercase ASCII letters and whitespace.

    The steps run in this order:
      1. remove URLs (any run of non-space characters starting with
         ``http`` or ``www``),
      2. remove @mentions,
      3. remove #hashtags (the whole tag, not only the ``#`` sign),
      4. remove every character that is not an ASCII letter or whitespace,
         which drops digits, punctuation, emoji and accented letters,
      5. lowercase what is left.

    Whitespace is not normalised, so removed tokens can leave leading or
    repeated spaces behind.

    Args:
        text: The raw tweet text. Any non-string value (for example None, or
            the NaN pandas uses for a missing cell) is treated as empty text.

    Returns:
        str: The cleaned text, or ``''`` when ``text`` is not a string.
    """
    # re.sub raises TypeError on non-string input, so a single missing value
    # would abort a whole Series.apply; treat such values as empty text.
    if not isinstance(text, str):
        return ''

    # Remove links
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags (\w also covers digits and underscores in the tag)
    text = re.sub(r'#\w+', '', text)
    # Keep only ASCII letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    return text.lower()

# Run every tweet through clean_text and keep the result in a new column
df['cleaned_text'] = df['text'].map(clean_text)

# Preview the raw and cleaned text side by side
df.loc[:, ['text', 'cleaned_text']].head()

Unnamed: 0,text,cleaned_text
0,@LionelModric10 Sabah sabah yorma donkey,sabah sabah yorma donkey
1,Continue to reach out . . SEKUT in aja laah Na...,continue to reach out sekut in aja laah nant...
2,ada suprise untuk #danishnaufal_14 tq pada pem...,ada suprise untuk tq pada pemperi alhamdulill...
3,aku kerja keras gila jimat jimat nak beli apa ...,aku kerja keras gila jimat jimat nak beli apa ...
4,@farhanalv ajak makan ah ke penang bistro wkwkw,ajak makan ah ke penang bistro wkwkw


In [3]:
# Preview the first five rows, now including the 'cleaned_text' column
df.head(n=5)

Unnamed: 0,text,label,prob,cleaned_text
0,@LionelModric10 Sabah sabah yorma donkey,Neutral,0.999414,sabah sabah yorma donkey
1,Continue to reach out . . SEKUT in aja laah Na...,Neutral,0.994295,continue to reach out sekut in aja laah nant...
2,ada suprise untuk #danishnaufal_14 tq pada pem...,Positive,0.999538,ada suprise untuk tq pada pemperi alhamdulill...
3,aku kerja keras gila jimat jimat nak beli apa ...,Positive,0.999405,aku kerja keras gila jimat jimat nak beli apa ...
4,@farhanalv ajak makan ah ke penang bistro wkwkw,Neutral,0.999462,ajak makan ah ke penang bistro wkwkw


In [6]:
# Overwrite the raw tweets with their cleaned form. The cleaning is computed
# straight from df['text'], so this cell does not depend on a 'cleaned_text'
# column left behind by an earlier cell.
df['text'] = df['text'].apply(clean_text)

# Remove the helper column if an earlier cell created it. errors='ignore'
# keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['cleaned_text'], errors='ignore')

In [7]:
# Preview the first five rows after 'text' has been replaced by its cleaned form
df.head(n=5)

Unnamed: 0,text,label,prob
0,sabah sabah yorma donkey,Neutral,0.999414
1,continue to reach out sekut in aja laah nant...,Neutral,0.994295
2,ada suprise untuk tq pada pemperi alhamdulill...,Positive,0.999538
3,aku kerja keras gila jimat jimat nak beli apa ...,Positive,0.999405
4,ajak makan ah ke penang bistro wkwkw,Neutral,0.999462


In [9]:
# Save the cleaned dataset to the processed directory
processed_data_path = '../data/processed/cleaned_data.csv'

# to_csv does not create missing parent folders, so make sure the target
# directory exists first (a no-op when it is already there).
os.makedirs(os.path.dirname(processed_data_path), exist_ok=True)

# index=False keeps the DataFrame index out of the written file
df.to_csv(processed_data_path, index=False)
print('Cleaned data saved to:', processed_data_path)

Cleaned data saved to: ../data/processed/cleaned_data.csv
