In [1]:
import pandas as pd
import os
from datetime import datetime
import re

In [4]:
data1_path = '../rawsDataset/data1.tsv'
data2_path = '../rawsDataset/data_toxic.csv'
clean_dataset_dir = '../cleanDataset'

def check_path(path):
    if os.path.exists(path):
        if os.path.isfile(path):
            print(f"File ditemukan: {path}")
        elif os.path.isdir(path):
            print(f"Direktori ditemukan: {path}")
        else:
            print(f"Path ada, tapi bukan file atau direktori biasa: {path}")
    else:
        print(f"Path TIDAK ditemukan: {path}")

check_path(data1_path)
check_path(data2_path)
check_path(clean_dataset_dir)

File ditemukan: ../rawsDataset/data1.tsv
File ditemukan: ../rawsDataset/data_toxic.csv
Direktori ditemukan: ../cleanDataset


In [5]:
data1_df = pd.read_csv(data1_path, sep='\t')
print(f"Data1.tsv berhasil dibaca: {len(data1_df)} baris")

data2_df = pd.read_csv(data2_path)  
print(f"Data_toxic.csv berhasil dibaca: {len(data2_df)} baris")

Data1.tsv berhasil dibaca: 1403 baris
Data_toxic.csv berhasil dibaca: 778 baris


In [6]:
data1_df['toxic'] = data1_df['label'].map({'CB': 1, 'Non_CB': 0})
data1_processed = data1_df.rename(columns={'text': 'comments'})[['comments', 'toxic']].copy()

print(f"Total baris: {len(data1_processed)}")
print(f"CB (toxic=1): {data1_processed['toxic'].sum()}")

Total baris: 1403
CB (toxic=1): 777


In [7]:
data2_processed = data2_df.rename(columns={'processed_text': 'comments'})

print(f"Total baris: {len(data2_processed)}")
print(f"Toxic (1): {data2_processed['toxic'].sum()}")

Total baris: 778
Toxic (1): 432


In [45]:
merged_df = pd.concat([data1_processed, data2_processed], ignore_index=True)
before = len(merged_df)

merged_df = merged_df.drop_duplicates(subset=['comments'])
after = len(merged_df)

print(f"Duplikat dihapus: {before - after}")
print(f"Total baris akhir: {after}")

Duplikat dihapus: 113
Total baris akhir: 2068


In [49]:
def preprocess_text(text):
    if pd.isna(text):
        return ""
    
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#(\w+)', r'\1', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    return text.strip()

merged_df['comments'] = merged_df['comments'].apply(preprocess_text)

empty_count = (merged_df['comments'].str.len() == 0).sum()
print(f"\nEmpty comments after preprocessing: {empty_count}")



Empty comments after preprocessing: 0


In [51]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f"clean_{timestamp}.csv"
output_path = os.path.join(clean_dataset_dir, output_filename)

merged_df.to_csv(output_path, index=False, encoding='utf-8')
print("File berhasil disimpan")
print(f"Total komentar: {len(merged_df)}")
print(f"Toxic: {merged_df['toxic'].sum()}")
print(f"Non-toxic: {len(merged_df) - merged_df['toxic'].sum()}")


File berhasil disimpan
- Total komentar: 2068
- Toxic: 1173
- Non-toxic: 895
