1. FILTERING DATA

In [8]:
import pandas as pd
import os

# Create the "Proses" output directory if it does not exist yet.
# exist_ok=True removes the check-then-create race and keeps the cell safe to re-run.
direktori_output = "Proses"
os.makedirs(direktori_output, exist_ok=True)

def bersihkan_csv(nama_file, direktori_tujuan=None):
    """Keep only the Indonesian tweets of one CSV file and save them as a new file.

    A row is kept when its ``lang`` value is exactly ``in`` (case-insensitive,
    surrounding whitespace ignored) or when its ``location`` contains the word
    ``indonesia`` (case-insensitive). The result is written to
    ``<destination>/Filter<original file name>``.

    Args:
        nama_file: Path of the CSV file to filter. It must contain the
            ``lang`` and ``location`` columns.
        direktori_tujuan: Directory for the filtered file. Defaults to the
            module-level ``direktori_output``.

    Returns:
        The number of rows that were dropped.

    Raises:
        KeyError: If ``lang`` or ``location`` is missing from the file.
    """
    if direktori_tujuan is None:
        direktori_tujuan = direktori_output

    df = pd.read_csv(nama_file)

    # Fill missing values and cast to str first: a column that is entirely
    # empty is parsed as float, which would make the .str accessor raise.
    lang = df['lang'].fillna('').astype(str).str.strip().str.lower()
    location = df['location'].fillna('').astype(str).str.lower()

    # Exact comparison for the language code: a substring test would also
    # accept any other code that merely contains the letters "in".
    mask = lang.eq('in') | location.str.contains('indonesia', regex=False)

    df_bersih = df[mask].reset_index(drop=True)

    # Make sure the destination exists so the function also works on its own
    os.makedirs(direktori_tujuan, exist_ok=True)
    path_output = os.path.join(direktori_tujuan, f"Filter{os.path.basename(nama_file)}")
    df_bersih.to_csv(path_output, index=False)

    return len(df) - len(df_bersih)

# Input files: for every candidate (01-03), debate 1 followed by debate 5
file_input = [
    f"DATASET/DEBAT{debat}_CAPRES{capres:02d}.csv"
    for capres in (1, 2, 3)
    for debat in (1, 5)
]

# Filter each file in turn; a failure on one file is reported and the rest still run
for file in file_input:
    try:
        baris_terhapus = bersihkan_csv(file)
        print(f'File {file} telah diproses: {baris_terhapus} baris dihapus')
        print(f'Tersimpan di: {direktori_output}/Filter{os.path.basename(file)}')
    except Exception as e:
        print(f'Terjadi kesalahan saat memproses {file}: {str(e)}')

File DATASET/DEBAT1_CAPRES01.csv telah diproses: 0 baris dihapus
Tersimpan di: Proses/FilterDEBAT1_CAPRES01.csv
File DATASET/DEBAT5_CAPRES01.csv telah diproses: 766 baris dihapus
Tersimpan di: Proses/FilterDEBAT5_CAPRES01.csv
File DATASET/DEBAT1_CAPRES02.csv telah diproses: 517 baris dihapus
Tersimpan di: Proses/FilterDEBAT1_CAPRES02.csv
File DATASET/DEBAT5_CAPRES02.csv telah diproses: 444 baris dihapus
Tersimpan di: Proses/FilterDEBAT5_CAPRES02.csv
File DATASET/DEBAT1_CAPRES03.csv telah diproses: 502 baris dihapus
Tersimpan di: Proses/FilterDEBAT1_CAPRES03.csv
File DATASET/DEBAT5_CAPRES03.csv telah diproses: 449 baris dihapus
Tersimpan di: Proses/FilterDEBAT5_CAPRES03.csv


2. GABUNGKAN FILE UNTUK SELANJUTNYA DILAKUKAN PREPROCESSING
    
    Sebelum digabungkan, setiap file diberi dua kolom baru, `Paslon` dan `Debat`, sebagai penanda asal data (pasangan calon dan debat ke berapa).

2.1. PENANDA

In [19]:
import pandas as pd

def _add_constant_column(input_path, column_name, value):
    """Add (or overwrite) a column holding the same value in every row of a CSV file.

    The file at ``input_path`` is rewritten in place.

    Returns:
        The number of rows in the file.
    """
    df = pd.read_csv(input_path)
    df[column_name] = value
    # Written back to the same path so later steps see the new column
    df.to_csv(input_path, index=False)
    return len(df)


def add_paslon_column(input_path, paslon_name):
    """Tag every row of a CSV file with its candidate name in a ``Paslon`` column.

    Args:
        input_path: CSV file to update in place.
        paslon_name: Value stored in the ``Paslon`` column of every row.

    Returns:
        The number of rows that were tagged.
    """
    return _add_constant_column(input_path, 'Paslon', paslon_name)


def add_debat_column(input_path, debat_name):
    """Tag every row of a CSV file with its debate label in a ``Debat`` column.

    Args:
        input_path: CSV file to update in place.
        debat_name: Value stored in the ``Debat`` column of every row.

    Returns:
        The number of rows that were tagged.
    """
    return _add_constant_column(input_path, 'Debat', debat_name)

def main():
    """Tag every filtered file with its candidate (Paslon) and debate (Debat) label.

    Files are handled one by one; an error on one file is printed and the
    remaining files are still processed.
    """
    # (file path, candidate label, debate label), in processing order
    files_config = [
        ('Proses/FilterDEBAT1_CAPRES01.csv', 'Anies', 'Debat 1'),
        ('Proses/FilterDEBAT5_CAPRES01.csv', 'Anies', 'Debat 5'),
        ('Proses/FilterDEBAT1_CAPRES02.csv', 'Prabowo', 'Debat 1'),
        ('Proses/FilterDEBAT5_CAPRES02.csv', 'Prabowo', 'Debat 5'),
        ('Proses/FilterDEBAT1_CAPRES03.csv', 'Ganjar', 'Debat 1'),
        ('Proses/FilterDEBAT5_CAPRES03.csv', 'Ganjar', 'Debat 5'),
    ]

    for file_path, paslon, debat in files_config:
        try:
            # Candidate label first, then the debate label, each reported separately
            rows = add_paslon_column(file_path, paslon)
            print(f'Berhasil menambahkan kolom Paslon={paslon} pada {file_path}')
            print(f'Total {rows} baris diperbarui')

            rows = add_debat_column(file_path, debat)
            print(f'Berhasil menambahkan kolom Debat={debat} pada {file_path}')
            print(f'Total {rows} baris diperbarui')
        except Exception as e:
            print(f'Error saat memproses {file_path}: {str(e)}')


if __name__ == '__main__':
    main()

Berhasil menambahkan kolom Paslon=Anies pada Proses/FilterDEBAT1_CAPRES01.csv
Total 1954 baris diperbarui
Berhasil menambahkan kolom Debat=Debat 1 pada Proses/FilterDEBAT1_CAPRES01.csv
Total 1954 baris diperbarui
Berhasil menambahkan kolom Paslon=Anies pada Proses/FilterDEBAT5_CAPRES01.csv
Total 865 baris diperbarui
Berhasil menambahkan kolom Debat=Debat 5 pada Proses/FilterDEBAT5_CAPRES01.csv
Total 865 baris diperbarui
Berhasil menambahkan kolom Paslon=Prabowo pada Proses/FilterDEBAT1_CAPRES02.csv
Total 1202 baris diperbarui
Berhasil menambahkan kolom Debat=Debat 1 pada Proses/FilterDEBAT1_CAPRES02.csv
Total 1202 baris diperbarui
Berhasil menambahkan kolom Paslon=Prabowo pada Proses/FilterDEBAT5_CAPRES02.csv
Total 1294 baris diperbarui
Berhasil menambahkan kolom Debat=Debat 5 pada Proses/FilterDEBAT5_CAPRES02.csv
Total 1294 baris diperbarui
Berhasil menambahkan kolom Paslon=Ganjar pada Proses/FilterDEBAT1_CAPRES03.csv
Total 535 baris diperbarui
Berhasil menambahkan kolom Debat=Debat 1

2.2. PENGGABUNGAN

In [20]:
import pandas as pd
import os

def merge_csv_files(files=None, output_path='Proses/DATA_DEBAT_TOTAL.csv'):
    """Concatenate several CSV files into one CSV file.

    Files that cannot be read are reported and skipped; the remaining files
    are merged in the given order with a fresh 0..n-1 index.

    Args:
        files: Paths of the CSV files to merge. Defaults to the six filtered
            debate files under ``Proses/``.
        output_path: Where the merged CSV is written.

    Returns:
        The number of rows in the merged file.

    Raises:
        ValueError: If none of the input files could be read.
    """
    if files is None:
        files = [
            'Proses/FilterDEBAT1_CAPRES01.csv',
            'Proses/FilterDEBAT1_CAPRES02.csv',
            'Proses/FilterDEBAT1_CAPRES03.csv',
            'Proses/FilterDEBAT5_CAPRES01.csv',
            'Proses/FilterDEBAT5_CAPRES02.csv',
            'Proses/FilterDEBAT5_CAPRES03.csv',
        ]

    all_data = []
    for file in files:
        try:
            all_data.append(pd.read_csv(file))
        except Exception as e:
            # Best effort: report the unreadable file and merge the rest
            print(f'Error membaca file {file}: {str(e)}')

    # pd.concat on an empty list fails with an opaque "No objects to concatenate";
    # raise a message that points at the real cause instead.
    if not all_data:
        raise ValueError('Tidak ada file yang berhasil dibaca; penggabungan dibatalkan')

    merged_df = pd.concat(all_data, ignore_index=True)
    merged_df.to_csv(output_path, index=False)

    return len(merged_df)

# Run the merge and report how many rows were written to the combined file
total_rows = merge_csv_files()
print(f'Penggabungan selesai: {total_rows} baris data tersimpan di DATA_DEBAT_TOTAL.csv')

Penggabungan selesai: 6581 baris data tersimpan di DATA_DEBAT_TOTAL.csv


3. TAHAP PREPROCESSING

    Pada tahap ini akan dilakukan preprocessing sebanyak 7 kali untuk file yang telah digabungkan

3.1. CLEAN DATA

In [12]:
import pandas as pd
import re

def clean_text(text):
    """Reduce one raw tweet to lowercase text that contains only words.

    Steps, in order: remove URLs, @mentions, #hashtags, the standalone
    markers ``RT`` / ``FAV``, punctuation and symbols, and digits; collapse
    runs of whitespace; lowercase and trim.

    Args:
        text: Raw tweet text. Any non-string value (e.g. NaN) is treated as empty.

    Returns:
        The cleaned string, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    # URLs; the dot after "www" is escaped so ordinary words that merely
    # start with "www" are not deleted
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Mentions (@user) and hashtags (#topic)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)

    # Word boundaries keep the letters RT / FAV inside longer words intact
    # (e.g. "PARTAI", "KARTU", "FAVORIT")
    text = re.sub(r'\b(?:RT|FAV)\b', '', text)

    # Punctuation and symbols, then digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Collapse the gaps left by the removals
    text = re.sub(r'\s+', ' ', text)

    return text.lower().strip()

def main():
    """Add a ``Clean`` column (cleaned ``full_text``) to the merged dataset, in place."""
    data_path = 'Proses/DATA_DEBAT_TOTAL.csv'

    df = pd.read_csv(data_path)

    # clean_text runs once per row of the 'full_text' column
    df = df.assign(Clean=df['full_text'].apply(clean_text))

    # The same file is overwritten, now carrying the extra column
    df.to_csv(data_path, index=False)
    print('Clean data selesai dan tersimpan di kolom Clean')


if __name__ == '__main__':
    main()

Clean data selesai dan tersimpan di kolom Clean


3.2. TOKENIZING

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def tokenize_text(text):
    """Split a piece of text into word tokens with NLTK's ``word_tokenize``.

    Args:
        text: Text to tokenise.

    Returns:
        The list of tokens, or an empty list when ``text`` is not a string.
    """
    # Guard first: non-string values (e.g. NaN) produce no tokens
    if not isinstance(text, str):
        return []
    return word_tokenize(text)
def main():
    """Add a ``Tokens`` column (tokenised ``Clean`` text) to the merged dataset, in place."""
    data_path = 'Proses/DATA_DEBAT_TOTAL.csv'

    df = pd.read_csv(data_path)

    # tokenize_text runs once per row of the 'Clean' column
    df = df.assign(Tokens=df['Clean'].apply(tokenize_text))

    # The same file is overwritten, now carrying the extra column
    df.to_csv(data_path, index=False)
    print('Tokenisasi selesai dan tersimpan di kolom Tokens')


if __name__ == '__main__':
    main()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\olgab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Tokenisasi selesai dan tersimpan di kolom Tokens


3.3. NORMALISASI

In [None]:
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

_NORMALIZE_STEMMER = None


def _get_normalize_stemmer():
    """Return the Sastrawi stemmer used by normalize_text, creating it on first use."""
    global _NORMALIZE_STEMMER
    if _NORMALIZE_STEMMER is None:
        _NORMALIZE_STEMMER = StemmerFactory().create_stemmer()
    return _NORMALIZE_STEMMER


def normalize_text(tokens):
    """Stem every token of one tweet with the Sastrawi stemmer.

    NOTE(review): although this is the "normalisasi" step, the code only
    stems, which is the same operation stem_words applies later in the
    pipeline. Confirm whether a spelling/slang normalisation was intended.

    Args:
        tokens: List of word tokens.

    Returns:
        A new list with each token stemmed, or an empty list when ``tokens``
        is not a list or is empty.
    """
    if not isinstance(tokens, list) or not tokens:
        return []

    # One shared stemmer: this function is applied to every row, so building
    # a new factory and stemmer per call repeats the same setup work each time.
    stemmer = _get_normalize_stemmer()
    return [stemmer.stem(word) for word in tokens]
def main():
    """Add a ``Normalized`` column (normalize_text applied to ``Tokens``) to the merged dataset, in place."""
    import ast  # local import: only needed to parse the stored token lists

    df = pd.read_csv('Proses/DATA_DEBAT_TOTAL.csv')

    # The token lists come back from the CSV as text such as "['a', 'b']".
    # ast.literal_eval parses that text as a Python literal only, whereas eval
    # would execute whatever expression a cell happens to contain.
    df['Normalized'] = df['Tokens'].apply(ast.literal_eval).apply(normalize_text)

    # The same file is overwritten, now carrying the extra column
    df.to_csv('Proses/DATA_DEBAT_TOTAL.csv', index=False)
    print('Normalisasi selesai dan tersimpan di kolom Normalized')


if __name__ == '__main__':
    main()

Normalisasi selesai dan tersimpan di kolom Normalized


3.4. STOPWORD REMOVAL

In [None]:
import pandas as pd
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

_STOPWORD_SET = None


def _get_stopword_set():
    """Return Sastrawi's stop words as a frozenset, loading them on first use."""
    global _STOPWORD_SET
    if _STOPWORD_SET is None:
        _STOPWORD_SET = frozenset(StopWordRemoverFactory().get_stop_words())
    return _STOPWORD_SET


def remove_stopwords(tokens):
    """Drop the tokens that appear in Sastrawi's stop-word list.

    Args:
        tokens: List of word tokens.

    Returns:
        A new list without the stop words (original order kept), or an empty
        list when ``tokens`` is not a list or is empty.
    """
    if not isinstance(tokens, list) or not tokens:
        return []

    # Loaded once and kept as a set: this function is applied to every row,
    # and set membership avoids scanning the whole word list for each token.
    stopwords = _get_stopword_set()
    return [word for word in tokens if word not in stopwords]
def main():
    """Add a ``Stopword`` column (remove_stopwords applied to ``Normalized``) to the merged dataset, in place."""
    import ast  # local import: only needed to parse the stored token lists

    df = pd.read_csv('Proses/DATA_DEBAT_TOTAL.csv')

    # The token lists come back from the CSV as text such as "['a', 'b']".
    # ast.literal_eval parses that text as a Python literal only, whereas eval
    # would execute whatever expression a cell happens to contain.
    df['Stopword'] = df['Normalized'].apply(ast.literal_eval).apply(remove_stopwords)

    # The same file is overwritten, now carrying the extra column
    df.to_csv('Proses/DATA_DEBAT_TOTAL.csv', index=False)
    print('Stopword removal selesai dan tersimpan di kolom Stopword')


if __name__ == '__main__':
    main()

Stopword removal selesai dan tersimpan di kolom Stopword


3.5. STEMMING

In [16]:
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

_STEM_WORDS_STEMMER = None


def _get_stem_words_stemmer():
    """Return the Sastrawi stemmer used by stem_words, creating it on first use."""
    global _STEM_WORDS_STEMMER
    if _STEM_WORDS_STEMMER is None:
        _STEM_WORDS_STEMMER = StemmerFactory().create_stemmer()
    return _STEM_WORDS_STEMMER


def stem_words(tokens):
    """Stem every token of one tweet with the Sastrawi stemmer.

    Args:
        tokens: List of word tokens.

    Returns:
        A new list with each token stemmed, or an empty list when ``tokens``
        is not a list or is empty.
    """
    if not isinstance(tokens, list) or not tokens:
        return []

    # One shared stemmer: this function is applied to every row, so building
    # a new factory and stemmer per call repeats the same setup work each time.
    stemmer = _get_stem_words_stemmer()
    return [stemmer.stem(word) for word in tokens]

def main():
    """Add a ``Stemming`` column (stem_words applied to ``Stopword``) to the merged dataset, in place."""
    import ast  # local import: only needed to parse the stored token lists

    df = pd.read_csv('Proses/DATA_DEBAT_TOTAL.csv')

    # The token lists come back from the CSV as text such as "['a', 'b']".
    # ast.literal_eval parses that text as a Python literal only, whereas eval
    # would execute whatever expression a cell happens to contain.
    df['Stemming'] = df['Stopword'].apply(ast.literal_eval).apply(stem_words)

    # The same file is overwritten, now carrying the extra column
    df.to_csv('Proses/DATA_DEBAT_TOTAL.csv', index=False)
    print('Stemming selesai dan tersimpan di kolom Stemming')


if __name__ == '__main__':
    main()

Stemming selesai dan tersimpan di kolom Stemming


In [18]:
import pandas as pd

# Load one raw input file to inspect its structure.
# The folder is spelled "DATASET/" as in the filtering step, so the cell also
# works on case-sensitive file systems.
df = pd.read_csv('DATASET/DEBAT1_CAPRES01.csv')

# Print the number of columns and their names
print("Number of columns:", len(df.columns))
print("Column names:", df.columns)

# Show a few rows of the tweet text next to the two columns used for filtering;
# the fixed random_state makes the sample reproducible across runs
columns_to_display = ['full_text', 'lang', 'location']
print(df[columns_to_display].sample(4, random_state=42))


Number of columns: 15
Column names: Index(['conversation_id_str', 'created_at', 'favorite_count', 'full_text',
       'id_str', 'image_url', 'in_reply_to_screen_name', 'lang', 'location',
       'quote_count', 'reply_count', 'retweet_count', 'tweet_url',
       'user_id_str', 'username'],
      dtype='object')
                                              full_text lang      location
1750  VIDEO: Cak Imin Serang Balik Prabowo: Emang Et...   in     Indonesia
409   @gadisberjilbabb @aniesbaswedan Pemimpin itu K...   in     Indonesia
950   @ekky1995 Kalau Prabowo Gibran yg menang surve...   in  di hati kamu
227   Kembali kepada mata panda sebab asyik tidur la...   in           NaN
