In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# DATASET

In [None]:
df = pd.read_csv("/kaggle/input/dataset-pemilu-2/Dataset Pemilu 2 by crawling.csv")

In [None]:
df.info()

# DATA PREPROCESSING 

In [None]:
df.head()

In [None]:
# Remove the 'datetime' and 'username' columns.
# errors='ignore' keeps this cell re-runnable: the previous `del df[...]`
# statements raise KeyError when the cell is executed a second time, because
# the columns are already gone.
df = df.drop(columns=['datetime', 'username'], errors='ignore')

In [None]:
df.head()

# CLEANING

In [None]:
# Keep only the first row for each distinct value of the 'comments' column.
df = df.drop_duplicates(subset="comments", keep="first")

In [None]:
df.info()

In [None]:
import re
import string
import nltk

# Remove URLs
def remove_URL(tweet):
    """Strip ``http://``, ``https://`` and ``www.`` links from ``tweet``.

    Anything that is not a ``str`` (including ``None``) is returned unchanged.
    """
    if not isinstance(tweet, str):
        return tweet
    return re.sub(r'https?://\S+|www\.\S+', r'', tweet)

# Remove HTML tags
def remove_html(tweet):
    """Delete every ``<...>`` span (shortest match) from ``tweet``.

    Anything that is not a ``str`` (including ``None``) is returned unchanged.
    """
    if not isinstance(tweet, str):
        return tweet
    return re.sub(r'<.*?>', r'', tweet)

# Remove emoji
def remove_emoji(tweet):
    """Delete runs of characters that fall in the listed emoji code-point ranges.

    Anything that is not a ``str`` (including ``None``) is returned unchanged.
    """
    if not isinstance(tweet, str):
        return tweet

    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F700-\U0001F77F",  # alchemical symbols
        u"\U0001F780-\U0001F7FF",  # Geometric Shapes Extended
        u"\U0001F800-\U0001F8FF",  # Supplemental Arrows-C
        u"\U0001F900-\U0001F9FF",  # Supplemental Symbols and Pictographs
        u"\U0001FA00-\U0001FA6F",  # Chess Symbols
        u"\U0001FA70-\U0001FAFF",  # Symbols and Pictographs Extended-A
        u"\U0001F004-\U0001F0CF",  # Additional emoticons
        u"\U0001F1E0-\U0001F1FF",  # flags
    )
    # One character class holding all ranges, matched greedily as a run.
    emoji_run = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return emoji_run.sub(r'', tweet)

# Remove symbols
def remove_symbols(tweet):
    """Replace every character that is not a-z, A-Z, 0-9 or whitespace with a space.

    A space (rather than the empty string) is substituted so that words which
    were separated only by punctuation stay separate tokens: "bagus,tapi"
    becomes "bagus tapi" instead of the fused "bagustapi". The extra whitespace
    this can leave behind is harmless for whitespace-based tokenization.

    Anything that is not a ``str`` (including ``None``) is returned unchanged.
    """
    if isinstance(tweet, str):
        tweet = re.sub(r'[^a-zA-Z0-9\s]', ' ', tweet)
    return tweet

# Remove digits
def remove_numbers(tweet):
    """Delete every digit character from ``tweet``.

    Anything that is not a ``str`` (including ``None``) is returned unchanged.
    """
    if not isinstance(tweet, str):
        return tweet
    return re.sub(r'\d', '', tweet)

# Run the raw comments through each cleaning function in order and store the
# final result in the 'cleaning' column.
cleaning_steps = (remove_URL, remove_html, remove_emoji, remove_symbols, remove_numbers)
cleaned_comments = df['comments']
for cleaning_step in cleaning_steps:
    cleaned_comments = cleaned_comments.apply(cleaning_step)
df['cleaning'] = cleaned_comments

df.head(5)

# CASE FOLDING

In [None]:
def case_folding(text):
    """Return ``text`` lower-cased; non-string values are returned unchanged."""
    return text.lower() if isinstance(text, str) else text

# Lower-case the cleaned text element by element.
df['case_folding'] = df['cleaning'].map(case_folding)
df.head(10)

# TOKENIZATION 

In [None]:
# Tokenization
def tokenize(text):
    """Split ``text`` on whitespace; any non-string value yields an empty list."""
    return text.split() if isinstance(text, str) else []

# Turn each lower-cased text into its list of whitespace-separated tokens.
df['tokenize'] = df['case_folding'].map(tokenize)
df.head(5)

# FILTERING/ STOPWORD REMOVAL

In [None]:
from nltk.corpus import stopwords
# Download the NLTK stop-word corpus and load the Indonesian list.
# It is stored as a set because the list is only used for `in` checks:
# set membership is constant-time, whereas the original list was scanned
# linearly for every single token.
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))

In [None]:
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in the module-level ``stop_words``.

    ``text`` is iterated as a sequence of tokens; order is preserved.
    """
    kept_tokens = []
    for token in text:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Filter the stop words out of every token list.
df['stopword removal'] = df['tokenize'].apply(remove_stopwords)

df.head(5)

# STEMMING 

In [None]:
!pip install Sastrawi



In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [None]:
# Build the Sastrawi stemmer once at module level; stem_text() below reuses
# this single `stemmer` object for every word it processes.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_text(text):
    """Stem every word of ``text`` with the module-level ``stemmer``.

    ``text`` is iterated as a sequence of words; a list with one stem per
    word is returned, in the same order.
    """
    stems = []
    for word in text:
        stems.append(stemmer.stem(word))
    return stems

# Stem each token list, then join the stems back into one space-separated string.
stemmed_lists = df['stopword removal'].apply(stem_text)
df['stemming_data'] = stemmed_lists.apply(' '.join)

df.head(5)

In [None]:
df.info()

In [None]:
# Drop the rows that contain missing values
df_cleaned = df.dropna()

In [None]:
df_cleaned.info()

In [None]:
df_cleaned.to_csv('Hasil_Preprocessing_Data2.csv',encoding='utf8', index=False)

# CLASSIFICATION

In [None]:
import pandas as pd
import nltk
import re
import numpy as np
import gensim
import torch
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.optimizers import Adam
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel, AdamW
from transformers import pipeline

In [None]:
# Keep only the rows whose stemmed text has at least one character
# ('stemming_data' holds joined strings, so len() counts characters).
has_text = df['stemming_data'].map(len).gt(0)
df = df.loc[has_text]

In [None]:
# Label every stemmed text with the pretrained Indonesian RoBERTa sentiment model.
ROBERTA_MODEL_NAME = "ayameRushia/roberta-base-indonesian-1.5G-sentiment-analysis-smsa"
roberta_classifier = pipeline('sentiment-analysis', model=ROBERTA_MODEL_NAME, tokenizer=ROBERTA_MODEL_NAME)

# Let the tokenizer truncate to 512 *tokens*. The previous `x[:512]` slice cut
# the text after 512 *characters*, which discards far more text than the model
# limit requires. Passing the whole list in one call also lets the pipeline
# iterate internally instead of being invoked once per row.
texts = df['stemming_data'].tolist()
predictions = roberta_classifier(texts, truncation=True, max_length=512) if texts else []
df['roberta_label'] = [prediction['label'] for prediction in predictions]

In [None]:
# Store the predicted labels as a categorical column and record which integer
# position corresponds to which category label.
roberta_categories = df['roberta_label'].astype('category')
df['roberta_label'] = roberta_categories
label_mapping = {
    position: label
    for position, label in enumerate(roberta_categories.cat.categories)
}

In [None]:
df.to_csv('Hasil_Labeling.csv',encoding='utf8', index=False)

In [None]:
%cd /kaggle/working
from IPython.display import FileLink
FileLink('Hasil_Labeling.csv')

In [None]:
# Data splitting and CNN model

# No cell in this notebook creates a 'stemmed_tokens' column, so indexing
# df['stemmed_tokens'] raises KeyError. The stemmed tokens are rebuilt here by
# splitting the space-joined 'stemming_data' strings.
stemmed_tokens = df['stemming_data'].str.split().tolist()

# Pad (or cut) every token list at the end to the length of the longest one.
max_length = max((len(seq) for seq in stemmed_tokens), default=0)
# NOTE(review): the padded array holds string tokens (dtype=object, pad value '0');
# presumably these still need an integer encoding before an Embedding layer - confirm.
X_pad = pad_sequences(stemmed_tokens, maxlen=max_length, padding='post', truncating='post', value='0', dtype=object)
X = X_pad
y = df['roberta_label']
y

In [None]:
import os

# Requires df['roberta_label'] to exist already.

# Shorten each raw comment to its first 512 characters (string slicing counts
# characters, not tokens). The .str accessor leaves missing values untouched
# instead of raising like the previous per-row lambda slice.
df['comments'] = df['comments'].str[:512]

# Export destination.
# NOTE(review): this looks like a Google Drive mount path, while the rest of the
# notebook reads and writes under /kaggle - confirm the directory exists in the
# environment where this cell runs, otherwise to_csv fails.
drive_path = '/content/drive/MyDrive'  # Adjust this path based on your Google Drive folder structure

# File 1: only the comment text and its predicted label.
csv_file_path = os.path.join(drive_path, 'roberta_labels.csv')
df[['comments', 'roberta_label']].to_csv(csv_file_path, index=False)

# File 2: the complete frame. This previously wrote to csv_file_path again,
# which overwrote roberta_labels.csv and never created Dataclean.csv.
csv_file_path_1 = os.path.join(drive_path, 'Dataclean.csv')
df.to_csv(csv_file_path_1, index=False)

# WORD EMBEDDING

In [None]:
%%time

tokenized_tweet = combi['tidy_tweet'].apply(lambda x: x.split()) # tokenizing 

model_w2v = gensim.models.Word2Vec(
            tokenized_tweet,
            size=200, # desired no. of features/independent variables
            window=5, # context window size
            min_count=2, # Ignores all words with total frequency lower than 2.                                  
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling
            workers= 32, # no.of cores
            seed = 34
) 

model_w2v.train(tokenized_tweet, total_examples= len(combi['tidy_tweet']), epochs=20)