In [1]:
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from wordcloud import WordCloud

In [2]:
# NLTK resources needed further down: the tokenizer models behind
# nltk.word_tokenize and the stop-word lists. NLTK 3.9+ loads the tokenizer
# from 'punkt_tab' rather than 'punkt', so both are fetched to keep the
# notebook runnable on old and new installations.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Input data for the preprocessing step.
df = pd.read_csv('DataFix_Translate.csv', encoding='utf-8')

# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('indonesian'))
# Single Sastrawi stemmer instance, created once and reused for every token.
stemmer = StemmerFactory().create_stemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Slang word -> replacement; an empty string removes the word entirely.
slang_map = {
    'gak': 'tidak',
    'ga': 'tidak',
    'nggak': 'tidak',
    'nya': '',
    'sih': '',
    'aja': 'saja',
}
# One case-insensitive alternation over all keys, anchored on word boundaries
# so that only standalone words match (not substrings of longer words).
slang_pattern = re.compile(
    r'\b({})\b'.format('|'.join(re.escape(word) for word in slang_map)),
    flags=re.IGNORECASE,
)

def normalize_slang(text):
    """Return ``text`` with every match of ``slang_pattern`` replaced by its ``slang_map`` value."""
    def _replacement(match):
        # The pattern matches case-insensitively, while the map keys are lowercase.
        return slang_map[match.group(0).lower()]

    return slang_pattern.sub(_replacement, text)

def reduce_lengthening(text):
    """Shorten runs of three or more identical characters to exactly two.

    Used to tame expressive elongation such as ``'baguuuus'`` -> ``'baguus'``.
    Digits are deliberately excluded: collapsing them would silently change
    numeric values (``'100000'`` would become ``'100'``). Newlines are never
    collapsed either.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with elongated non-digit runs reduced to two characters.
    """
    # [^\d\n] = any character except digits and newline.
    return re.sub(r'([^\d\n])\1{2,}', r'\1\1', text)

def preprocess_id(text):
    """Clean one text and return its list of stemmed tokens.

    Steps, in order: lowercase; blank out URLs (``http...``, ``www....``) and
    hashtags; normalize slang; shorten elongated character runs; replace every
    character other than ``a-z``, ``0-9`` and whitespace with a space; collapse
    whitespace; tokenize; drop tokens found in ``stop_words``; stem the rest.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    cleaned = re.sub(r'http\S+|www\.\S+|#\w+', ' ', text.lower())
    cleaned = reduce_lengthening(normalize_slang(cleaned))
    cleaned = re.sub(r'[^a-z0-9\s]', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    # Stop words are filtered before stemming, so only surviving tokens are stemmed.
    return [
        stemmer.stem(word)
        for word in nltk.word_tokenize(cleaned)
        if word not in stop_words
    ]

# Missing texts become empty strings and every value is cast to str before
# preprocessing, so preprocess_id always receives a string.
df['tokens_id'] = (
    df['text_id']
    .fillna('')
    .astype(str)
    .apply(preprocess_id)
)

# NOTE(review): tokens_id holds Python lists; presumably they land in the CSV
# as their string form - confirm that downstream readers parse them back.
df.to_csv('DataFix_Preprocessed.csv', encoding='utf-8', index=False)

# Plain-text preview of the first 20 rows, original text next to its tokens.
print(df.head(20)[['text_id', 'tokens_id']].to_string(index=False))

                                                                                                                                                                text_id                                                                                                                                   tokens_id
                                                                                                                                        wtb HM laki-laki dilipat tangan                                                                                                        [wtb, hm, laki, laki, lipat, tangan]
                                                                                                                                                                api api                                                                                                                                  [api, api]
                                                                            

In [None]:
def plot_sentiment_timeline_monthly(data, title):
    """Draw a line chart of monthly row counts, one line per rating.

    Rows of ``data`` are grouped by the calendar month of ``publish_date`` and
    by ``Rating``; the size of each group is plotted against the month.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``publish_date`` column (datetime values, or values
        that ``pd.to_datetime`` can parse) and a ``Rating`` column.
    title : str
        Title shown above the chart.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Parse defensively: the ``.dt`` accessor only exists on datetime-typed
    # Series, and a date column read from CSV without parsing is plain text.
    months = pd.to_datetime(data['publish_date']).dt.to_period('M')

    # Group by month and rating, count occurrences
    counts_per_month = data.groupby([months, 'Rating']).size().reset_index(name='jumlah')
    # Periods are turned into strings so they can be used as x-axis labels.
    counts_per_month['publish_date'] = counts_per_month['publish_date'].astype(str)

    # Create the plot on an explicit Axes instead of the implicit pyplot state
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(data=counts_per_month, x='publish_date', y='jumlah', hue='Rating', marker='o', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Bulan')
    ax.set_ylabel('Jumlah')
    ax.legend(title='Rating')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()

# Create one monthly chart per period, skipping any period that has no rows.
# NOTE(review): sebelum_hari_h and sesudah_hari_h are not defined in the cells
# visible here - confirm an earlier cell creates them before this one runs.
for subset, plot_title in (
    (sebelum_hari_h, 'Distribusi Rating Bulanan Sebelum Hari H'),
    (sesudah_hari_h, 'Distribusi Rating Bulanan Sesudah Hari H'),
):
    if subset.empty:
        continue
    plot_sentiment_timeline_monthly(subset, plot_title)