In [None]:
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import seaborn as sns

import matplotlib.pyplot as plt

# Load the preprocessed dataset, parse 'publish_date' into a datetime 'date'
# column, then derive a monthly period column ('month') used for grouping.
df = (
    pd.read_csv('DataFix_Preprocessed.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['publish_date']))
    .assign(month=lambda frame: frame['date'].dt.to_period('M'))
)

# Function to get word frequencies for each month
def get_monthly_word_freq(data):
    """Return the ten most frequent words of the 'text' column for every month.

    Args:
        data: DataFrame with a 'month' column (grouping key) and a 'text'
            column; text values are cast to ``str`` before being tokenised.

    Returns:
        dict mapping each unique month to a ``{word: count}`` dict holding at
        most ten entries, ordered from most to least frequent.
    """
    top_words_by_month = {}

    for period in data['month'].unique():
        # All texts of this month, glued together and split on whitespace
        texts_in_period = data.loc[data['month'] == period, 'text']
        tokens = ' '.join(texts_in_period.astype(str)).split()
        # most_common keeps first-seen order among equally frequent words
        top_words_by_month[period] = dict(Counter(tokens).most_common(10))

    return top_words_by_month

# Calculate word frequencies by month
monthly_frequencies = get_monthly_word_freq(df)

# Print the top words and draw one word cloud per month.
# Each month gets its own figure; no figure is created outside the loop,
# because an extra one would stay empty and show up as a blank plot.
for month, word_freq in monthly_frequencies.items():
    print(f"\nTop words for {month}:")
    for word, freq in word_freq.items():
        print(f"{word}: {freq}")

    # NOTE(review): WordCloud is expected to raise on an empty frequency
    # mapping - confirm for the installed version. Months without any words
    # are skipped instead of aborting the whole loop.
    if not word_freq:
        continue

    # Create word cloud for each month
    fig, ax = plt.subplots(figsize=(10, 5))
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
    ax.imshow(wordcloud)
    ax.set_title(f'Word Cloud for {month}')
    ax.axis('off')
    plt.show()
    # Release the figure so a long list of months does not pile up in memory
    plt.close(fig)
    
Dari kode ini, buat penyesuaian agar tidak menampilkan word cloud, melainkan hanya menampilkan top 10 kata per bulan dalam bentuk tabel, serta jangan sertakan kata-kata tertentu yang nanti saya daftarkan.

In [None]:
# NOTE(review): `words_to_exclude` and the two-argument version of
# `get_monthly_word_freq` are only defined further down in the notebook, so on
# a fresh kernel this cell depends on out-of-order execution.
monthly_frequencies = get_monthly_word_freq(df, words_to_exclude)

# Build one plain-text table per month (sorted by month) and print it at once
for month in sorted(monthly_frequencies):
    table_lines = [
        f"\nTop 10 words for {month}:",
        "-" * 40,
        f"{'Word':<20} {'Frequency':>10}",
        "-" * 40,
    ]
    for word, freq in monthly_frequencies[month].items():
        table_lines.append(f"{word:<20} {freq:>10}")
    print("\n".join(table_lines))

In [None]:
import pandas as pd
from collections import Counter

# Function to get word frequencies for each month
def get_monthly_word_freq(data, words_to_exclude, top_n=10):
    """Return the most frequent words of the 'text' column for each month.

    Args:
        data: DataFrame with a 'month' column (grouping key) and a 'text'
            column; text values are cast to ``str`` before being tokenised.
        words_to_exclude: Iterable of words to leave out of the counts.
            Matching is case-insensitive on both sides.
        top_n: Number of words to keep per month (default 10).

    Returns:
        dict mapping each unique month to a ``{word: count}`` dict ordered
        from most to least frequent. Words keep their original casing, so
        'Marathon' and 'marathon' are counted as two different words.
    """
    # Normalise the exclusion entries once. Tokens are lower-cased before the
    # membership test, so an entry such as 'WTB' would otherwise never match.
    # A set also makes each lookup O(1) instead of scanning a list per token.
    # Non-string entries can never equal a token and are dropped.
    excluded = {entry.lower() for entry in words_to_exclude if isinstance(entry, str)}
    monthly_word_freq = {}

    for month in data['month'].unique():
        # Get text data for the specific month
        month_text = ' '.join(data.loc[data['month'] == month, 'text'].astype(str))
        # Whitespace tokenisation, skipping excluded words
        word_freq = Counter(
            word for word in month_text.split() if word.lower() not in excluded
        )
        # most_common keeps first-seen order among equally frequent words
        monthly_word_freq[month] = dict(word_freq.most_common(top_n))

    return monthly_word_freq

# Words that must never show up in the ranking (extend this list as needed)
words_to_exclude = ['di', 'ke', 'dari', 'dan', 'yang', 'yg', 'ada', 'ini', 'itu', 'ya', 'bisa', 'ga', 'gak', '']

# Top words per month with the excluded words filtered out
monthly_frequencies = get_monthly_word_freq(df, words_to_exclude)

# One plain-text table per month, sorted by month
for month, top_words in sorted(monthly_frequencies.items(), key=lambda item: item[0]):
    # Title, rule, column header and rule are emitted as four separate lines
    print(
        f"\nTop 10 words for {month}:",
        "-" * 40,
        f"{'Word':<20} {'Frequency':>10}",
        "-" * 40,
        sep="\n",
    )
    for word, freq in top_words.items():
        print(f"{word:<20} {freq:>10}")

Bagaimana cara membuat agar yang dicetak untuk setiap bulan adalah kata tunggal (1 kata) yang sering muncul, lalu frasa 2 kata yang sering muncul, lalu frasa 3 kata yang sering muncul?


Top 10 words for 2024-06:
----------------------------------------
Word                  Frequency
----------------------------------------
Marathon                     18
mau                          18
🔥🔥🔥                          18
Maybank                      17
@maybankmarathon             17
HM                           16
marathon                     16
gak                          16
🔥                            14
nya                          13

Top 10 words for 2024-07:
----------------------------------------
Word                  Frequency
----------------------------------------
ga                           39
WTB                          34
😂                            34
ikut                         31
mau                          29
🔥🔥🔥                          29
Maybank                      27
FM                           25
kak                          25
masih                        24

Top 10 words for 2024-08:
----------------------------------------
Word      

In [6]:
import pandas as pd
from collections import Counter

def get_ngrams(text, n):
    """Return every run of ``n`` consecutive whitespace-separated words in ``text``.

    Args:
        text: String to tokenise; it is split on any whitespace.
        n: Number of words per n-gram; must be at least 1.

    Returns:
        List of n-grams in order of appearance, each joined by a single space.
        The list is empty when ``text`` has fewer than ``n`` words.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # Without this guard n == 0 would return a list of empty strings and a
        # negative n would return meaningless slices.
        raise ValueError(f"n must be a positive integer, got {n!r}")

    words = text.split()
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

def get_monthly_phrase_freq(data, words_to_exclude, top_n=5):
    """Count the most frequent 1-, 2- and 3-word phrases for each month.

    Args:
        data: DataFrame with a 'month' column (grouping key) and a 'text'
            column; text values are cast to ``str`` before being tokenised.
        words_to_exclude: Iterable of words to drop before counting.
            Matching is case-insensitive on both sides.
        top_n: Number of entries to keep per n-gram size (default 5).

    Returns:
        dict mapping each unique month to
        ``{'unigrams': {...}, 'bigrams': {...}, 'trigrams': {...}}``, where
        every inner dict maps phrase -> count ordered from most to least
        frequent. Phrases keep their original casing.
    """
    # Normalise the exclusion entries once. Tokens are lower-cased before the
    # membership test, so an entry such as 'WTB' would otherwise never match.
    # A set also makes each lookup O(1) instead of scanning a list per token.
    # Non-string entries can never equal a token and are dropped.
    excluded = {entry.lower() for entry in words_to_exclude if isinstance(entry, str)}
    ngram_sizes = {'unigrams': 1, 'bigrams': 2, 'trigrams': 3}
    monthly_frequencies = {}

    for month in data['month'].unique():
        # Get text data for the specific month
        month_texts = data.loc[data['month'] == month, 'text'].astype(str).tolist()

        # One counter per n-gram size
        counters = {label: Counter() for label in ngram_sizes}

        for text in month_texts:
            # Excluded words are removed before the n-grams are built, so a
            # phrase can bridge the position of a removed word. N-grams are
            # built per text and never span two rows.
            text_clean = ' '.join(
                word for word in text.split() if word.lower() not in excluded
            )
            for label, size in ngram_sizes.items():
                counters[label].update(get_ngrams(text_clean, size))

        # most_common keeps first-seen order among equally frequent phrases
        monthly_frequencies[month] = {
            label: dict(counter.most_common(top_n))
            for label, counter in counters.items()
        }

    return monthly_frequencies

# Top phrases (1, 2 and 3 words) per month, excluded words filtered out
monthly_phrase_freq = get_monthly_phrase_freq(df, words_to_exclude)

# Heading and result key for each section of the report, in print order
report_sections = (
    ("\nTop 5 Single Words:", 'unigrams'),
    ("\nTop 5 Two-Word Phrases:", 'bigrams'),
    ("\nTop 5 Three-Word Phrases:", 'trigrams'),
)

# One report per month, sorted by month
for month in sorted(monthly_phrase_freq):
    print(f"\n=== Month: {month} ===")

    for heading, ngram_key in report_sections:
        print(heading)
        print("-" * 50)
        for phrase, freq in monthly_phrase_freq[month][ngram_key].items():
            print(f"{phrase:<30} {freq:>10}")

    # Closing rule between months
    print("\n" + "=" * 50)


=== Month: 2024-06 ===

Top 5 Single Words:
--------------------------------------------------
Marathon                               18
mau                                    18
🔥🔥🔥                                    18
Maybank                                17
@maybankmarathon                       17

Top 5 Two-Word Phrases:
--------------------------------------------------
Maybank Marathon                       11
maybank marathon                        7
Long Run                                4
WTB 10k                                 4
1 slot                                  4

Top 5 Three-Word Phrases:
--------------------------------------------------
Sunday Long Run                         3
to Maybank Marathon                     3
penginapan dekat start                  3
dekat start line                        3
Mandiri Jogja Marathon                  2


=== Month: 2024-07 ===

Top 5 Single Words:
--------------------------------------------------
ga                     