# Preparation

Install the Nanum Korean fonts for Google Colab. The runtime must be restarted after installation.

In [None]:
# Install the Nanum font package; the plots below select 'NanumBarunGothic' from it.
!sudo apt-get install -y fonts-nanum
# Rebuild the system font cache (-f: force, -v: verbose).
!sudo fc-cache -fv
# Delete matplotlib's cache directory; presumably so the new fonts are picked up after the restart.
!rm ~/.cache/matplotlib -rf

In [None]:
!pip install konlpy

In [None]:
import pandas as pd
import numpy as np
from time import time
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.font_manager as fm
from transformers import pipeline
from collections import Counter
from konlpy.tag import Okt
from wordcloud import WordCloud
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

plt.rc('font', family='NanumBarunGothic') # default plot font; a Nanum face so Korean characters render

## File Processing

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data files below are read from there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the chat data; later cells read its 'Content', 'Timestamp' and 'Author' columns.
df = pd.read_csv('/content/drive/MyDrive/cleaned_chat_data.csv')

### Content Validity Check

In [None]:
# A row is valid when its 'Content' value is present (not NaN/None).
# Counting directly on the column avoids adding a helper column to the shared
# DataFrame and dropping it again in place.
num_lines = len(df)
num_valid_lines = df['Content'].notna().sum()

# Report the total row count next to the number of rows that have content.
print(f"Number of lines: {num_lines}, Number of valid lines in 'Content': {num_valid_lines}")

### Monthly Analysis

In [None]:
# Parse 'Timestamp' into datetimes and derive each row's calendar month.
parsed_timestamps = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = parsed_timestamps
df['YearMonth'] = parsed_timestamps.dt.to_period('M')

# Show which months are present in the data.
unique_yearmonths = df['YearMonth'].unique()
print(unique_yearmonths)

In [None]:
def filter_data_by_month(df, year_month_str):
    """Return the rows of ``df`` whose 'YearMonth' equals the requested month.

    Args:
        df: DataFrame with a 'YearMonth' column of monthly periods.
        year_month_str: Month to select, e.g. '2023-07'.

    Returns:
        The filtered DataFrame, or None (after printing a message) when the
        input cannot be parsed as a month or no row falls in that month.
    """
    try:
        year_month = pd.Period(year_month_str, freq='M')
    except (ValueError, TypeError):
        # Only parsing failures are handled here; a bare ``except`` would also
        # swallow KeyboardInterrupt and unrelated programming errors.
        print("Invalid input format. Please use 'YYYY-MM' format.")
        return None

    # Build the row mask once and reuse it for the existence check and the filter.
    in_month = df['YearMonth'] == year_month
    if not in_month.any():
        print(f"The month {year_month} does not exist in the dataset.")
        return None

    return df[in_month]

## Tokenizer

### Stop Words

In [None]:
# Nouns that carry little meaning on their own; tokenize() drops them.
stop_words = {
    '저', '거', '뭐', '것', '그', '수', '더', '지금', '분', '그냥',
    '요', '제', '때', '혹시', '왜', '이', '좀', '해', '네', '안',
    '가요', '다시', '해당', '용', '넵', '전', '오', '또', '개', '오늘',
    '정도', '말씀', '말', '나', '내', '건가', '명', '넹', '은', '및',
    '알', '데', '중', '도', '건', '로', '게', '를', '여',
}

### Okt Tokenizer

In [None]:
# Create the Okt tagger once at module level; tokenize() below reuses this instance.
okt = Okt()

def tokenize(text):
    """Return the nouns in ``text`` that are not stop words.

    Args:
        text: A chat message. Missing values (None/NaN), other non-string
            values and blank strings are accepted and produce no tokens.

    Returns:
        A list of the nouns returned by ``okt.nouns``, in the same order,
        with every entry of ``stop_words`` removed.
    """
    # Rows without content arrive as NaN rather than as a string, so they
    # must be filtered out before the tagger is called.
    if not isinstance(text, str) or not text.strip():
        return []
    return [word for word in okt.nouns(text) if word not in stop_words]

# Chat Frequency over Time

In [None]:
def plot_messages(dataframe, frequency):
    """Draw a bar chart of message counts per resampling period.

    Args:
        dataframe: DataFrame with a datetime 'Timestamp' column and a
            'Content' column.
        frequency: Pandas resample rule. 'D', 'W' and 'M' get formatted date
            labels on the x axis; other rules keep the default labels.
    """
    # Non-null 'Content' entries in each period.
    per_period = dataframe.resample(frequency, on='Timestamp').count()['Content']

    plt.figure(figsize=(10, 6))
    ax = plt.gca()
    per_period.plot(kind='bar', ax=ax)

    # Replace the default tick labels with short dates for the known rules.
    if frequency in ['D', 'W', 'M']:
        date_format = '%Y-%m' if frequency == 'M' else '%Y-%m-%d'
        ax.set_xticks(range(len(per_period)))
        ax.set_xticklabels([stamp.strftime(date_format) for stamp in per_period.index])

    # Slant the labels so neighbouring dates do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

    ax.set_title(f'Number of Messages Sent Over Time ({frequency})')
    ax.set_xlabel('Time')
    ax.set_ylabel('Number of Messages')
    plt.tight_layout()
    plt.show()


In [None]:
# Parse 'Timestamp' with an explicit format.
# NOTE(review): the Monthly Analysis cell already converts this column with
# pd.to_datetime, so this looks redundant when cells run in order — confirm.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M')

# plot_messages(df, 'D')  # for daily
plot_messages(df, 'W')  # for weekly
plot_messages(df, 'M')  # for monthly

# Most Active Users

In [None]:
# Tally how many posts each author has written.
author_post_counts = df.groupby('Author').size()

# Rank authors from most to least active and keep the first 20.
ranked_authors = author_post_counts.sort_values(ascending=False)
top_authors = ranked_authors.iloc[:20]

In [None]:
# Bar chart of the 20 most active authors.
plt.figure(figsize=(10, 6))
ax = top_authors.plot.bar(align='center')

ax.set_title('Number of Posts by Top 20 Authors')
ax.set_xlabel('Author')
ax.set_ylabel('Number of Posts')
plt.xticks(rotation=45, ha='right')  # slanted so long author names stay readable

plt.tight_layout()
plt.show()

## Top 20 Active Users per Month

In [None]:
# Count posts per (YearMonth, Author) pair as a flat table with a 'PostCount' column.
# Note: this rebinds author_post_counts, which the previous section used for the
# all-time per-author Series.
author_post_counts = df.groupby(['YearMonth', 'Author']).size().reset_index(name='PostCount')

In [None]:
def plot_top_authors_for_month(df, year_month_str):
    """Plot the 20 authors with the highest 'PostCount' in one month.

    Args:
        df: Per-month author counts with 'YearMonth', 'Author' and
            'PostCount' columns.
        year_month_str: Month to plot, e.g. '2023-07'.
    """
    month_data = filter_data_by_month(df, year_month_str)
    if month_data is None:
        return

    # Highest post counts first, limited to 20 authors.
    ranked = month_data.sort_values(by='PostCount', ascending=False)
    counts_by_author = ranked.head(20).set_index('Author')['PostCount']

    plt.figure(figsize=(10, 6))
    counts_by_author.plot(kind='bar')
    plt.title(f'Top 20 Authors in {year_month_str}')
    plt.xlabel('Author')
    plt.ylabel('Number of Posts')
    plt.xticks(rotation=45, ha='right')
    plt.show()

In [None]:
# Example usage: pass the per-month counts table, not the raw chat DataFrame.
plot_top_authors_for_month(author_post_counts, '2023-07')  # Replace with the desired year-month

# Term Frequency

## Top 20 Terms per Month

### Bar Chart

In [None]:
def plot_top_terms_for_month(df, year_month_str):
    """Plot a bar chart of the 20 most frequent terms in one month.

    Args:
        df: Chat DataFrame with 'YearMonth' and 'Content' columns.
        year_month_str: Month to analyse, e.g. '2023-12'.

    Returns:
        None. Prints a message and returns early when the month is not
        available or no terms remain after tokenization.
    """
    month_data = filter_data_by_month(df, year_month_str)
    if month_data is None:
        return

    # Count tokens message by message. Summing the per-message lists with
    # Series.sum() re-copies the growing list for every row; rows with missing
    # content are skipped instead of being handed to the tokenizer.
    term_counts = Counter()
    for message in month_data['Content'].dropna():
        term_counts.update(tokenize(message))

    most_common_terms = term_counts.most_common(20)
    if not most_common_terms:
        # zip(*[]) gives nothing to unpack, so report and stop instead.
        print(f"No terms found for {year_month_str}.")
        return

    # Split the (term, count) pairs into parallel sequences for plotting.
    terms, counts = zip(*most_common_terms)

    plt.figure(figsize=(10, 6))
    plt.bar(terms, counts)
    plt.title(f'Top 20 Frequent Terms in {year_month_str}')
    plt.xlabel('Terms')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Example usage on the full chat DataFrame.
plot_top_terms_for_month(df, '2023-12')  # Replace with the desired year-month

### Word Cloud

In [None]:
# Load the image used as the word-cloud mask and convert it to a NumPy array.
# NOTE(review): /content/bear.jpg is not created anywhere in this notebook —
# presumably it has to be uploaded to the Colab session first.
mask_image = np.array(Image.open('/content/bear.jpg'))

In [None]:
def generate_wordcloud_for_month(df, year_month_str, mask):
    """Draw a word cloud of the tokenized messages of one month.

    Args:
        df: Chat DataFrame with 'YearMonth' and 'Content' columns.
        year_month_str: Month to analyse, e.g. '2023-12'.
        mask: Image array passed to WordCloud as its ``mask``.

    Returns:
        None. Prints a message and returns early when the month is not
        available or no terms remain after tokenization.
    """
    month_data = filter_data_by_month(df, year_month_str)
    if month_data is None:
        return

    # Collect tokens with extend() rather than Series.sum(), which re-copies
    # the growing list for every row. Rows with missing content are skipped
    # instead of being handed to the tokenizer.
    terms = []
    for message in month_data['Content'].dropna():
        terms.extend(tokenize(message))

    if not terms:
        # Nothing to draw; stop before asking WordCloud to lay out empty text.
        print(f"No terms found for {year_month_str}.")
        return

    term_string = ' '.join(terms)  # WordCloud.generate expects one text string

    wordcloud = WordCloud(font_path='/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf',
                          width=800, height=800,
                          background_color='white',
                          max_font_size=160,
                          mask=mask).generate(term_string)

    plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for {year_month_str}')
    plt.show()

In [None]:
# Example usage with the mask image loaded above.
generate_wordcloud_for_month(df, '2023-12', mask_image)  # Replace with the desired year-month

# Sentiment Analysis

## Test Data

In [None]:
# Load the labelled evaluation set; later cells read its 'Text' and 'Label' columns.
test_df = pd.read_excel('/content/sentiment_analysis_test.xlsx')
test_df.head()

## WhitePeak/bert-base-cased-Korean-sentiment

LABEL_0: negative,
LABEL_1: positive.  

Accuracy Result on test dataset: 70%.

In [None]:
# Build the pipeline from the model id alone; no task name is passed explicitly.
sentiment_pipeline = pipeline(model="WhitePeak/bert-base-cased-Korean-sentiment")

In [None]:
def get_sentiment_label(text, neutral_threshold=0.6):
    """Classify ``text`` as '0' (negative), '1' (positive) or '2' (neutral).

    Args:
        text: Message to classify.
        neutral_threshold: Minimum score required to keep the model's own
            label; predictions scoring below it are reported as neutral.

    Returns:
        A one-character string: the last character of the model's label
        ('LABEL_0' -> '0', 'LABEL_1' -> '1'), or '2' for low-confidence
        predictions.
    """
    # truncation=True lets the pipeline cut inputs that exceed the model's
    # maximum length.
    result = sentiment_pipeline(text, truncation=True)[0]

    # The model only distinguishes negative/positive, so a low-confidence
    # answer is mapped to the additional neutral class.
    if result['score'] < neutral_threshold:
        return '2'
    return result['label'][-1]

Apply to test dataset

In [None]:
# Predict a label for every row of the 'Text' column. get_sentiment_label
# returns a plain string, so it is applied directly instead of wrapping each
# result in a one-element Series.
test_df['predicted_label'] = test_df['Text'].apply(get_sentiment_label)

In [None]:
# Predictions are strings ('0'/'1'/'2'); cast them to int before comparing
# with 'Label' (presumably integer-coded — confirm against the test file).
test_df['predicted_label'] = test_df['predicted_label'].astype(int)
test_df['is_correct'] = test_df['predicted_label'].eq(test_df['Label'])

# Share of rows whose prediction matches the reference label.
accuracy = test_df['is_correct'].mean()
print(f"Accuracy: {accuracy}")

Apply to original dataset

In [None]:
# Label every message that has text. Rows whose 'Content' is missing are
# skipped, because the classifier needs a string, and keep NaN in 'Sentiment'
# (the assignment aligns on the index).
df['Sentiment'] = df['Content'].dropna().apply(get_sentiment_label)

print(df.head())

# Save the labelled data next to the input file; utf-8-sig writes a UTF-8
# byte-order mark at the start of the CSV.
df.to_csv('/content/drive/MyDrive/cleaned_chat_data_with_sentiments.csv', index=False, encoding='utf-8-sig')

## ChatGPT - GPT4

Simply asked GPT-4 to label the test data with negative (0), positive (1), or neutral (2)

Accuracy Result on test dataset: 100%.


**Reference:**

* Gilardi, Fabrizio, Meysam Alizadeh, and Maël Kubli. "Chatgpt outperforms crowd-workers for text-annotation tasks." arXiv preprint arXiv:2303.15056 (2023).

## Visualization

In [None]:
# Read back the CSV written after sentiment labelling, so this section works
# from the saved file rather than from the in-memory DataFrame.
df_sentiment = pd.read_csv('/content/drive/MyDrive/cleaned_chat_data_with_sentiments.csv', encoding='utf-8-sig')

In [None]:
# Map the numerical labels to descriptive names.
sentiment_label_map = {
    0: 'Negative',
    1: 'Positive',
    2: 'Neutral'
}
# Values without an entry in the map are kept unchanged. This makes the cell
# safe to re-run (names that were already translated stay as they are instead
# of becoming NaN) and keeps unexpected labels visible in the counts.
df_sentiment['Sentiment'] = df_sentiment['Sentiment'].map(
    lambda label: sentiment_label_map.get(label, label)
)

# Count the frequency of each sentiment label (missing values are not counted).
sentiment_counts = df_sentiment['Sentiment'].value_counts()

In [None]:
# Pie chart of the share of each sentiment label.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=140)
ax.set_title('Distribution of Sentiments', pad=20)
ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

# Topic Modeling

## Latent Dirichlet Allocation (LDA)

Visualization

In [None]:
def plot_top_words(model, n_topics, feature_names, n_top_words, title):
    """Plot one horizontal bar chart of top-weighted words per topic.

    Args:
        model: Fitted topic model exposing ``components_`` (one weight row
            per topic).
        n_topics: Number of panels to draw; must cover every row of
            ``model.components_``.
        feature_names: Vocabulary aligned with the columns of
            ``model.components_``.
        n_top_words: Number of words to show per topic.
        title: Figure title.
    """
    # squeeze=False always returns a 2-D array of Axes, so flatten() also
    # works when n_topics == 1 (otherwise a single Axes object is returned).
    fig, axes = plt.subplots(1, n_topics, figsize=(25, 5), sharex=True, squeeze=False)
    axes = axes.flatten()

    # Index-array lookups below need an ndarray, not a plain list.
    feature_names = np.asarray(feature_names)

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so the last n_top_words indices are the
        # heaviest words; barh then draws the heaviest word at the top.
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 18})
        ax.tick_params(axis="both", which="major", labelsize=15)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    # Set the figure title once; increasing y moves the title higher.
    fig.suptitle(title, fontsize=24, y=1.05)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

LDA Analysis for a Specific Month

* Here we treat each text as a document

In [None]:
def lda_analysis_for_month(df, year_month_str, n_topics=5, n_top_words=10):
    """Fit an LDA topic model on one month of messages and plot its topics.

    Every message is treated as one document.

    Args:
        df: Chat DataFrame with 'YearMonth' and 'Content' columns.
        year_month_str: Month to analyse, e.g. '2023-12'.
        n_topics: Number of LDA topics.
        n_top_words: Number of words shown per topic.

    Returns:
        None. Prints a message and returns early when the month is not
        available or no vocabulary can be built from its messages.
    """
    month_data = filter_data_by_month(df, year_month_str)
    if month_data is None:
        return

    # One space-joined noun string per message; missing content becomes ''.
    # Building a separate Series avoids copying the month's frame just to
    # hold a temporary column.
    documents = month_data['Content'].apply(
        lambda x: ' '.join(tokenize(x)) if pd.notnull(x) else ''
    )

    # Token-count matrix: terms found in more than 95% of the documents or in
    # fewer than 2 documents are dropped, and at most 2500 terms are kept.
    # NOTE(review): CountVectorizer's default token_pattern keeps only tokens
    # of two or more characters, so one-character nouns from tokenize() never
    # reach the LDA vocabulary — confirm this is intended.
    vectorizer = CountVectorizer(
        max_df=0.95, min_df=2,
        max_features=2500
        )
    try:
        X = vectorizer.fit_transform(documents)
    except ValueError as err:
        # Raised when pruning leaves no terms, e.g. for a month with very few
        # messages; report it rather than aborting the notebook run.
        print(f"Cannot build a vocabulary for {year_month_str}: {err}")
        return

    lda = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=50, # epochs
        random_state=0 # for reproducible results
        )
    t0 = time()
    lda.fit(X)
    print("LDA done in %0.3fs." % (time() - t0))

    plot_top_words(
        lda, n_topics, vectorizer.get_feature_names_out(), n_top_words,
        f"Top {n_top_words} words for topics in {year_month_str}"
        )

In [None]:
# Example usage: five topics with ten words each for December 2023.
lda_analysis_for_month(df, '2023-12', n_topics=5, n_top_words=10)