# Import

In [None]:
# install Korean fonts Nanum
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.font_manager as fm
from transformers import pipeline

# Register the Nanum font files with matplotlib's font manager explicitly.
# Fonts installed during this session are not guaranteed to be in the font list
# matplotlib already loaded, in which case the family below would silently fall
# back to a font without Korean glyphs.
# NOTE(review): assumes fonts-nanum installs under this directory — confirm on the target image.
NANUM_FONT_DIR = '/usr/share/fonts/truetype/nanum'
for font_path in fm.findSystemFonts(fontpaths=[NANUM_FONT_DIR]):
    fm.fontManager.addfont(font_path)

plt.rc('font', family='NanumBarunGothic')  # set font to display Korean characters

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab filesystem
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Read file

In [None]:
# Read cleaned_chat_data.csv from the mounted Drive into a DataFrame
CHAT_DATA_PATH = '/content/drive/MyDrive/cleaned_chat_data.csv'
df = pd.read_csv(CHAT_DATA_PATH)

# Chat Frequency over Time

In [None]:
def plot_messages(dataframe, frequency):
    """Draw a bar chart of the number of messages per time bucket.

    Args:
        dataframe: DataFrame with a datetime-like 'Timestamp' column and a
            'Content' column. Non-null 'Content' values are counted.
        frequency: pandas offset alias passed to ``resample``, e.g. 'D', 'W',
            'M'. The newer month aliases 'ME' and 'MS' are also given
            compact month labels.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Aliases whose buckets are labelled by day vs. by month. Recent pandas
    # versions deprecate 'M' in favour of 'ME', so both spellings are accepted.
    day_level = ('D', 'W')
    month_level = ('M', 'ME', 'MS')

    # Count only the 'Content' column instead of counting every column and
    # then discarding all but one.
    message_counts = dataframe.resample(frequency, on='Timestamp')['Content'].count()

    # pandas refuses to bar-plot empty data; report it instead of raising.
    if message_counts.empty:
        print('No messages to plot.')
        return

    fig, ax = plt.subplots(figsize=(10, 6))
    message_counts.plot(kind='bar', ax=ax)

    # Replace the default full-timestamp tick labels with compact dates.
    if frequency in day_level + month_level:
        date_format = '%Y-%m-%d' if frequency in day_level else '%Y-%m'
        ax.set_xticks(range(len(message_counts)))
        ax.set_xticklabels([stamp.strftime(date_format) for stamp in message_counts.index])

    # Rotate and right-align the tick labels so neighbouring dates do not overlap
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

    ax.set_title(f'Number of Messages Sent Over Time ({frequency})')
    ax.set_xlabel('Time')
    ax.set_ylabel('Number of Messages')
    fig.tight_layout()
    plt.show()


In [None]:
# Parse the 'Timestamp' column using the explicit 'YYYY-MM-DD HH:MM' layout
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M'
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format=TIMESTAMP_FORMAT)

# Weekly, then monthly chart; add 'D' to the tuple for a daily chart
for freq in ('W', 'M'):
    plot_messages(df, freq)

# Most Active Users

In [None]:
# How many of the most active authors to keep
TOP_N_AUTHORS = 20

# Number of rows (posts) per author
author_post_counts = df.groupby('Author').size()

# Order authors from most to fewest posts and keep the leading slice
top_authors = (
    author_post_counts
    .sort_values(ascending=False)
    .iloc[:TOP_N_AUTHORS]
)

In [None]:
# Bar chart of post counts for the most active authors
fig, ax = plt.subplots(figsize=(10, 6))
top_authors.plot(kind='bar', align='center', ax=ax)

ax.set_title('Number of Posts by Top 20 Authors')
ax.set_xlabel('Author')
ax.set_ylabel('Number of Posts')
# Tilt the author names by 45 degrees, centred on their ticks
plt.setp(ax.get_xticklabels(), rotation=45, ha='center')

fig.tight_layout()  # keep the rotated labels inside the figure
plt.show()

## Top 20 Most Active Users for Each Month

In [None]:
# Make sure 'Timestamp' holds datetimes before using the .dt accessor
timestamps = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = timestamps

# Monthly period (year + month) of every message
df['YearMonth'] = timestamps.dt.to_period('M')

# Posts per (month, author) pair, flattened into a table with a 'PostCount' column
posts_by_month_and_author = df.groupby(['YearMonth', 'Author']).size()
author_post_counts = posts_by_month_and_author.reset_index(name='PostCount')

List all the available months

In [None]:
# Distinct months present in the data, in order of first appearance
year_month_column = df['YearMonth']
unique_yearmonths = year_month_column.unique()

# Show which months can be passed to the per-month plot
print(unique_yearmonths)

In [None]:
# Function to plot top 20 authors for a given year and month
def plot_top_authors_for_month(year_month, top_n=20):
    """Plot the most active authors of one month as a bar chart.

    Reads the notebook-level ``df`` (for the list of available months) and
    ``author_post_counts`` (posts per 'YearMonth'/'Author' pair).

    Args:
        year_month: Month to plot, in 'YYYY-MM' form (e.g. '2023-05').
        top_n: Number of authors to show. Defaults to 20.

    Returns:
        None. Prints a message and returns early when the input cannot be
        parsed as a month or the month is absent from the dataset.
    """
    # Parse the input; catch only parsing failures so that unrelated errors
    # (e.g. KeyboardInterrupt) are not swallowed.
    try:
        year_month = pd.Period(year_month, freq='M')
    except (ValueError, TypeError):
        print("Invalid input format. Please use 'YYYY-MM' format.")
        return

    # Check if the year_month exists in the dataset
    if year_month not in df['YearMonth'].values:
        print(f"The month {year_month} does not exist in the dataset.")
        return

    # Rows for the requested month, ordered from most to fewest posts
    month_data = author_post_counts[author_post_counts['YearMonth'] == year_month]
    monthly_top = month_data.sort_values(by='PostCount', ascending=False).head(top_n)

    # Plotting
    fig, ax = plt.subplots(figsize=(10, 6))
    monthly_top.set_index('Author')['PostCount'].plot(kind='bar', ax=ax)
    ax.set_title(f'Top {top_n} Authors in {year_month}')
    ax.set_xlabel('Author')
    ax.set_ylabel('Number of Posts')
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()  # keep rotated author names inside the figure
    plt.show()

In [None]:
# Example: replace the value with any 'YYYY-MM' month present in the data
EXAMPLE_MONTH = '2024-01'
plot_top_authors_for_month(EXAMPLE_MONTH)

# Sentiment Analysis

Test Data

In [None]:
# Load the sentiment test set and preview its first rows
SENTIMENT_TEST_PATH = '/content/sentiment_analysis_test.xlsx'
test_df = pd.read_excel(SENTIMENT_TEST_PATH)
test_df.head()

## bert-base-cased-Korean-sentiment

LABEL_0: negative,
LABEL_1: positive. Predictions with a confidence score below 0.6 are treated as neutral (2). Obtained 70% accuracy on the test dataset.

In [None]:
# Build the Hugging Face pipeline for the Korean sentiment checkpoint
SENTIMENT_MODEL_ID = "WhitePeak/bert-base-cased-Korean-sentiment"
sentiment_pipeline = pipeline(model=SENTIMENT_MODEL_ID)

In [None]:
# Function to apply the sentiment pipeline and determine the label
def get_sentiment_label(text, neutral_threshold=0.6):
    """Classify ``text`` with the notebook-level ``sentiment_pipeline``.

    Args:
        text: The text to classify.
        neutral_threshold: Predictions whose score is below this value are
            relabelled as neutral. Defaults to 0.6.

    Returns:
        A ``(label, score)`` tuple. ``label`` is a string: the suffix of the
        pipeline label ('0' or '1' for 'LABEL_0' / 'LABEL_1'), or '2'
        (neutral) when ``score`` is below ``neutral_threshold``. ``score`` is
        the pipeline's score for its top prediction.
    """
    # truncation=True cuts inputs that exceed the model's maximum length
    # instead of letting the forward pass fail on them.
    result = sentiment_pipeline(text, truncation=True)[0]

    # Labels look like 'LABEL_0'; take everything after the last underscore
    # rather than only the final character.
    label = result['label'].rsplit('_', 1)[-1]
    score = result['score']

    # Low-confidence predictions are treated as neutral
    if score < neutral_threshold:
        label = '2'
    return label, score

In [None]:
def _label_and_score_as_series(text):
    """Wrap the (label, score) pair in a Series so ``apply`` expands it into two columns."""
    return pd.Series(get_sentiment_label(text))


# Run the classifier on every row of the 'Text' column and store both outputs
prediction_columns = ['predicted_label', 'sentiment_score']
test_df[prediction_columns] = test_df['Text'].apply(_label_and_score_as_series)

In [None]:
# Cast the predicted labels to int so they compare with 'Label'
predicted = test_df['predicted_label'].astype(int)
test_df['predicted_label'] = predicted

# Mark the rows where the prediction matches the actual label
test_df['is_correct'] = predicted.eq(test_df['Label'])

# Accuracy is the share of correctly labelled rows
accuracy = test_df['is_correct'].mean()
print(f"Accuracy: {accuracy}")

## ChatGPT

ChatGPT was simply asked to label the test data as negative (0), positive (1), or neutral (2); it achieved 100% accuracy.