# Import

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from transformers import pipeline

In [None]:
# Mount Google Drive at /content/drive so that files stored under
# /content/drive/MyDrive can be read from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the cleaned chat data from the mounted Drive. Later cells rely on its
# 'Timestamp' and 'Content' columns.
df = pd.read_csv('/content/drive/MyDrive/cleaned_chat_data.csv')

# Chat Frequency over Time

In [None]:
def plot_messages(dataframe, frequency):
    """Show a bar chart of the number of messages sent per time bucket.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs a datetime-like 'Timestamp' column, used for bucketing, and a
        'Content' column. Rows whose 'Content' is missing are not counted.
    frequency : str or pandas offset
        Resampling rule handed to ``DataFrame.resample``, e.g. 'D', 'W', 'M'.
        Day and week rules get 'YYYY-MM-DD' tick labels and month rules get
        'YYYY-MM'. This also holds for multiplied or anchored spellings such
        as '2W' or 'W-MON', and for the 'ME' / 'MS' month aliases. Any other
        rule keeps the default tick labels.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Count only the plotted column rather than every column of the frame.
    message_counts = dataframe.resample(frequency, on='Timestamp')['Content'].count()

    # Explicit figure/axes handles instead of relying on pyplot's current axes.
    fig, ax = plt.subplots(figsize=(10, 6))
    message_counts.plot(kind='bar', ax=ax)

    # Reduce the rule to its base unit: drop a leading multiplier ('2W' -> 'W')
    # and a trailing anchor ('W-MON' -> 'W'). pandas aliases are case-sensitive
    # ('MS' is month start, 'ms' is milliseconds), so the case is left alone.
    base_unit = None
    if isinstance(frequency, str):
        base_unit = frequency.lstrip('0123456789').split('-', 1)[0]

    if base_unit in ('D', 'W'):
        label_format = '%Y-%m-%d'
    elif base_unit in ('M', 'ME', 'MS'):
        label_format = '%Y-%m'
    else:
        label_format = None

    # Bar plots label every bar with the full timestamp by default; replace
    # those with the shorter date strings when a format applies.
    if label_format is not None:
        ax.set_xticks(range(len(message_counts)))
        ax.set_xticklabels(
            [stamp.strftime(label_format) for stamp in message_counts.index]
        )

    # Rotate and right-align the tick labels so neighbouring dates do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

    ax.set_title(f'Number of Messages Sent Over Time ({frequency})')
    ax.set_xlabel('Time')
    ax.set_ylabel('Number of Messages')
    fig.tight_layout()
    plt.show()


In [None]:
# Parse the timestamp strings into datetimes so the column can be resampled.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M')

# One chart per bucket size: weekly, then monthly.
# Add 'D' to the tuple to also draw the daily chart.
for bucket in ('W', 'M'):
    plot_messages(df, bucket)

# Sentiment Analysis

Test Data

In [None]:
# Load the labelled test set used to evaluate the sentiment models. Later
# cells read its 'Text' and 'Label' columns.
test_df = pd.read_excel('/content/sentiment_analysis_test.xlsx')
# Preview the first rows.
test_df.head()

## bert-base-cased-Korean-sentiment

LABEL_0: negative,
LABEL_1: positive. Predictions with a confidence score below 0.6 are relabeled as neutral (2). Obtained 70% accuracy on the test dataset.

In [None]:
# Build a Hugging Face pipeline around the Korean sentiment checkpoint.
# No task is passed, so it is presumably inferred from the model itself;
# verify if the checkpoint changes.
sentiment_pipeline = pipeline(model="WhitePeak/bert-base-cased-Korean-sentiment")

In [None]:
def get_sentiment_label(text, neutral_threshold=0.6):
    """Classify ``text`` with ``sentiment_pipeline`` and return ``(label, score)``.

    Parameters
    ----------
    text : str
        The message to classify.
    neutral_threshold : float, optional
        Predictions whose score is strictly below this value are reported as
        neutral. Defaults to 0.6.

    Returns
    -------
    tuple
        ``(label, score)``. ``label`` is a string: the id taken from the
        pipeline's label (e.g. 'LABEL_1' -> '1'), or '2' (neutral) when the
        score is below ``neutral_threshold``. ``score`` is the pipeline's
        score for its top prediction.
    """
    # truncation=True stops inputs longer than the model's maximum sequence
    # length from raising inside the model; shorter inputs are unaffected.
    result = sentiment_pipeline(text, truncation=True)[0]

    # Keep everything after the last underscore so that ids with more than one
    # digit survive intact, rather than only the final character.
    label = result['label'].rsplit('_', 1)[-1]
    score = result['score']

    # Low-confidence predictions are treated as neutral.
    if score < neutral_threshold:
        label = '2'
    return label, score

In [None]:
# Classify every entry of the 'Text' column, then expand each returned
# (label, score) tuple into the two new columns.
test_df[['predicted_label', 'sentiment_score']] = (
    test_df['Text'].map(get_sentiment_label).apply(pd.Series)
)

In [None]:
# The predicted labels are stored as strings; cast them to int so they can be
# compared with the 'Label' column.
test_df['predicted_label'] = test_df['predicted_label'].astype(int)

# Flag the rows where the prediction agrees with the reference label.
test_df['is_correct'] = test_df['predicted_label'].eq(test_df['Label'])

# Accuracy is the share of rows flagged as correct.
accuracy = test_df['is_correct'].mean()
print(f"Accuracy: {accuracy}")

## ChatGPT

We simply asked ChatGPT to label the test data as negative (0), positive (1), or neutral (2); it achieved 100% accuracy.