# 📊 Exploratory Data Analysis (EDA) on YouTube Comments

This notebook performs exploratory data analysis on YouTube comments about Indonesia's national football team. The goal is to visualize the sentiment distribution along with related statistics: comment length, like counts per sentiment, and the channels with the most comments.

In [None]:
# Import essential libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Set plotting style: apply the matplotlib style sheet first, then set
# seaborn's default palette on top of it.
# NOTE(review): the 'seaborn-v0_8' style name requires matplotlib >= 3.6 —
# confirm against the environment this notebook runs in.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# Display settings: no limit on the number of columns shown (None),
# and at most 100 rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print('✅ Libraries imported successfully!')

In [None]:
# Load the processed dataset.
# The path is relative to the directory the notebook is run from.
PROCESSED_DATA_PATH = '../data/processed/final_processed_dataset_real.csv'
processed_df = pd.read_csv(PROCESSED_DATA_PATH)

# Fail fast with a clear message if a column used by the analysis cells of
# this notebook is missing, instead of failing later in the middle of a plot.
REQUIRED_COLUMNS = ['sentiment_auto', 'text_length', 'like_count', 'channel_title']
missing_columns = [col for col in REQUIRED_COLUMNS if col not in processed_df.columns]
if missing_columns:
    raise KeyError(f'Processed dataset is missing required columns: {missing_columns}')

print(f'📊 Processed dataset loaded with {len(processed_df)} comments.')

In [None]:
# Analyze sentiment distribution: number of comments per sentiment label.
sentiment_dist = processed_df['sentiment_auto'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also assigned to `hue`. `dodge=False` keeps one full-width bar
# per label on seaborn versions that would otherwise offset bars by hue.
sns.barplot(
    x=sentiment_dist.index,
    y=sentiment_dist.values,
    hue=sentiment_dist.index,
    palette='viridis',
    dodge=False,
    ax=ax,
)
# The hue mapping may add a legend that only repeats the x tick labels.
# Remove it if present (done this way rather than `legend=False`, which
# older seaborn versions do not accept on barplot).
if ax.get_legend() is not None:
    ax.get_legend().remove()

ax.set_title('Sentiment Distribution of YouTube Comments', fontsize=16)
ax.set_xlabel('Sentiment', fontsize=14)
ax.set_ylabel('Number of Comments', fontsize=14)
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Analyze text length distribution: histogram of the `text_length` column
# with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 6))
text_lengths = processed_df['text_length']
sns.histplot(data=text_lengths, bins=30, kde=True, color='skyblue', ax=ax)

# Labels are set after plotting so they replace seaborn's automatic ones.
ax.set_title('Text Length Distribution of Comments', fontsize=16)
ax.set_xlabel('Text Length (Characters)', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)

fig.tight_layout()
plt.show()

In [None]:
# Analyze like count by sentiment: one box per sentiment label.
fig, ax = plt.subplots(figsize=(8, 6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# column is also assigned to `hue`. `dodge=False` keeps each box centered on
# its category instead of being offset by hue level.
sns.boxplot(
    data=processed_df,
    x='sentiment_auto',
    y='like_count',
    hue='sentiment_auto',
    palette='Set2',
    dodge=False,
    ax=ax,
)
# The hue mapping may add a legend that only repeats the x tick labels.
if ax.get_legend() is not None:
    ax.get_legend().remove()

ax.set_title('Like Count by Sentiment', fontsize=16)
ax.set_xlabel('Sentiment', fontsize=14)
ax.set_ylabel('Like Count', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# Analyze comments by channel: the 10 most frequent `channel_title` values.
top_channels = processed_df['channel_title'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 6))
# Horizontal bars: counts on x, channel titles on y.
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the
# categorical (y) variable is also assigned to `hue`. `dodge=False` keeps
# one full-width bar per channel.
sns.barplot(
    x=top_channels.values,
    y=top_channels.index,
    hue=top_channels.index,
    palette='pastel',
    dodge=False,
    ax=ax,
)
# The hue mapping may add a legend that only repeats the y tick labels.
if ax.get_legend() is not None:
    ax.get_legend().remove()

ax.set_title('Top 10 Channels by Comment Count', fontsize=16)
ax.set_xlabel('Comment Count', fontsize=14)
ax.set_ylabel('Channel Title', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# Save EDA results to CSV: the three columns used in the plots of this notebook.
EDA_RESULTS_PATH = Path('../results/eda_results.csv')

# `to_csv` does not create missing folders, so make sure the output
# directory exists before writing.
EDA_RESULTS_PATH.parent.mkdir(parents=True, exist_ok=True)

eda_results = processed_df[['text_length', 'like_count', 'sentiment_auto']]
eda_results.to_csv(EDA_RESULTS_PATH, index=False)

# `as_posix()` keeps the forward-slash form of the path in the message on every OS.
print(f'✅ EDA results saved to: {EDA_RESULTS_PATH.as_posix()}')