In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
from collections import Counter
import nltk
from nltk.corpus import stopwords

# Fetch NLTK resources: the stopword corpus (used by the word-frequency
# section) and the punkt tokenizer data.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Load the training data from the processed CSV into a DataFrame
train_df = pd.read_csv(filepath_or_buffer='../data/train_processed.csv')

# 1. DATASET OVERVIEW
# Print a banner followed by size, shape, column names, per-column
# missing-value counts and dtypes of the training frame.
print("=" * 50, "DATASET OVERVIEW", "=" * 50, sep="\n")

overview_lines = (
    f"Total samples: {len(train_df)}",
    f"\nShape: {train_df.shape}",
    f"\nColumns: {train_df.columns.tolist()}",
    f"\nMissing values:\n{train_df.isnull().sum()}",
    f"\nData types:\n{train_df.dtypes}",
)
for overview_line in overview_lines:
    print(overview_line)

# 2. TEXT STATISTICS
print("\n" + "=" * 50, "TEXT STATISTICS", "=" * 50, sep="\n")

# Summary statistics for the two length columns
length_columns = ['text_length', 'word_count']
print(train_df[length_columns].describe())

# Visualize text length distribution: one histogram panel per length column.
# Each spec is (column, panel title, x-axis label, extra hist() keyword args).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
histogram_specs = [
    ('text_length', 'Distribution of Text Length (characters)', 'Character Count', {}),
    ('word_count', 'Distribution of Word Count', 'Word Count', {'color': 'coral'}),
]

for panel, (column, panel_title, x_label, extra_kwargs) in zip(axes, histogram_specs):
    panel.hist(train_df[column], bins=50, edgecolor='black', **extra_kwargs)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')

fig.tight_layout()
plt.savefig('../notebooks/text_length_distribution.png')
plt.show()

# 3. EMOTION DISTRIBUTION (TARGET VARIABLE ANALYSIS)
print("\n" + "=" * 50, "EMOTION DISTRIBUTION", "=" * 50, sep="\n")

# Row count per emotion label (value_counts sorts most frequent first)
emotion_counts = train_df['primary_emotion_name'].value_counts()
print(emotion_counts)

# Bar chart of the per-emotion counts, drawn on an explicit axes object
fig, ax = plt.subplots(figsize=(14, 6))
emotion_counts.plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Emotions in Training Data', fontsize=16)
ax.set_xlabel('Emotion', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.savefig('../notebooks/emotion_distribution.png')
plt.show()

# Class imbalance: size of the largest class relative to the smallest
print(f"\nClass imbalance ratio (max/min): {emotion_counts.max() / emotion_counts.min():.2f}")

# 4. WORD FREQUENCY ANALYSIS
print("\n" + "="*50)
print("WORD FREQUENCY ANALYSIS")
print("="*50)

# Get all words. Missing texts are dropped first: astype(str) would otherwise
# turn NaN into the literal string "nan", which then gets counted as a word.
all_text = ' '.join(train_df['text'].dropna().astype(str))
# NOTE: tokens are split on whitespace only, so any punctuation present in the
# text stays attached to the neighbouring word.
words = all_text.lower().split()

# Remove stopwords and tokens of two characters or fewer
stop_words = set(stopwords.words('english'))
words_filtered = [word for word in words if len(word) > 2 and word not in stop_words]

# Most common words
word_freq = Counter(words_filtered)
print("\nTop 20 most common words:")
for word, count in word_freq.most_common(20):
    print(f"{word}: {count}")

# 5. WORD CLOUDS BY EMOTION
print("\n" + "="*50)
print("GENERATING WORD CLOUDS")
print("="*50)

# Select top 6 emotions for word clouds (fewer if the data has fewer labels)
top_emotions = emotion_counts.head(6).index

# 2x3 grid, flattened so panels can be addressed by a single index
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for idx, emotion in enumerate(top_emotions):
    # Drop missing texts so NaN is not rendered as the word "nan"
    emotion_rows = train_df['primary_emotion_name'] == emotion
    emotion_text = ' '.join(train_df.loc[emotion_rows, 'text'].dropna().astype(str))

    axes[idx].set_title(f'{str(emotion).upper()}', fontsize=14, fontweight='bold')

    try:
        wordcloud = WordCloud(width=400, height=300,
                              background_color='white',
                              stopwords=stop_words,
                              max_words=50).generate(emotion_text)
    except ValueError:
        # WordCloud raises ValueError when no words are left to plot;
        # keep the titled panel empty instead of aborting the whole figure.
        print(f"Skipping word cloud for {emotion}: no words to plot")
        continue

    axes[idx].imshow(wordcloud, interpolation='bilinear')

# Hide the axis frame on every panel, including panels left unused when
# fewer than six emotions are available.
for panel in axes:
    panel.axis('off')

plt.tight_layout()
plt.savefig('../notebooks/emotion_wordclouds.png')
plt.show()

# 6. TEXT LENGTH BY EMOTION
print("\n" + "="*50)
print("TEXT LENGTH BY EMOTION")
print("="*50)

# Create the figure/axes once and hand the axes to pandas. Without ax=,
# DataFrame.boxplot(by=...) opens its own figure, which would leave a
# separately created plt.figure() behind as a blank extra figure.
fig, ax = plt.subplots(figsize=(14, 6))
train_df.boxplot(column='text_length', by='primary_emotion_name', ax=ax)
plt.xticks(rotation=45, ha='right')
ax.set_title('Text Length Distribution by Emotion')
# Clear the automatic "Boxplot grouped by ..." super-title added by pandas
fig.suptitle('')
ax.set_ylabel('Character Count')
plt.tight_layout()
plt.savefig('../notebooks/text_length_by_emotion.png')
plt.show()

# 7. CORRELATION ANALYSIS
print("\n" + "=" * 50, "CORRELATION ANALYSIS", "=" * 50, sep="\n")

# Pairwise correlation (pandas default method) between the length features
# and the emotion column.
# NOTE(review): 'primary_emotion' looks like an encoded class label; if it is
# a nominal id, its correlation coefficients are not meaningful - confirm.
numeric_features = ['text_length', 'word_count', 'primary_emotion']
correlation_matrix = train_df.loc[:, numeric_features].corr()
print(correlation_matrix)

# Annotated heatmap with the diverging colormap centred on zero
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.savefig('../notebooks/correlation_matrix.png')
plt.show()

# 8. SAMPLE TEXTS BY EMOTION
print("\n" + "=" * 50, "SAMPLE TEXTS BY EMOTION", "=" * 50, sep="\n")

# For the first three entries of top_emotions, list the first three texts
# carrying that label, numbered from 1.
for emotion in top_emotions[:3]:
    print(f"\n{emotion.upper()}:")
    is_emotion = train_df['primary_emotion_name'] == emotion
    samples = train_df.loc[is_emotion, 'text'].iloc[:3].tolist()
    for i, sample in enumerate(samples, start=1):
        print(f"  {i}. {sample}")

print("\n" + "=" * 50, "EDA COMPLETE", "=" * 50, sep="\n")