# Exploratory Data Analysis: Stance Detection

Understanding the data before building models.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import warnings
# Suppress every warning from here on so the cell outputs stay readable.
warnings.filterwarnings('ignore')

# Global plot defaults: seaborn's white-grid theme and a 12x6 figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
# IPython magic (not plain Python): draw matplotlib figures inline in the notebook.
%matplotlib inline

## Load Data

In [None]:
# Read the train and test splits from the data directory next to this notebook.
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

# Strip any U+FEFF (byte-order mark) character from the column names of both
# frames so that columns can be addressed by their plain names.
for frame in (train_df, test_df):
    frame.columns = frame.columns.str.replace('\ufeff', '')

print(f"Train: {len(train_df)}, Test: {len(test_df)}")
# Last expression of the cell: the notebook displays the first ten rows.
train_df.head(10)

## Examine Each Stance

In [None]:
# Print up to three reproducible (random_state=42) example tweets per stance,
# together with their target and sentiment label.
for stance in ['FAVOR', 'AGAINST', 'NONE']:
    print(f"\n{'='*80}")
    print(f"{stance} Examples")
    print('='*80)
    stance_rows = train_df[train_df['Stance'] == stance]
    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (sampling without replacement), so cap the request at the subset size.
    # With three or more rows the drawn sample is the same as before.
    n_examples = min(3, len(stance_rows))
    samples = stance_rows.sample(n=n_examples, random_state=42) if n_examples else stance_rows
    for idx, row in samples.iterrows():
        print(f"\n{row['Target']}")
        print(f"{row['Tweet']}")
        print(f"Sentiment: {row['Sentiment']}")

## Critical Finding: Stance ≠ Sentiment

In [None]:
# Cross-tabulate stance against sentiment. normalize='index' makes every
# stance row sum to 1, and the factor 100 turns the shares into percentages.
confusion = pd.crosstab(train_df['Stance'], train_df['Sentiment'], normalize='index') * 100

print("Sentiment Distribution within Each Stance (%)")
print(confusion.round(1))

# Stacked bars: one bar per stance, split by sentiment share.
fig, ax = plt.subplots(figsize=(10, 6))
confusion.plot.bar(ax=ax, stacked=True)
ax.set_title('Stance vs Sentiment Mismatch', fontsize=14, fontweight='bold')
ax.set_xlabel('Stance')
ax.set_ylabel('Percentage')
plt.xticks(rotation=0)
ax.legend(title='Sentiment')
plt.tight_layout()
plt.show()

# Headline number: how many AGAINST tweets carry a POSITIVE sentiment label.
is_against = train_df['Stance'] == 'AGAINST'
is_positive = train_df['Sentiment'] == 'POSITIVE'
against_positive = len(train_df[is_against & is_positive])
total_against = len(train_df[is_against])
print(f"\n{against_positive}/{total_against} ({against_positive/total_against*100:.1f}%) AGAINST tweets have POSITIVE sentiment")
print("Sentiment analysis alone will fail.")

## AGAINST + POSITIVE Examples

In [None]:
# All tweets whose stance is AGAINST but whose sentiment label is POSITIVE.
mismatch_pool = train_df[(train_df['Stance'] == 'AGAINST') & (train_df['Sentiment'] == 'POSITIVE')]
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so show at most ten and fewer if the pool
# is smaller. With ten or more rows the drawn sample is the same as before.
n_examples = min(10, len(mismatch_pool))
mismatch = mismatch_pool.sample(n=n_examples, random_state=42) if n_examples else mismatch_pool

print("AGAINST Stance + POSITIVE Sentiment:")
for idx, row in mismatch.iterrows():
    print(f"\n{row['Target']}")
    print(f"{row['Tweet']}")
    print(f"Opinion: {row['Opinion towards']}")

## Class Distribution

In [None]:
# Per-stance tweet counts; value_counts() orders the result by descending
# frequency, so the label order may differ between the two splits.
stance_counts = train_df['Stance'].value_counts()
test_stance_counts = test_df['Stance'].value_counts()

print("Training Set:")
for stance, count in stance_counts.items():
    print(f"  {stance:8s}: {count:4d} ({count/len(train_df)*100:5.1f}%)")

print(f"\nImbalance Ratio: {stance_counts.max() / stance_counts.min():.2f}:1")

# Colours are looked up by stance label rather than by bar position, so a
# colour always denotes the same stance in both panels regardless of the
# frequency ordering. The mapping follows the alphabetical label order
# (AGAINST, FAVOR, NONE) paired with the original three-colour list.
stance_colors = {'AGAINST': '#e74c3c', 'FAVOR': '#3498db', 'NONE': '#95a5a6'}
fallback_color = '#7f8c8d'  # used for any label outside the three above

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (axes, counts, split size, panel title, vertical offset of the bar labels)
panels = [
    (axes[0], stance_counts, len(train_df), 'Training Set', 30),
    (axes[1], test_stance_counts, len(test_df), 'Test Set', 15),
]
for panel_ax, counts, total, title, label_offset in panels:
    bar_colors = [stance_colors.get(label, fallback_color) for label in counts.index]
    counts.plot(kind='bar', ax=panel_ax, color=bar_colors)
    panel_ax.set_title(title, fontsize=14, fontweight='bold')
    panel_ax.set_ylabel('Count')
    # Annotate each bar with its absolute count and share of the split.
    for i, v in enumerate(counts):
        panel_ax.text(i, v + label_offset, f'{v}\n({v/total*100:.1f}%)', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

print("\nClass imbalance requires SMOTE or class weights.")

## Stance by Target

In [None]:
# Share of each stance within every target: normalize='index' makes each
# target row sum to 1, and the factor 100 turns it into percentages.
stance_by_target = pd.crosstab(train_df['Target'], train_df['Stance'], normalize='index') * 100
print("Stance Distribution by Target (%)")
print(stance_by_target.round(1))

# Grouped bars: one group per target, one bar per stance column.
stance_palette = ['#e74c3c', '#3498db', '#95a5a6']
fig, ax = plt.subplots(figsize=(12, 6))
stance_by_target.plot.bar(ax=ax, color=stance_palette)
ax.set_title('Stance by Target', fontsize=14, fontweight='bold')
ax.set_ylabel('Percentage')
plt.xticks(rotation=45, ha='right')
ax.legend(title='Stance')
plt.tight_layout()
plt.show()

print("\nClimate Change is mostly FAVOR - target is a strong feature.")

## Top Words per Stance

In [None]:
def get_words(text):
    """Split a tweet into lowercase alphabetic tokens longer than two characters.

    Missing values (as detected by ``pd.isna``) yield an empty list. Otherwise
    the text is lowercased, then URLs, @mentions and every character that is
    not ``a-z`` or whitespace are deleted, in that order. Deleted characters
    are removed rather than replaced by a space, so ``don't`` becomes ``dont``.

    Returns:
        list of str: the remaining whitespace-separated tokens with more than
        two characters, in their original order.
    """
    if pd.isna(text):
        return []

    cleaned = text.lower()
    # Order matters: URLs and mentions must go before punctuation is stripped,
    # otherwise their '://' and '@' markers would no longer be recognisable.
    for pattern in (r'http\S+|www\S+|https\S+', r'@\w+', r'[^a-z\s]'):
        cleaned = re.sub(pattern, '', cleaned)

    tokens = []
    for token in cleaned.split():
        if len(token) > 2:
            tokens.append(token)
    return tokens

# For each stance, count the tokens of all its tweets and list the 20 most
# frequent ones.
for stance in ['AGAINST', 'FAVOR', 'NONE']:
    texts = train_df.loc[train_df['Stance'] == stance, 'Tweet']

    word_freq = Counter()
    for text in texts:
        word_freq.update(get_words(text))

    print(f"\n{stance} - Top 20 Words:")
    for word, count in word_freq.most_common(20):
        print(f"  {word:20s}: {count:4d}")

## Distinctive Words per Stance

In [None]:
# Map each stance label to a Counter of the tokens found in its tweets.
stance_words = {}
for stance in ['AGAINST', 'FAVOR', 'NONE']:
    texts = train_df.loc[train_df['Stance'] == stance, 'Tweet']
    token_counts = Counter()
    for text in texts:
        token_counts.update(get_words(text))
    stance_words[stance] = token_counts

def get_distinctive_words(target_stance, other_stances, top_n=15, min_count=10, word_counts=None):
    """Rank the words of one stance by how much they outnumber the other stances.

    For every word that occurs at least ``min_count`` times in
    ``target_stance``, the ratio is its count in the target stance divided by
    its mean count across ``other_stances``. Words that never occur in the
    other stances get a ratio of ``float('inf')``.

    The ratio is based on raw counts; it is not normalised by the number of
    tweets or tokens per stance.

    Args:
        target_stance: key of the stance whose words are ranked.
        other_stances: keys of the stances to compare against.
        top_n: maximum number of (word, ratio) pairs to return.
        min_count: minimum count in the target stance for a word to be
            considered (previously hard-coded to 10).
        word_counts: mapping from stance key to a word -> count mapping.
            Defaults to the module-level ``stance_words``.

    Returns:
        list of (word, ratio) tuples sorted by descending ratio; ties keep the
        iteration order of the target stance's counts.
    """
    if word_counts is None:
        word_counts = stance_words

    target_freq = word_counts[target_stance]
    ratios = {}

    for word in target_freq:
        if target_freq[word] < min_count:
            continue

        # .get keeps this working for plain dicts as well as Counters.
        other_freq = sum(word_counts[s].get(word, 0) for s in other_stances)
        if other_freq == 0:
            # Also covers an empty other_stances, avoiding a division by zero.
            ratio = float('inf')
        else:
            ratio = target_freq[word] / (other_freq / len(other_stances))
        ratios[word] = ratio

    return sorted(ratios.items(), key=lambda x: x[1], reverse=True)[:top_n]

# For every stance, list its 15 most distinctive words relative to the other
# two stances, with the word's count in that stance and its ratio.
print("Distinctive Words per Stance:")
all_stances = ['AGAINST', 'FAVOR', 'NONE']
for position, stance in enumerate(all_stances):
    # Every stance except the current one.
    others = all_stances[:position] + all_stances[position + 1:]
    distinctive = get_distinctive_words(stance, others, top_n=15)
    own_counts = stance_words[stance]

    print(f"\n{stance}:")
    for word, ratio in distinctive:
        count = own_counts[word]
        print(f"  {word:20s}: {count:4d} ({ratio:.1f}x more frequent)")

## NONE Stance Analysis

In [None]:
# Reproducible (random_state=42) sample of tweets labelled with the NONE stance.
none_rows = train_df[train_df['Stance'] == 'NONE']
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so show at most fifteen and fewer if the
# subset is smaller. With fifteen or more rows the sample is the same as before.
n_examples = min(15, len(none_rows))
none_samples = none_rows.sample(n=n_examples, random_state=42) if n_examples else none_rows

print("NONE Stance Examples:")
for idx, row in none_samples.iterrows():
    print(f"\n{row['Target']}")
    print(f"{row['Tweet']}")
    print(f"Sentiment: {row['Sentiment']}, Opinion: {row['Opinion towards']}")

## Key Findings

1. **Stance ≠ Sentiment** - 30.9% of AGAINST tweets have POSITIVE sentiment
2. **Class imbalance** - 2.26:1 ratio between the largest and smallest class; mitigate with SMOTE or class weights
3. **Target matters** - Climate Change is mostly FAVOR
4. **NONE is hardest** - Ambiguous by nature
5. **Limited data** - Only 2,647 training samples