# eBay NER Dataset - Exploratory Data Analysis

This notebook provides a comprehensive analysis of the eBay German NER dataset to inform preprocessing decisions.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
import sys
import os
from pathlib import Path

# Add src to path for imports
sys.path.append('../src')
from data.load_data import read_tagged_train, read_listings, to_bio_sequences, load_label_list, build_bio_maps

# Plot styling: matplotlib's default style sheet plus seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# pandas display options, applied in one pass: no limit on shown columns,
# no fixed display width, and cell text capped at 50 characters
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', 50),
):
    pd.set_option(option_name, option_value)


ModuleNotFoundError: No module named 'src.data.load_data'

## 1. Load and Basic Dataset Statistics


In [None]:
# Read the tagged training data and the listing titles from the data directory
tagged_df = read_tagged_train('../data/Tagged_Titles_Train.tsv.gz')
listings_df = read_listings('../data/Listing_Titles.tsv.gz')

# Headline figures as (caption, value) pairs so they are printed uniformly below
overview_stats = [
    ("Tagged training records", tagged_df['Record Number'].nunique()),
    ("Total tokens in training", len(tagged_df)),
    ("Listings records", len(listings_df)),
    ("Categories in training", tagged_df['Category'].nunique()),
    ("Unique tokens", tagged_df['Token'].nunique()),
]

print("=== Dataset Overview ===")
for stat_name, stat_value in overview_stats:
    print(f"{stat_name}: {stat_value}")

# First ten rows of the training frame as a quick visual check
print("\n=== Training Data Sample ===")
print(tagged_df.head(10))


## 2. Title Length Analysis


In [None]:
# Analyze title lengths: one entry per record, counting its non-null Token rows
title_lengths = tagged_df.groupby('Record Number')['Token'].count()

# Length limit the titles are checked against. It is named once so the reference
# lines in both plots and the overflow report below always use the same value.
config_max = 160
mean_length = title_lengths.mean()

print("=== Title Length Statistics ===")
print(f"Mean tokens per title: {mean_length:.2f}")
print(f"Median tokens per title: {title_lengths.median():.2f}")
print(f"Max tokens per title: {title_lengths.max()}")
print(f"Min tokens per title: {title_lengths.min()}")
print(f"95th percentile: {title_lengths.quantile(0.95):.2f}")
print(f"99th percentile: {title_lengths.quantile(0.99):.2f}")

# Plot title length distribution: histogram on the left, box plot on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

ax1.hist(title_lengths, bins=50, alpha=0.7, edgecolor='black')
ax1.set_xlabel('Tokens per Title')
ax1.set_ylabel('Frequency')
ax1.set_title('Distribution of Title Lengths')
ax1.axvline(mean_length, color='red', linestyle='--', label=f'Mean: {mean_length:.1f}')
ax1.axvline(config_max, color='orange', linestyle='--', label=f'Config Max: {config_max}')
ax1.legend()

# Box plot
ax2.boxplot(title_lengths)
ax2.set_ylabel('Tokens per Title')
ax2.set_title('Title Length Box Plot')
ax2.axhline(config_max, color='orange', linestyle='--', label=f'Config Max: {config_max}')
ax2.legend()

plt.tight_layout()
plt.show()

# Titles that exceed max_length
long_titles = title_lengths[title_lengths > config_max]
# Guard the percentage so an empty dataset reports 0% instead of dividing by zero
long_share = len(long_titles) / len(title_lengths) * 100 if len(title_lengths) else 0.0
print(f"\nTitles exceeding max_length ({config_max}): {len(long_titles)} ({long_share:.2f}%)")
if len(long_titles) > 0:
    print(f"Longest title: {long_titles.max()} tokens")


## 3. Label Distribution Analysis


In [None]:
# Read the label inventory and derive the BIO label list plus both lookup maps
labels = load_label_list('../configs/labels.txt')
bio_labels, id2label, label2id = build_bio_maps(labels)

print("=== Label Analysis ===")
print(f"Original labels: {len(labels)}")
print(f"BIO labels: {len(bio_labels)}")
print(f"Labels: {labels}")

# Frequency of each value in the Tag column, most frequent first
tag_counts = tagged_df['Tag'].value_counts()
print("\n=== Tag Distribution ===")
print("Most common tags:")
print(tag_counts.head(10))

print("\nLeast common tags:")
print(tag_counts.tail(10))

# Bar chart of the tag frequencies
plt.figure(figsize=(15, 8))
tag_counts.plot.bar()
plt.title('Distribution of Tags in Training Data')
plt.xlabel('Tag')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Label imbalance: rows tagged 'O' versus every other row
total_tokens = len(tagged_df)
o_count = tag_counts.get('O', 0)
non_o_count = total_tokens - o_count

print("\n=== Label Imbalance ===")
o_share = o_count / total_tokens * 100
print(f"O (Outside) tokens: {o_count} ({o_share:.2f}%)")
entity_share = non_o_count / total_tokens * 100
print(f"Named entity tokens: {non_o_count} ({entity_share:.2f}%)")
print(f"Imbalance ratio: {o_count/non_o_count:.2f}:1")


## 4. Category Analysis


In [None]:
# Records per category: take each record's first Category value, then tally
record_category = tagged_df.groupby('Record Number')['Category'].first()
category_counts = record_category.value_counts()

print("=== Category Distribution ===")
print(category_counts)

# Bar chart of records per category
plt.figure(figsize=(10, 6))
category_counts.plot.bar()
plt.title('Distribution of Categories')
plt.xlabel('Category ID')
plt.ylabel('Number of Records')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

# Top five tags within each category, categories visited in sorted order
print("\n=== Labels by Category ===")
for cat in sorted(tagged_df['Category'].unique()):
    cat_data = tagged_df.loc[tagged_df['Category'] == cat]
    cat_tags = cat_data['Tag'].value_counts()
    n_cat_records = cat_data['Record Number'].nunique()
    print(f"\nCategory {cat} (n={n_cat_records} records):")
    print(cat_tags.head(5))


## 5. Character and Token Analysis


In [None]:
# Character-level analysis
# Missing tokens are dropped and the rest coerced to str, so the join and the
# len() calls below cannot fail on NaN values.
all_tokens = tagged_df['Token'].dropna().astype(str).tolist()
all_text = ' '.join(all_tokens)

# Character frequencies are taken from the tokens themselves, not from all_text:
# the spaces inserted by the join above are separators, and counting them would
# inflate the "Special chars" figure by roughly one per token.
token_char_counts = Counter(''.join(all_tokens))
umlaut_total = sum(token_char_counts[c] for c in 'äöüß')
digit_total = sum(n for c, n in token_char_counts.items() if c.isdigit())
special_total = sum(n for c, n in token_char_counts.items() if not c.isalnum())

print("=== Character Analysis ===")
print(f"Total characters: {len(all_text)}")
print(f"Unique characters: {len(set(all_text))}")
print(f"German umlauts (äöüß): {umlaut_total}")
print(f"Numbers: {digit_total}")
print(f"Special chars: {special_total}")

# Token length analysis
token_lengths = [len(token) for token in all_tokens]
# A token counts as "special" when it is not purely alphanumeric after removing
# hyphens and underscores (a token made only of those characters also counts).
special_token_total = sum(
    1 for t in all_tokens if not t.replace('-', '').replace('_', '').isalnum()
)
print("\n=== Token Length Analysis ===")
print(f"Mean token length: {np.mean(token_lengths):.2f}")
print(f"Max token length: {max(token_lengths)}")
print(f"Tokens with special chars: {special_token_total}")

# Plot token length distribution: histogram on the left, empirical CDF on the right
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.hist(token_lengths, bins=30, alpha=0.7, edgecolor='black')
plt.xlabel('Characters per Token')
plt.ylabel('Frequency')
plt.title('Token Length Distribution')

plt.subplot(1, 2, 2)
plt.hist(token_lengths, bins=30, alpha=0.7, edgecolor='black', cumulative=True, density=True)
plt.xlabel('Characters per Token')
plt.ylabel('Cumulative Probability')
plt.title('Token Length CDF')

plt.tight_layout()
plt.show()


## 6. BIO Sequence Analysis


In [None]:
def _entity_span_lengths(bio_tags):
    """Return the token length of every entity span in one BIO tag sequence.

    A ``B-`` tag closes any open span and opens a new one. An ``I-`` tag
    extends the open span, or opens one if none is open. Any other tag
    closes the open span. Entity types are not compared, so an ``I-`` tag
    always extends whatever span is currently open.
    """
    lengths = []
    open_length = 0
    for tag in bio_tags:
        if tag.startswith('B-'):
            if open_length:
                lengths.append(open_length)
            open_length = 1
        elif tag.startswith('I-'):
            open_length += 1
        else:  # O
            if open_length:
                lengths.append(open_length)
            open_length = 0
    # A span still open at the end of the sequence is counted too
    if open_length:
        lengths.append(open_length)
    return lengths


# Convert to BIO sequences for analysis
bio_sequences = to_bio_sequences(tagged_df)

print("=== BIO Sequence Analysis ===")
print(f"Total sequences: {len(bio_sequences)}")

# Analyze sequence lengths
seq_lengths = [len(seq['tokens']) for seq in bio_sequences]
print(f"Mean sequence length: {np.mean(seq_lengths):.2f}")
print(f"Max sequence length: {max(seq_lengths)}")

# Analyze label patterns: flatten every sequence's BIO tags into one list
all_bio_labels = [tag for seq in bio_sequences for tag in seq['bio_labels']]

bio_counts = Counter(all_bio_labels)
print("\n=== BIO Label Distribution ===")
print("Most common BIO labels:")
for label, count in bio_counts.most_common(10):
    print(f"  {label}: {count}")

# Analyze entity spans. The per-sequence work lives in the helper above so this
# cell no longer rebinds the notebook-level `labels` list or leaves `tokens`
# behind as a global.
entity_spans = [
    span_length
    for seq in bio_sequences
    for span_length in _entity_span_lengths(seq['bio_labels'])
]

if entity_spans:
    # Count each bucket once instead of re-scanning the list per printed figure
    single_token_spans = sum(1 for s in entity_spans if s == 1)
    long_spans = sum(1 for s in entity_spans if s > 5)

    print("\n=== Entity Span Analysis ===")
    print(f"Total entity spans: {len(entity_spans)}")
    print(f"Mean span length: {np.mean(entity_spans):.2f}")
    print(f"Max span length: {max(entity_spans)}")
    print(f"Spans of length 1: {single_token_spans} ({single_token_spans/len(entity_spans)*100:.1f}%)")
    print(f"Spans of length >5: {long_spans} ({long_spans/len(entity_spans)*100:.1f}%)")


## 7. Data Quality Assessment


In [None]:
# Check for data quality issues
print("=== Data Quality Assessment ===")

# Missing values
print("Missing values per column:")
print(tagged_df.isnull().sum())

# Empty tokens
empty_tokens = tagged_df['Token'].str.strip() == ''
print(f"\nEmpty tokens: {empty_tokens.sum()}")

# Empty tags (continuation tags). A blank field can reach the frame either as ''
# or as NaN depending on how the file was parsed, and NaN never compares equal
# to '', so NaN is folded into '' first; otherwise blank tags loaded as NaN
# would be reported as 0.
empty_tags = tagged_df['Tag'].fillna('').str.strip() == ''
print(f"Empty tags (continuation): {empty_tags.sum()}")

# Inconsistent record lengths
# NOTE(review): this counts every record whose row count differs from the most
# common row count. If titles naturally vary in length most records will be
# flagged -- confirm this is the intended check.
record_lengths = tagged_df.groupby('Record Number').size()
inconsistent_records = record_lengths[record_lengths != record_lengths.mode().iloc[0]]
print(f"\nRecords with inconsistent token counts: {len(inconsistent_records)}")

# Check for potential annotation errors: sequences that are all 'O' or have no 'O' at all
records_without_entities = sum(
    1 for seq in bio_sequences if all(label == 'O' for label in seq['bio_labels'])
)
records_only_entities = sum(
    1 for seq in bio_sequences if all(label != 'O' for label in seq['bio_labels'])
)
print("\n=== Potential Issues ===")
print(f"Records with no named entities: {records_without_entities}")
print(f"Records with only named entities: {records_only_entities}")

# Show the first three sequences, truncated to ten tokens/labels each
print("\n=== Sample Records ===")
for i, seq in enumerate(bio_sequences[:3]):
    print(f"\nRecord {i+1}:")
    print(f"  Tokens: {seq['tokens'][:10]}{'...' if len(seq['tokens']) > 10 else ''}")
    print(f"  Labels: {seq['bio_labels'][:10]}{'...' if len(seq['bio_labels']) > 10 else ''}")
    print(f"  Length: {len(seq['tokens'])}")


## 8. Preprocessing Recommendations


In [None]:
print("=== Preprocessing Recommendations ===")

# Max length recommendation
p95_length = title_lengths.quantile(0.95)
p99_length = title_lengths.quantile(0.99)
config_max = 160

# Vectorised count, computed once (the builtin sum() iterates the Series in Python)
n_titles = len(title_lengths)
n_over_max = int((title_lengths > config_max).sum())
over_max_share = n_over_max / n_titles * 100 if n_titles else 0.0

print(f"Current config max_length: {config_max}")
print(f"95th percentile: {p95_length:.1f}")
print(f"99th percentile: {p99_length:.1f}")
print(f"Titles exceeding config: {n_over_max} ({over_max_share:.2f}%)")

if p95_length <= config_max:
    print(f"✓ Config max_length={config_max} covers 95% of data")
else:
    # Round up: truncating a fractional percentile would recommend a limit
    # that sits just below the 95th percentile.
    print(f"⚠ Consider increasing max_length to {int(np.ceil(p95_length))} to cover 95% of data")

# Label imbalance; with no non-O tokens the ratio is reported as infinite
imbalance_ratio = o_count / non_o_count if non_o_count else float('inf')
print(f"\nLabel imbalance ratio: {imbalance_ratio:.2f}:1 (O:NE)")
if imbalance_ratio > 10:
    print("⚠ High label imbalance - consider class weights")
else:
    print("✓ Label imbalance is manageable")

# Rare labels
rare_labels = tag_counts[tag_counts < 10]
print(f"\nRare labels (<10 occurrences): {len(rare_labels)}")
if len(rare_labels) > 0:
    print(f"Rare labels: {list(rare_labels.index)}")
    print("⚠ Consider grouping rare labels or using class weights")

# Train/val split recommendation. The validation size is truncated first and the
# training size is the remainder, so the two parts always add up to the total.
total_records = len(bio_sequences)
val_records = int(total_records * 0.1)
train_records = total_records - val_records
n_categories = len(category_counts)
val_per_category = val_records // n_categories if n_categories else 0
print(f"\nTotal records for training: {total_records}")
print(f"Recommended train/val split: {train_records}/{val_records}")
print(f"Records per category in validation: ~{val_per_category}")

# Heuristic: categories count as balanced when the smallest one holds at least
# 80% as many records as the largest. The threshold is a judgement call; the
# previous check (exactly two categories) said nothing about the distribution.
balance_tolerance = 0.8
categories_balanced = bool(
    n_categories > 0
    and category_counts.min() >= balance_tolerance * category_counts.max()
)

print("\n=== Summary ===")
print(f"✓ Dataset size: {total_records} records, {len(tagged_df)} tokens")
print(f"✓ Label vocabulary: {len(bio_labels)} BIO labels")
print(f"✓ Max sequence length: {max(seq_lengths)} tokens")
print(f"✓ Categories: {n_categories} (balanced: {categories_balanced})")
print("✓ Ready for preprocessing pipeline")
