In [None]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Download the complete dataset (both train and test)
newsgroups = fetch_20newsgroups(subset='all', remove=())

# Create DataFrame
df = pd.DataFrame({
    'document_id': range(len(newsgroups.data)),
    'text': newsgroups.data,
    'true_label': [newsgroups.target_names[label] for label in newsgroups.target]
})

print(f"Loaded {len(df)} documents")
print(f"\nDataset shape: {df.shape}")
print(f"\nCategories ({len(df['true_label'].unique())}): {sorted(df['true_label'].unique())}")
print(f"\nClass distribution:")
print(df['true_label'].value_counts().sort_index())
print(f"\nFirst few rows:")
print(df.head())
print(f"\nSample text from first document:")
print(df['text'].iloc[0][:500])  # First 500 characters

In [11]:
import re

print("="*70)
print("DATA EXPLORATION AND INCONSISTENCY CHECK")
print("="*70)

# 1. Basic statistics
print("\n1. BASIC STATISTICS:")
print(f"Total documents: {len(df)}")
print(f"Number of categories: {df['true_label'].nunique()}")
print(f"Categories: {sorted(df['true_label'].unique())}")

# 2. Check for null/empty values
print("\n2. NULL/EMPTY VALUES:")
print(df.isnull().sum())
print(f"Empty strings: {(df['text'].str.strip() == '').sum()}")

# 3. Text length distribution
print("\n3. TEXT LENGTH DISTRIBUTION:")
df['text_length'] = df['text'].str.len()
print(df['text_length'].describe())
print(f"Documents with < 50 characters: {(df['text_length'] < 50).sum()}")
print(f"Documents with < 100 characters: {(df['text_length'] < 100).sum()}")

# 4. Sample a few documents to see their structure
print("\n4. SAMPLE DOCUMENTS (First 3):")
for i in range(min(3, len(df))):
    print(f"\n--- Document {i} (Category: {df['true_label'].iloc[i]}) ---")
    print(f"Length: {df['text_length'].iloc[i]} characters")
    print(f"First 800 characters:")
    print(df['text'].iloc[i][:800])
    print("...")

# 5. Check for common patterns that need cleaning
print("\n5. PATTERNS DETECTED:")

# Check for email headers
has_headers = df['text'].str.contains(r'^(From|Subject|Organization):', case=False, regex=True, na=False)
print(f"Documents with email headers: {has_headers.sum()} ({has_headers.sum()/len(df)*100:.1f}%)")

# Check for quoted text
has_quotes = df['text'].str.contains(r'^>+', regex=True, flags=re.MULTILINE, na=False)
print(f"Documents with quoted text (>): {has_quotes.sum()} ({has_quotes.sum()/len(df)*100:.1f}%)")

# Check for email addresses
has_emails = df['text'].str.contains(r'\S+@\S+', regex=True, na=False)
print(f"Documents with email addresses: {has_emails.sum()} ({has_emails.sum()/len(df)*100:.1f}%)")

# Check for URLs
has_urls = df['text'].str.contains(r'http\S+|www\.\S+', regex=True, na=False)
print(f"Documents with URLs: {has_urls.sum()} ({has_urls.sum()/len(df)*100:.1f}%)")

# Check for excessive whitespace
has_excess_space = df['text'].str.contains(r'\s{3,}', regex=True, na=False)
print(f"Documents with excessive whitespace: {has_excess_space.sum()} ({has_excess_space.sum()/len(df)*100:.1f}%)")

# Check for special characters
has_special_chars = df['text'].str.contains(r'[^\w\s.,!?;:\'"()-]', regex=True, na=False)
print(f"Documents with special characters: {has_special_chars.sum()} ({has_special_chars.sum()/len(df)*100:.1f}%)")

# 6. Class balance
print("\n6. CLASS DISTRIBUTION:")
class_counts = df['true_label'].value_counts().sort_index()
print(class_counts)
print(f"\nMost common: {class_counts.max()} documents")
print(f"Least common: {class_counts.min()} documents")
print(f"Balance ratio (max/min): {class_counts.max()/class_counts.min():.2f}")

print("\n" + "="*70)
print("EXPLORATION COMPLETE")
print("="*70)

DATA EXPLORATION AND INCONSISTENCY CHECK

1. BASIC STATISTICS:
Total documents: 18846
Number of categories: 20
Categories: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

2. NULL/EMPTY VALUES:
document_id    0
text           0
true_label     0
dtype: int64
Empty strings: 0

3. TEXT LENGTH DISTRIBUTION:
count     18846.000000
mean       1902.525894
std        3984.970264
min         115.000000
25%         751.000000
50%        1175.000000
75%        1874.750000
max      160616.000000
Name: text_length, dtype: float64
Documents with < 50 characters: 0
Documents with < 100 characters: 0

4. SAMPLE DOCUMENTS (First 3):

--- Document 0 (Category: rec

  has_headers = df['text'].str.contains(r'^(From|Subject|Organization):', case=False, regex=True, na=False)


Documents with URLs: 3 (0.0%)
Documents with excessive whitespace: 15945 (84.6%)
Documents with special characters: 18843 (100.0%)

6. CLASS DISTRIBUTION:
true_label
alt.atheism                 799
comp.graphics               973
comp.os.ms-windows.misc     985
comp.sys.ibm.pc.hardware    982
comp.sys.mac.hardware       963
comp.windows.x              988
misc.forsale                975
rec.autos                   990
rec.motorcycles             996
rec.sport.baseball          994
rec.sport.hockey            999
sci.crypt                   991
sci.electronics             984
sci.med                     990
sci.space                   987
soc.religion.christian      997
talk.politics.guns          910
talk.politics.mideast       940
talk.politics.misc          775
talk.religion.misc          628
Name: count, dtype: int64

Most common: 999 documents
Least common: 628 documents
Balance ratio (max/min): 1.59

EXPLORATION COMPLETE


In [12]:
import re

def clean_newsgroup_text(text):
    """Clean newsgroup text based on identified patterns."""
    
    # 1. Remove email headers (lines starting with common patterns)
    # Common headers: From:, Subject:, Organization:, Lines:, NNTP-Posting-Host:, etc.
    lines = text.split('\n')
    cleaned_lines = []
    in_header = True
    
    for line in lines:
        # Skip header lines (lines with key: value format at the start)
        if in_header and re.match(r'^[\w-]+:', line):
            continue
        elif in_header and line.strip() == '':
            # Empty line often marks end of headers
            in_header = False
            continue
        else:
            in_header = False
            cleaned_lines.append(line)
    
    text = '\n'.join(cleaned_lines)
    
    # 2. Remove quoted text (lines starting with > or >>)
    text = re.sub(r'^>+.*$', '', text, flags=re.MULTILINE)
    
    # 3. Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    
    # 4. Remove URLs (minimal presence but good to clean)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    
    # 5. Convert to lowercase
    text = text.lower()
    
    # 6. Normalize whitespace (replace multiple spaces/newlines with single space)
    text = re.sub(r'\s+', ' ', text)
    
    # 7. Strip leading/trailing whitespace
    text = text.strip()
    
    return text

# Apply cleaning
print("Applying cleaning to all documents...")
df['text_cleaned'] = df['text'].apply(clean_newsgroup_text)

# Check results
print("\n" + "="*70)
print("CLEANING RESULTS")
print("="*70)

# Calculate new text lengths
df['cleaned_length'] = df['text_cleaned'].str.len()

print("\nText length comparison:")
print("Original:")
print(df['text_length'].describe())
print("\nCleaned:")
print(df['cleaned_length'].describe())

# Check for very short documents after cleaning
print(f"\nDocuments with < 50 characters after cleaning: {(df['cleaned_length'] < 50).sum()}")
print(f"Documents with < 100 characters after cleaning: {(df['cleaned_length'] < 100).sum()}")

# Show before/after examples
print("\n" + "="*70)
print("BEFORE/AFTER EXAMPLES")
print("="*70)

for i in [0, 1, 2]:
    print(f"\n--- Document {i} (Category: {df['true_label'].iloc[i]}) ---")
    print(f"\nORIGINAL (first 400 chars):")
    print(df['text'].iloc[i][:400])
    print(f"\nCLEANED (first 400 chars):")
    print(df['text_cleaned'].iloc[i][:400])
    print(f"\nLength: {df['text_length'].iloc[i]} → {df['cleaned_length'].iloc[i]}")

# Filter out very short documents (optional - set threshold)
min_length = 50
df_final = df[df['cleaned_length'] >= min_length].copy()

# Reset document IDs and keep only necessary columns
df_final = df_final[['true_label', 'text_cleaned']].reset_index(drop=True)
df_final['document_id'] = range(len(df_final))
df_final = df_final[['document_id', 'text_cleaned', 'true_label']]
df_final.rename(columns={'text_cleaned': 'text'}, inplace=True)

print("\n" + "="*70)
print("FINAL DATASET")
print("="*70)
print(f"Original documents: {len(df)}")
print(f"After cleaning and filtering (min_length={min_length}): {len(df_final)}")
print(f"Removed: {len(df) - len(df_final)} documents")
print(f"\nFinal shape: {df_final.shape}")
print(f"\nSample of final dataset:")
print(df_final.head())

Applying cleaning to all documents...

CLEANING RESULTS

Text length comparison:
Original:
count     18846.000000
mean       1902.525894
std        3984.970264
min         115.000000
25%         751.000000
50%        1175.000000
75%        1874.750000
max      160616.000000
Name: text_length, dtype: float64

Cleaned:
count    18846.000000
mean      1262.492041
std       3367.285109
min          0.000000
25%        359.000000
50%        627.500000
75%       1143.750000
max      82312.000000
Name: cleaned_length, dtype: float64

Documents with < 50 characters after cleaning: 164
Documents with < 100 characters after cleaning: 533

BEFORE/AFTER EXAMPLES

--- Document 0 (Category: rec.sport.hockey) ---

ORIGINAL (first 400 chars):
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
