In [45]:
# Install nest_asyncio if not already installed
# !pip install nest_asyncio

import nest_asyncio
nest_asyncio.apply()

from wikipedia_histories import get_history
from wikipedia_histories import to_df
import pandas as pd
import io
import csv

# Title of the Wikipedia article whose revision history is analysed in this notebook.
article_title = "Refugiados"
# Fetch the article's revision history from the de.wikipedia.org domain.
# NOTE(review): presumably this performs network requests and returns a list of
# revision objects (later cells pass it to `to_df` and iterate over it) — confirm
# against the wikipedia_histories documentation.
history = get_history(article_title, domain="de.wikipedia.org")

def fix_unterminated_quotes(text):
    """Close dangling double quotes, one line at a time.

    For every newline-separated line, the number of double quotes that are not
    written as a backslash-escaped quote is counted; when that number is odd, a
    single closing quote is appended to the line. Lines are re-joined with
    newlines, so the line structure of ``text`` is preserved.
    """
    def _balanced(row):
        # Quotes preceded by a backslash do not open or close a field.
        unescaped = row.count('"') - row.count('\\"')
        return row + '"' if unescaped % 2 else row

    return '\n'.join(_balanced(row) for row in text.split('\n'))

def preprocess_history_data(history_data):
    """Preprocess history data to fix common CSV parsing issues.

    Parameters:
    history_data: Either a list, whose entries are converted with ``str()`` and
        joined with newlines, or any other object, which is converted with
        ``str()`` as a whole.

    Returns:
    str: The text with its double quotes rewritten and every line containing an
        odd number of unescaped quotes closed by ``fix_unterminated_quotes``.
    """
    # Imported locally on purpose: this cell's import block does not import `re`
    # (only a later cell does), so on a freshly restarted kernel the module-level
    # name is undefined and this fallback would die with a NameError.
    import re

    # Convert to string format if it's a list of objects
    if isinstance(history_data, list):
        # Join with newlines to create a CSV-like string
        history_str = "\n".join(str(entry) for entry in history_data)
    else:
        history_str = str(history_data)

    # Turn CSV-style doubled quotes ("") into backslash-escaped quotes (\").
    history_str = history_str.replace('""', '\\"')
    # Double every quote that is not directly preceded by another quote and is
    # either not followed by a quote or not preceded by a backslash.
    history_str = re.sub(r'(?<!")("(?!")|(?<!\\)")', r'""', history_str)

    # Fix unterminated quotes
    return fix_unterminated_quotes(history_str)

# Build `history_df` from `history` through a cascade of fallbacks. Each nested
# `try` is only reached when the strategy before it raised an exception:
#   1. `to_df(history)` with default settings
#   2. `to_df` with explicit CSV quoting options
#   3. text preprocessing followed by `pd.read_csv`
#   4. attribute-by-attribute extraction from the history items
#   5. an empty DataFrame
try:
    # Try with default settings
    print("Attempting default parsing...")
    history_df = to_df(history)
    print("Default parsing succeeded")
except Exception as e:
    print(f"Error with default parsing: {e}")
    
    try:
        # Try with custom parameters
        # NOTE(review): it is not visible here whether `to_df` accepts these
        # csv-style keyword arguments — confirm against wikipedia_histories.
        print("Attempting custom parsing...")
        history_df = to_df(history, quoting=csv.QUOTE_ALL, escapechar='\\', doublequote=True)
    except Exception as e:
        print(f"Error with custom parsing: {e}")
        
        try:
            # Preprocess the data to handle quote issues
            print("Preprocessing and parsing data...")
            history_str = preprocess_history_data(history)
            
            # Try parsing with more flexible options
            history_df = pd.read_csv(
                io.StringIO(history_str),
                quoting=csv.QUOTE_NONE,  # Try with no quoting
                escapechar='\\',
                doublequote=False,
                on_bad_lines='warn'  # Warn but don't fail on bad lines
            )
        except Exception as e:
            print(f"Advanced parsing failed: {e}")
            
            # Try one more approach - directly extract the data
            try:
                print("Attempting direct extraction...")
                # Extract data directly from the history objects
                data = []
                for item in history:
                    try:
                        # Assuming each history item has these attributes
                        # Adjust these based on the actual structure
                        # Missing attributes fall back to '' via getattr's default.
                        # NOTE(review): this fallback yields a 'timestamp' column and no
                        # 'text' column, whereas the table displayed below has 'time' and
                        # 'text' — later cells will find no revision text on this path.
                        entry = {
                            'revid': getattr(item, 'revid', ''),
                            'timestamp': getattr(item, 'timestamp', ''),
                            'user': getattr(item, 'user', ''),
                            'comment': getattr(item, 'comment', '')
                        }
                        data.append(entry)
                    except Exception as inner_e:
                        # A failing item is reported and skipped; the loop continues.
                        print(f"Error extracting item: {inner_e}")
                
                history_df = pd.DataFrame(data)
            except Exception as e:
                print(f"All parsing methods failed: {e}")
                # Create empty DataFrame with expected columns as last resort
                history_df = pd.DataFrame(columns=["revid", "timestamp", "user", "comment"])

# Alias used by all later cells; `history_table` and `history_df` are the same object.
history_table = history_df
# NOTE(review): this line sits outside the try/except cascade, so it prints
# "Successfully created" even when every strategy failed and the table is empty.
print(f"Successfully created DataFrame with {len(history_table)} rows")

Attempting default parsing...
Default parsing succeeded
Successfully created DataFrame with 33 rows


In [None]:
# history_table["text"] = history_table["text"].apply(lambda x: x if isinstance(x, str) else "")

In [46]:
history_table

Unnamed: 0,title,time,revid,kind,user,comment,rating,text
0,Refugiados,2025-03-26 12:13:18,254558729,False,Rüdiger Überall,[[Hilfe:Zusammenfassung und Quellen#Auto-Zusam...,,Refugiados (spanisch: Flüchtlinge) ist ein Den...
1,Refugiados,2025-03-26 12:34:19,254559326,False,Rüdiger Überall,,,Refugiados (spanisch: Flüchtlinge) ist ein Den...
2,Refugiados,2025-03-26 12:35:33,254559367,True,Bahnmoeller,Bahnmoeller verschob die Seite [[Flüchtlinge (...,,Refugiados (spanisch: Flüchtlinge) ist ein Den...
3,Refugiados,2025-03-26 12:38:37,254559440,False,Bahnmoeller,/* Beschreibung */,,Refugiados (spanisch: Flüchtlinge) ist ein Den...
4,Refugiados,2025-03-26 13:36:35,254561071,True,Invisigoth67,form,,Refugiados (spanisch: Flüchtlinge) ist ein Den...
5,Refugiados,2025-03-26 15:36:47,254564789,False,Rüdiger Überall,/* Beschreibung */,,Refugiados (spanisch: Flüchtlinge) ist ein Den...
6,Refugiados,2025-03-26 16:26:28,254566384,False,Rüdiger Überall,,,Refugiados (spanisch: Flüchtlinge) ist ein Den...
7,Refugiados,2025-03-26 16:36:28,254566674,False,Bahnmoeller,,,Refugiados (spanisch: Flüchtlinge) ist ein Den...
8,Refugiados,2025-03-26 19:51:46,254571173,True,Aka,/* Beschreibung */ Leerzeichen nach Punkt eing...,,Refugiados (spanisch: Flüchtlinge) ist ein Den...
9,Refugiados,2025-03-27 09:15:11,254586768,False,Rüdiger Überall,/* Beschreibung */ Prototyp u. endgültige Vers...,,Refugiados (spanisch: Flüchtlinge) ist ein Den...


In [None]:
# Extract the text content for each revision from history_table
import difflib
import pickle
import os
import html
from tqdm import tqdm

# Add installation commands for required packages
# Uncomment and run these lines if you're getting IProgress errors
# !pip install --upgrade jupyter
# !pip install --upgrade ipywidgets
# !jupyter nbextension enable --py widgetsnbextension

# Mapping of revision ID -> revision text, filled from the 'text' column below.
revision_texts = {}

print("Extracting text content for each revision...")
# Walk the table row by row with a progress bar sized to the number of rows.
for index, row in tqdm(history_table.iterrows(), total=len(history_table)):
    rev_id = row['revid']
    try:
        if 'text' not in row or not pd.notna(row['text']):
            # Missing column or missing value: keep the key, store an empty string.
            print(f"No text available for revision {rev_id}")
            revision_texts[rev_id] = ""
        else:
            revision_texts[rev_id] = row['text']
    except Exception as e:
        # Any failure while reading the row also degrades to an empty string.
        print(f"Error extracting text for revision {rev_id}: {e}")
        revision_texts[rev_id] = ""

print(f"Successfully extracted text for {len(revision_texts)} revisions")

# Persist the mapping next to the notebook so it does not have to be rebuilt.
cache_path = "revision_texts_cache.pkl"
with open(cache_path, "wb") as f:
    pickle.dump(revision_texts, f)

print(f"Saved revision texts to {cache_path}")


In [None]:
# Enhanced function to visualize Wikipedia versioning with inline deletion display
from IPython.display import HTML
import re
import hashlib
import colorsys
import html
import difflib  # Make sure difflib is imported

def visualize_wiki_versions_with_deletions(revision_indices, word_level=True, verbose=False):
    """
    Visualize Wikipedia versioning with each revision's contributions colored by revision ID,
    including inline strikethrough for deleted text.

    The function reads the module-level ``history_table`` DataFrame (columns ``revid``,
    ``user``, ``text`` and, optionally, ``time``). When a revision has no text in the
    table it falls back to the module-level ``revision_texts`` dict, and to an empty
    string if that is unavailable too.

    The text of the last selected revision is rendered. Every token carries the
    background color of the selected revision that introduced it (baseline tokens stay
    uncolored). Text removed between two consecutive selected revisions is shown as a
    strikethrough span placed at the nearest span boundary; its placement is heuristic.

    Parameters:
    revision_indices (iterable of int): Positional indices into history_table to compare
        sequentially (a list or a range both work). Duplicates are dropped, keeping the
        first occurrence.
    word_level (bool): If True, diff and attribute word by word. If False, deletions are
        detected line by line and attribution is done character by character.
    verbose (bool): If True, print additional information about each revision and the
        generated HTML.

    Returns:
    None: Displays the colored HTML output
    """
    # De-duplicate BEFORE counting so that e.g. [3, 3] is rejected instead of being
    # rendered as a one-revision "comparison"; list() also accepts ranges/generators.
    revision_indices = list(dict.fromkeys(revision_indices))

    if len(revision_indices) < 2:
        print("Need at least two revisions to compare")
        return

    # Validate indices and collect per-revision metadata
    rev_ids = []
    users = []
    timestamps = []
    has_time_column = 'time' in history_table.columns

    for idx in revision_indices:
        if not 0 <= idx < len(history_table):
            print(f"Invalid index: {idx}")
            return
        rev_ids.append(history_table['revid'].iloc[idx])
        users.append(history_table['user'].iloc[idx])
        timestamps.append(history_table['time'].iloc[idx] if has_time_column else 'Unknown')

    # The first selected revision is the baseline; its text is never highlighted.
    baseline_rev_id = rev_ids[0]

    if verbose:
        for i, (rev_id, user, timestamp) in enumerate(zip(rev_ids, users, timestamps)):
            print(f"V{i+1}: Rev {rev_id} by {user} at {timestamp}")

    # ------------------------------------------------------------------ helpers

    def generate_distinct_colors(n_colors):
        """Generate n distinct colors that are maximally separated in hue space."""
        colors = []
        for i in range(n_colors):
            # Use golden ratio conjugate for better distribution
            hue = (i * 0.618033988749895) % 1
            saturation = 0.7 + (i % 3) * 0.1  # Vary saturation slightly
            value = 0.85 + (i % 2) * 0.1  # Vary brightness slightly

            r, g, b = colorsys.hsv_to_rgb(hue, saturation, value)
            colors.append("#{:02x}{:02x}{:02x}".format(int(r*255), int(g*255), int(b*255)))
        return colors

    def translucent(hex_color):
        """Convert '#rrggbb' into the 20%-opacity rgba() string used for backgrounds."""
        return (
            f"rgba({int(hex_color[1:3], 16)}, {int(hex_color[3:5], 16)}, "
            f"{int(hex_color[5:7], 16)}, 0.2)"
        )

    def char_offsets(tokens):
        """Return offsets where offsets[k] is the character offset at which token k starts."""
        offsets = [0]
        for tok in tokens:
            offsets.append(offsets[-1] + len(tok))
        return offsets

    def line_level_changes(prev_text, current_text):
        """Return (deleted, added, position) tuples from a line-by-line diff.

        ``position`` is a character offset into ``current_text``: just after the added
        line for a replacement, or where the line was removed for a pure deletion. It is
        approximate because every line break is counted as one character.
        """
        # Differ interleaves '? ' hint lines between a '- ' line and its '+ ' partner;
        # dropping them lets the look-ahead below actually see the replacement.
        diff = [
            entry
            for entry in difflib.Differ().compare(prev_text.splitlines(), current_text.splitlines())
            if not entry.startswith('? ')
        ]
        changes = []
        offset = 0  # running character offset into current_text
        k = 0
        while k < len(diff):
            entry = diff[k]
            content = entry[2:]
            if entry.startswith('- '):
                if k + 1 < len(diff) and diff[k + 1].startswith('+ '):
                    # A removed line directly followed by an added line is a replacement.
                    added = diff[k + 1][2:]
                    changes.append((content, added, offset + len(added)))
                    offset += len(added) + 1
                    k += 2
                else:
                    changes.append((content, '', offset))
                    k += 1
            else:
                # '  ' (unchanged) and '+ ' (inserted) lines both exist in current_text.
                offset += len(content) + 1
                k += 1
        return changes

    def nearest_occurrence_end(text, needle, expected_start):
        """Return the offset just past the occurrence of needle closest to expected_start.

        Returns None when needle does not occur in text.
        """
        best_start = None
        start = text.find(needle)
        while start != -1:
            if best_start is None or abs(start - expected_start) < abs(best_start - expected_start):
                best_start = start
            elif start > expected_start:
                break  # every further occurrence is even farther away
            start = text.find(needle, start + 1)
        return None if best_start is None else best_start + len(needle)

    def render_span(segment, rev):
        """Render one run of tokens that were all introduced by revision ``rev``."""
        bg_color = "transparent"
        if rev != baseline_rev_id:  # Don't highlight the oldest revision
            bg_color = translucent(rev_colors.get(rev, "#cccccc"))
        return (
            f'<span class="text" style="color: #000000; background-color: {bg_color};" '
            f'title="Added by Rev {rev}">{html.escape(segment)}</span>'
        )

    # ------------------------------------------------------------ colors & text

    # One color per selected revision
    rev_colors = dict(zip(rev_ids, generate_distinct_colors(len(rev_ids))))

    # Get the revision texts
    revision_texts_dict = {}
    for idx, rev_id in zip(revision_indices, rev_ids):
        try:
            # Try to get text from the dataframe first
            if 'text' in history_table.columns and pd.notna(history_table['text'].iloc[idx]):
                revision_texts_dict[rev_id] = history_table['text'].iloc[idx]
            # Fall back to the revision_texts dictionary if available
            elif rev_id in revision_texts and revision_texts[rev_id]:
                revision_texts_dict[rev_id] = revision_texts[rev_id]
            else:
                print(f"Warning: No text available for revision {rev_id}")
                revision_texts_dict[rev_id] = ""
        except Exception as e:
            # The lookup that just failed must not be repeated here (it could raise
            # again, e.g. KeyError, or NameError when the cache cell was never run).
            print(f"Error retrieving text for revision {rev_id}: {e}")
            revision_texts_dict[rev_id] = ""

    # ------------------------------------------- attribution + deletion tracking

    # (?s) lets '.' match newlines too; without it every line break is silently
    # dropped from the rendered text in character-level mode.
    token_pattern = r'\w+|\s+|[^\w\s]' if word_level else r'(?s).'

    # Deletions/replacements for every consecutive revision pair, in diff order.
    all_replacements = []

    # Start from the baseline: every token is attributed to it.
    tokens = re.findall(token_pattern, revision_texts_dict[baseline_rev_id])
    attributions = [baseline_rev_id] * len(tokens)

    for prev_rev_id, current_rev_id in zip(rev_ids, rev_ids[1:]):
        new_tokens = re.findall(token_pattern, revision_texts_dict[current_rev_id])
        opcodes = difflib.SequenceMatcher(None, tokens, new_tokens).get_opcodes()

        # Carry attributions forward: unchanged tokens keep their author, new or
        # replacing tokens belong to the current revision, deleted tokens vanish.
        new_attributions = []
        for op, i1, i2, j1, j2 in opcodes:
            if op == 'equal':
                new_attributions.extend(attributions[i1:i2])
            elif op == 'replace' or op == 'insert':
                new_attributions.extend([current_rev_id] * (j2 - j1))

        if word_level:
            # Reuse the same opcodes; positions are converted from token indices to
            # character offsets because they are later compared against text offsets.
            offsets = char_offsets(new_tokens)
            changes = []
            for op, i1, i2, j1, j2 in opcodes:
                if op == 'replace':
                    changes.append((''.join(tokens[i1:i2]), ''.join(new_tokens[j1:j2]), offsets[j2]))
                elif op == 'delete':
                    changes.append((''.join(tokens[i1:i2]), '', offsets[j1]))
        else:
            changes = line_level_changes(
                revision_texts_dict[prev_rev_id], revision_texts_dict[current_rev_id]
            )

        for deleted_content, added_content, position in changes:
            all_replacements.append({
                'deleted': deleted_content,
                'added': added_content,
                'prev_rev_id': prev_rev_id,
                'current_rev_id': current_rev_id,
                'position': position  # character offset in the current revision's text
            })

        tokens, attributions = new_tokens, new_attributions

    # After the loop these describe the final selected revision.
    final_tokens = tokens
    final_token_attributions = attributions

    # ------------------------------------------------------------ HTML rendering

    html_content = """
    <style>
    .wiki-version {
        font-family: monospace;
        white-space: pre-wrap;
        line-height: 1.5;
        font-size: 14px;
        padding: 10px;
        border: 1px solid #ddd;
        border-radius: 5px;
        background-color: #fff;
    }
    .wiki-version span.text {
        border-radius: 2px;
        color: #000000;
    }
    .wiki-version span.text:hover {
        outline: 1px dotted #888;
    }
    .wiki-version span.deletion {
        text-decoration: line-through;
        border-radius: 2px;
        margin-right: 2px;
        color: #000000;
    }
    .revision-legend {
        margin-top: 15px;
        display: flex;
        flex-wrap: wrap;
        gap: 8px;
    }
    .revision-legend .item {
        display: flex;
        align-items: center;
        gap: 5px;
        border: 1px solid #ddd;
        padding: 3px 8px;
        border-radius: 3px;
    }
    .revision-legend .color {
        width: 15px;
        height: 15px;
        border-radius: 3px;
    }
    </style>
    <div class="wiki-version">
    """

    # Group consecutive tokens that share an attribution into segments.
    segments = []  # list of (revision id, [tokens])
    for token, attribution in zip(final_tokens, final_token_attributions):
        if segments and segments[-1][0] == attribution:
            segments[-1][1].append(token)
        else:
            segments.append((attribution, [token]))

    # Render one span per segment and remember, for every segment start (in raw-text
    # characters), where its span starts in the HTML string. Deletions are only ever
    # inserted at these boundaries, so they never land inside a tag.
    position_mapping = {}
    span_parts = []
    html_position = 0
    text_position = 0
    for rev, segment_tokens in segments:
        segment_text = ''.join(segment_tokens)
        span_html = render_span(segment_text, rev)

        position_mapping[text_position] = html_position
        span_parts.append(span_html)
        html_position += len(span_html)
        text_position += len(segment_text)

    # Final position mapping for the end of the text
    position_mapping[text_position] = html_position

    final_html = ''.join(span_parts)
    final_text = ''.join(final_tokens)

    # Work out where each deletion should be shown
    replacement_insertions = []

    for replacement in all_replacements:
        deleted_text = replacement['deleted']
        added_text = replacement['added']
        del_rev = replacement['current_rev_id']

        # Offset recorded at diff time, clamped to the final text. It refers to the
        # revision in which the change happened, so it is only an estimate here.
        insertion_pos = min(replacement['position'], len(final_text))

        if added_text:
            # Prefer the occurrence of the replacing text closest to the estimate
            # instead of blindly taking the first occurrence in the article.
            found_end = nearest_occurrence_end(
                final_text, added_text, max(0, insertion_pos - len(added_text))
            )
            if found_end is not None:
                insertion_pos = found_end

        # Snap to the closest span boundary
        closest_pos = min(position_mapping, key=lambda known: abs(known - insertion_pos))
        html_insert_pos = position_mapping[closest_pos]

        # Prepare the deletion HTML with the color of the replacing revision
        del_color = rev_colors.get(del_rev, "#cccccc")
        deletion_html = f'<span class="deletion" style="text-decoration-color: {del_color};" title="Deleted by Rev {del_rev}">{html.escape(deleted_text)}</span>'

        replacement_insertions.append((html_insert_pos, deletion_html))

    # Sort insertions by position in reverse order so we don't affect earlier positions
    replacement_insertions.sort(key=lambda x: x[0], reverse=True)

    # Apply all insertions
    for pos, html_snippet in replacement_insertions:
        final_html = final_html[:pos] + html_snippet + final_html[pos:]

    html_content += final_html + '</div>'

    # Add a legend with one entry per selected revision
    html_content += '<div class="revision-legend">'

    for rev_id, user, timestamp in zip(rev_ids, users, timestamps):
        rev_color = rev_colors[rev_id]

        # User names come from Wikipedia and may contain markup characters, so they
        # (and the timestamp) are escaped before being embedded in the HTML.
        safe_user = html.escape(str(user))
        safe_timestamp = html.escape(str(timestamp))

        # Special styling for baseline
        if rev_id == baseline_rev_id:
            label = f"Rev {rev_id} ({safe_user}) - {safe_timestamp} - BASELINE"
            style = f"border: 2px solid {rev_color}"
        else:
            label = f"Rev {rev_id} ({safe_user}) - {safe_timestamp}"
            style = f"background-color: {translucent(rev_color)}; border: 1px solid {rev_color}"

        html_content += f'''
        <div class="item">
            <div class="color" style="{style}"></div>
            <div>{label}</div>
        </div>
        '''

    html_content += '</div>'

    # Display the HTML
    display(HTML(html_content))

    if verbose:
        print(html_content)
        print("\nColor coding:")
        print("- Background color shows which revision added the text")
        print("- Strikethrough color shows which revision deleted the text")
        print("- Baseline text has no background color")
        print("- All text is black for better readability")
        print(f"\nFound {len(all_replacements)} deletions/replacements between revisions")

    return None


In [None]:
# Example usage - compare multiple sequential revisions with inline deletions
example_indices = range(20, 28)

# Guard on the highest index actually used; the table must contain that row.
if len(history_table) > max(example_indices):
    print("Visualizing multiple revisions with inline deletions:")
    visualize_wiki_versions_with_deletions(example_indices, verbose=False)
else:
    print(
        f"Skipping visualization: need at least {max(example_indices) + 1} revisions, "
        f"found {len(history_table)}"
    )

# Print the raw text of the last compared revision and the one after it,
# skipping rows that do not exist in a shorter history.
for i in [ 27, 28]:
    if i < len(history_table):
        print(history_table.iloc[i]["text"])
