# **Kenza Bouqdir - LAB05**

In [1]:
import pandas as pd
import spacy
import bisect
from nltk.tree import Tree
import numpy as np
import warnings
import os
import requests
from io import StringIO
import re

In [2]:
# Install necessary packages if not available
try:
    import transformers
    transformers_available = True
except ImportError:
    print("Installing transformers library...")
    !pip install -q transformers
    import transformers
    transformers_available = True

In [3]:
# Install AllenNLP and necessary dependencies
try:
    import allennlp
    from allennlp.predictors import Predictor
    from allennlp.data.tokenizers import SpacyTokenizer
    allennlp_available = True
except ImportError:
    print("Installing AllenNLP...")
    !pip install -q allennlp==2.10.1 allennlp-models==2.10.1
    try:
        import allennlp
        from allennlp.predictors import Predictor
        from allennlp.data.tokenizers import SpacyTokenizer
        allennlp_available = True
    except ImportError:
        print("Failed to install AllenNLP. Using basic coreference resolution.")
        allennlp_available = False

Installing AllenNLP...
[31mERROR: Ignored the following versions that require a different python version: 0.2.0 Requires-Python ==3.6[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement torch<1.13.0,>=1.10.0 (from allennlp) (from versions: 1.13.0, 1.13.1, 2.0.0, 2.0.1, 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.6.0, 2.7.0)[0m[31m
[0m[31mERROR: No matching distribution found for torch<1.13.0,>=1.10.0[0m[31m
[0mFailed to install AllenNLP. Using basic coreference resolution.


In [4]:
# Load SpaCy model
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("SpaCy model not found. Downloading now...")
    os.system("python -m spacy download en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

In [5]:
# GAP dataset URLs - using direct URLs instead of relying on local files
GAP_URLS = {
    'training': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv',
    'testing': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv',
    'validation': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv'
}

In [6]:
def download_gap_dataset(url):
    """
    Download GAP dataset from GitHub if local files are unavailable
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if download succeeded
        return pd.read_csv(StringIO(response.text), sep='\t')
    except Exception as e:
        print(f"Error downloading dataset: {e}")
        return create_sample_dataset()

In [7]:
def load_datasets():
    """
    Load the GAP datasets for coreference resolution tasks.
    Returns training, testing, and validation dataframes.
    """
    print("Loading datasets...")
    datasets = {}

    # Try loading from local files first
    for name, url in GAP_URLS.items():
        local_path = f'GapData/gap-{name.split("_")[0]}.tsv'
        try:
            # Try local file first
            datasets[name] = pd.read_csv(local_path, sep='\t')
            print(f"{name.capitalize()} dataset loaded from local file. Shape: {datasets[name].shape}")
        except FileNotFoundError:
            # If not found, download from GitHub
            print(f"Downloading {name} dataset from GitHub...")
            datasets[name] = download_gap_dataset(url)
            print(f"{name.capitalize()} dataset shape: {datasets[name].shape}")

    return datasets['training'], datasets['testing'], datasets['validation']


In [8]:
def create_sample_dataset():
    """Create a small sample dataset for demonstration when files are not available"""
    print("Creating sample dataset for demonstration...")
    data = {
        'ID': [f'sample-{i}' for i in range(1, 11)],
        'Text': [
            'John saw Mike at the store. He was buying groceries.',
            'Mary met Susan after school. She had a new book to share.',
            'The professor praised the student because he had solved the difficult problem.',
            'Sarah and Rebecca went to the park. She brought a frisbee.',
            'The lawyer consulted with the client before she presented the case.',
            'The doctor told the patient that he needed to rest.',
            'When Tom met Jake at the conference, he was very excited.',
            'The mother told her daughter that she should study more.',
            'After the meeting, the manager asked the assistant if she could prepare the report.',
            'The cat chased the mouse until it escaped under the sofa.'
        ],
        'Pronoun': ['He', 'She', 'he', 'She', 'she', 'he', 'he', 'she', 'she', 'it'],
        'Pronoun-offset': [25, 33, 47, 39, 48, 37, 45, 42, 70, 31],
        'A': ['John', 'Mary', 'professor', 'Sarah', 'lawyer', 'doctor', 'Tom', 'mother', 'manager', 'cat'],
        'A-offset': [0, 0, 4, 0, 4, 4, 5, 4, 19, 4],
        'A-coref': [True, True, True, True, True, False, True, False, True, False],
        'B': ['Mike', 'Susan', 'student', 'Rebecca', 'client', 'patient', 'Jake', 'daughter', 'assistant', 'mouse'],
        'B-offset': [9, 11, 24, 10, 26, 16, 15, 20, 36, 14],
        'B-coref': [False, False, False, False, False, True, False, True, False, True],
        'URL': [''] * 10
    }
    return pd.DataFrame(data)

In [9]:
def map_offset_to_token(tokens, offset):
    """
    Maps a character offset to its token index.

    Args:
        tokens: List of SpaCy tokens
        offset: Character offset to map

    Returns:
        Index of the token containing the offset, or None if not found
    """
    # Create a list of token start positions
    starts = [token.idx for token in tokens]

    if not starts or offset < starts[0]:
        return None

    # Use binary search to find the token
    position = bisect.bisect_right(starts, offset) - 1

    # Verify the offset is within the token's range
    if position >= 0 and position < len(tokens):
        token = tokens[position]
        if token.idx <= offset < token.idx + len(token.text):
            return position

    return None


In [10]:
# Extract preceding tokens
def extract_preceding(tokens, offset, k):
    """
    Extract tokens that appear before the current position.

    Args:
        tokens: List of tokens
        offset: Current token position
        k: Number of tokens to extract

    Returns:
        List of preceding tokens, padded with None if needed
    """
    start = max(0, offset - k)
    preceding = tokens[start:offset]
    return [None] * (k - len(preceding)) + preceding

In [11]:
# Extract following tokens
def extract_following(tokens, offset, k):
    """
    Extract tokens that appear after the current position.

    Args:
        tokens: List of tokens
        offset: Current token position
        k: Number of tokens to extract

    Returns:
        List of following tokens, padded with None if needed
    """
    following = tokens[offset:offset + k]
    return following + [None] * (k - len(following))


In [12]:
# Convert SpaCy dependency tree to NLTK Tree
def spacy_to_nltk_tree(token):
    """
    Recursively converts a SpaCy dependency tree to NLTK Tree format.

    Args:
        token: SpaCy token

    Returns:
        NLTK Tree
    """
    if list(token.children):
        return Tree(f"{token.dep_}:{token.text}", [spacy_to_nltk_tree(child) for child in token.children])
    else:
        return token.text

In [13]:
# Generate a parse tree visualization
def visualize_parse_tree(text):
    """
    Generate and display an NLTK parse tree for a text.

    Args:
        text: Text to parse
    """
    doc = nlp(text)
    print(f"Parse tree for: '{text}'")

    for sent in doc.sents:
        # Find the root
        root = None
        for token in sent:
            if token.dep_ == "ROOT":
                root = token
                break

        if root:
            tree = spacy_to_nltk_tree(root)
            print(tree)
        else:
            print("Could not find root in sentence.")


In [14]:
# Extract comprehensive linguistic features
def extract_features(text, char_offset):
    """
    Extract linguistic features for a token at the given character offset.

    Args:
        text: Input text
        char_offset: Character offset of the token of interest

    Returns:
        Dictionary of features
    """
    doc = nlp(text)
    token_idx = map_offset_to_token(list(doc), char_offset)

    if token_idx is None:
        print(f"Warning: Could not find token at offset {char_offset} in text: '{text}'")
        return None

    # Get token and its sentence
    token = doc[token_idx]
    sentence = None
    for sent in doc.sents:
        if sent.start <= token_idx < sent.end:
            sentence = sent
            break

    if not sentence:
        print(f"Warning: Could not find sentence containing token at position {token_idx}")
        return None

    # Calculate position in sentence
    position_in_sentence = token_idx - sentence.start
    sentence_tokens = [t.text for t in sentence]

    # Extract features
    features = {
        "mention": token.text,
        "head": token.head.text,
        "head_pos": token.head.pos_,
        "sentence_first": sentence[0].text,
        "sentence_last": sentence[-1].text,
        "preceding_2": extract_preceding(sentence_tokens, position_in_sentence, 2),
        "preceding_5": extract_preceding(sentence_tokens, position_in_sentence, 5),
        "following_2": extract_following(sentence_tokens, position_in_sentence + 1, 2),
        "following_5": extract_following(sentence_tokens, position_in_sentence + 1, 5),
        "all_tokens": sentence_tokens,
        "pos": token.pos_,
        "dep": token.dep_,
        "is_pronoun": token.pos_ == "PRON",
        "is_entity": token.ent_type_ != "",
        "entity_type": token.ent_type_ if token.ent_type_ else None
    }

    return features


In [15]:
# AllenNLP-based coreference resolution
def predict_with_allennlp(text, pronoun_offset, entity_a_offset, entity_b_offset):
    """
    Use AllenNLP for coreference resolution.

    Args:
        text: Input text
        pronoun_offset: Character offset of the pronoun
        entity_a_offset: Character offset of entity A
        entity_b_offset: Character offset of entity B

    Returns:
        'A', 'B', or None depending on coreference
    """
    # Load the pretrained model (will download if not present)
    try:
        predictor = Predictor.from_path(
            "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz"
        )

        # Run the model to get clusters
        result = predictor.predict(document=text)
        clusters = result.get("clusters", [])

        # Process the document with SpaCy for character-to-token mapping
        doc = nlp(text)

        # Find token indices for offsets
        pronoun_idx = map_offset_to_token(list(doc), pronoun_offset)
        entity_a_idx = map_offset_to_token(list(doc), entity_a_offset)
        entity_b_idx = map_offset_to_token(list(doc), entity_b_offset)

        if None in (pronoun_idx, entity_a_idx, entity_b_idx):
            print("Warning: Could not map all offsets to tokens")
            return None

        # Find which cluster contains our pronoun
        pronoun_cluster = None
        for cluster in clusters:
            for span in cluster:
                if span[0] <= pronoun_idx <= span[1]:
                    pronoun_cluster = cluster
                    break
            if pronoun_cluster:
                break

        if not pronoun_cluster:
            return None

        # Check if entity A or B is in the same cluster
        a_in_cluster = any(span[0] <= entity_a_idx <= span[1] for span in pronoun_cluster)
        b_in_cluster = any(span[0] <= entity_b_idx <= span[1] for span in pronoun_cluster)

        if a_in_cluster and not b_in_cluster:
            return "A"
        elif b_in_cluster and not a_in_cluster:
            return "B"
        elif a_in_cluster and b_in_cluster:
            # Both in cluster, use distance heuristic
            return "A" if abs(pronoun_idx - entity_a_idx) < abs(pronoun_idx - entity_b_idx) else "B"
        else:
            return None

    except Exception as e:
        print(f"Error using AllenNLP: {e}")
        return None


In [16]:
# Feature-based coreference resolution as fallback
def resolve_with_features(text, pronoun_offset, entity_a_offset, entity_b_offset):
    """
    Rule-based approach using SpaCy features when AllenNLP is unavailable.

    Args:
        text: Input text
        pronoun_offset: Character offset of pronoun
        entity_a_offset: Character offset of entity A
        entity_b_offset: Character offset of entity B

    Returns:
        'A', 'B', or None based on rule-based analysis
    """
    # Extract features for each mention
    p_features = extract_features(text, pronoun_offset)
    a_features = extract_features(text, entity_a_offset)
    b_features = extract_features(text, entity_b_offset)

    if not all([p_features, a_features, b_features]):
        print("Warning: Could not extract features for all mentions")
        return None

    doc = nlp(text)
    p_token = None
    a_token = None
    b_token = None

    for token in doc:
        if token.idx == pronoun_offset:
            p_token = token
        elif token.idx == entity_a_offset:
            a_token = token
        elif token.idx == entity_b_offset:
            b_token = token

    # Get token indices
    token_list = list(doc)
    p_idx = token_list.index(p_token) if p_token in token_list else -1
    a_idx = token_list.index(a_token) if a_token in token_list else -1
    b_idx = token_list.index(b_token) if b_token in token_list else -1

    if -1 in (p_idx, a_idx, b_idx):
        print("Warning: Could not find all tokens in document")
        return None

    # Rule 1: Gender agreement (for gendered pronouns)
    pronoun_lower = p_features["mention"].lower()

    # Check if both entities are in the same sentence as the pronoun
    same_sentence_a = any(sent.start <= p_idx < sent.end and sent.start <= a_idx < sent.end for sent in doc.sents)
    same_sentence_b = any(sent.start <= p_idx < sent.end and sent.start <= b_idx < sent.end for sent in doc.sents)

    # Rule 2: Syntactic role (subjects are more likely to be antecedents)
    a_is_subject = a_token.dep_ in ["nsubj", "nsubjpass"] if a_token else False
    b_is_subject = b_token.dep_ in ["nsubj", "nsubjpass"] if b_token else False

    # Rule 3: Distance (closer mentions are more likely to be antecedents)
    a_distance = abs(p_idx - a_idx)
    b_distance = abs(p_idx - b_idx)

    # Rule 4: Sentence recency (more recent sentences are preferred)
    a_sentence_idx = -1
    b_sentence_idx = -1
    p_sentence_idx = -1

    for i, sent in enumerate(doc.sents):
        if sent.start <= p_idx < sent.end:
            p_sentence_idx = i
        if sent.start <= a_idx < sent.end:
            a_sentence_idx = i
        if sent.start <= b_idx < sent.end:
            b_sentence_idx = i

    a_sentence_distance = abs(p_sentence_idx - a_sentence_idx)
    b_sentence_distance = abs(p_sentence_idx - b_sentence_idx)

    # Combine rules with weights
    score_a = 0
    score_b = 0

    # Syntax weight
    if a_is_subject:
        score_a += 2
    if b_is_subject:
        score_b += 2

    # Same sentence weight
    if same_sentence_a:
        score_a += 1.5
    if same_sentence_b:
        score_b += 1.5

    # Sentence distance weight (if not in same sentence)
    if not same_sentence_a:
        score_a -= a_sentence_distance * 0.5
    if not same_sentence_b:
        score_b -= b_sentence_distance * 0.5

    # Token distance weight (only if in same sentence)
    if same_sentence_a:
        score_a -= a_distance * 0.1
    if same_sentence_b:
        score_b -= b_distance * 0.1

    # Return the entity with the higher score
    if score_a > score_b:
        return "A"
    elif score_b > score_a:
        return "B"
    else:
        # Tiebreaker: Return closest entity
        return "A" if a_distance <= b_distance else "B"


In [17]:
# Main coreference resolution function
def resolve_coreference(text, pronoun_offset, entity_a_offset, entity_b_offset):
    """
    Resolve coreference using the best available method.

    Args:
        text: Input text
        pronoun_offset: Character offset of pronoun
        entity_a_offset: Character offset of entity A
        entity_b_offset: Character offset of entity B

    Returns:
        'A', 'B', or None depending on which entity the pronoun refers to
    """
    # Try AllenNLP first if available
    if allennlp_available:
        result = predict_with_allennlp(text, pronoun_offset, entity_a_offset, entity_b_offset)
        if result:
            return result
        print("AllenNLP coreference resolution failed, falling back to feature-based approach")

    # Fall back to feature-based approach
    return resolve_with_features(text, pronoun_offset, entity_a_offset, entity_b_offset)


In [18]:
# Evaluate coreference resolution
def evaluate_coref_resolution(dataset, max_samples=10):
    """
    Evaluate coreference resolution on the dataset.

    Args:
        dataset: DataFrame with coreference annotations
        max_samples: Maximum number of samples to evaluate

    Returns:
        Accuracy and detailed results
    """
    correct = 0
    count = 0
    results = []

    for idx, row in dataset.head(max_samples).iterrows():
        try:
            text = row["Text"]
            pronoun_offset = int(row["Pronoun-offset"])
            entity_a_offset = int(row["A-offset"])
            entity_b_offset = int(row["B-offset"])

            # Ground truth
            gold = "A" if row["A-coref"] else "B" if row["B-coref"] else None

            # Prediction
            pred = resolve_coreference(text, pronoun_offset, entity_a_offset, entity_b_offset)

            # Check if correct
            is_correct = (pred == gold)
            if is_correct:
                correct += 1
            count += 1

            # Store result for detailed analysis
            results.append({
                "id": row["ID"],
                "pronoun": row["Pronoun"],
                "entity_a": row["A"],
                "entity_b": row["B"],
                "gold": gold,
                "pred": pred,
                "correct": is_correct
            })

        except Exception as e:
            print(f"Error processing example {idx}: {e}")

    # Calculate accuracy
    accuracy = (correct / count) * 100 if count > 0 else 0
    print(f"Accuracy: {accuracy:.2f}% ({correct}/{count})")

    return accuracy, results

In [19]:
def main():
    print("Improved Coreference Resolution Lab")

    # Load datasets
    training_data, testing_data, validation_data = load_datasets()

    # Show a sample
    print("\nSample data:")
    print(training_data.head(3))

    # Test feature extraction
    print("\nTesting feature extraction:")
    example = "John saw Mary at the park yesterday. She waved at him."
    pronoun_offset = example.find("She")
    features = extract_features(example, pronoun_offset)

    if features:
        print(f"Text: {example}")
        print(f"Pronoun: {features['mention']}")
        print(f"Previous 2 tokens: {features['preceding_2']}")
        print(f"Next 2 tokens: {features['following_2']}")
        print(f"Full sentence: {' '.join(features['all_tokens'])}")

    # Visualize parse tree
    print("\nVisualizing parse tree:")
    sentence = "John gave Mary a book because he thought she would enjoy it."
    visualize_parse_tree(sentence)

    # Test coreference resolution with a simple example
    print("\nTesting coreference resolution:")
    test_example = "John met Mike at the store. He was buying groceries."
    test_pronoun_offset = test_example.find("He")
    test_john_offset = test_example.find("John")
    test_mike_offset = test_example.find("Mike")

    test_result = resolve_coreference(test_example, test_pronoun_offset, test_john_offset, test_mike_offset)
    print(f"Example: {test_example}")
    print(f"Prediction: 'He' refers to {'John' if test_result == 'A' else 'Mike' if test_result == 'B' else 'neither'}")

    # Evaluate on test dataset
    print("\nEvaluating on test dataset (first 10 samples)...")
    accuracy, results = evaluate_coref_resolution(testing_data, max_samples=10)

    # Print detailed results
    print("\nDetailed results:")
    for i, res in enumerate(results):
        print(f"{i+1}. {res['pronoun']} refers to {res['entity_a'] if res['pred'] == 'A' else res['entity_b'] if res['pred'] == 'B' else 'neither'}")
        print(f"   Gold: {res['entity_a'] if res['gold'] == 'A' else res['entity_b'] if res['gold'] == 'B' else 'neither'}")
        print(f"   Result: {'✓' if res['correct'] else '✗'}")

    # Report on library availability
    print("\nLibrary status:")
    print(f"- AllenNLP available: {allennlp_available}")
    print(f"- Transformers available: {transformers_available}")

if __name__ == "__main__":
    main()

Improved Coreference Resolution Lab
Loading datasets...
Downloading training dataset from GitHub...
Training dataset shape: (2000, 11)
Downloading testing dataset from GitHub...
Testing dataset shape: (2000, 11)
Downloading validation dataset from GitHub...
Validation dataset shape: (454, 11)

Sample data:
       ID                                               Text Pronoun  \
0  test-1  Upon their acceptance into the Kontinental Hoc...     His   
1  test-2  Between the years 1979-1981, River won four lo...     him   
2  test-3  Though his emigration from the country has aff...      He   

   Pronoun-offset             A  A-offset  A-coref                   B  \
0             383     Bob Suter       352    False              Dehner   
1             430        Alonso       353     True  Alfredo Di St*fano   
2             312  Ali Aladhadh       256     True              Saddam   

   B-offset  B-coref                                           URL  
0       366     True    http://en.wik