In [1]:
import pandas as pd
import string
from collections import Counter, defaultdict

# Translation table that deletes every ASCII punctuation character.
# Built once at import time instead of on every tokenize() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def tokenize(sentence):
    """
    Strip ASCII punctuation, lowercase the text, and split it on whitespace.

    Note: characters such as '<' and '>' are punctuation and are removed,
    so "<MASKED>" becomes the token "masked".

    Non-string input (for example None, or the NaN float that pandas yields
    for an empty CSV cell) has no text to split and produces an empty list
    instead of raising AttributeError.

    Returns a list of lowercase tokens.
    """
    if not isinstance(sentence, str):
        return []
    return sentence.translate(_PUNCTUATION_TABLE).lower().split()

def extract_contexts(tokens, window=2):
    """
    Pair every token with the tokens surrounding it.

    For each position, up to `window` tokens are taken from each side; fewer
    are taken near the start or end of the sentence.
    Returns a list of (left_context, word, right_context) tuples, where both
    contexts are tuples of tokens.
    """
    total = len(tokens)
    triples = []
    for position, word in enumerate(tokens):
        start = max(0, position - window)
        stop = min(total, position + window + 1)
        before = tuple(tokens[start:position])
        after = tuple(tokens[position + 1:stop])
        triples.append((before, word, after))
    return triples

def build_context_dict(sentences, window=2):
    """
    Count which words occur inside which contexts across the training sentences.

    The result maps (left_context, right_context) -> {word: frequency}, where
    the contexts are the token tuples produced by extract_contexts.
    """
    table = {}
    for text in sentences:
        for left, word, right in extract_contexts(tokenize(text), window):
            # Fetch (or create) the frequency table for this context pair.
            word_counts = table.setdefault((left, right), {})
            word_counts[word] = word_counts.get(word, 0) + 1
    return table

def build_left_index(context_dict):
    """
    Build an inverted index over the left contexts.

    Maps each token found in any left context to the set of full context keys
    (left_context, right_context) whose left context contains that token.
    """
    index = defaultdict(set)
    for context_key in context_dict:
        left_tokens, _right_tokens = context_key
        for left_token in left_tokens:
            index[left_token].add(context_key)
    return index

def get_most_common_word(sentences):
    """
    Return the token that occurs most often across the given sentences.

    Falls back to "the" when the sentences contain no tokens at all.
    """
    tally = Counter()
    for text in sentences:
        tally.update(tokenize(text))
    if not tally:
        return "the"
    top_word, _top_count = tally.most_common(1)[0]
    return top_word

def get_mask_context(tokens, mask_token="masked", window=2):
    """
    Locate the first occurrence of the mask token and return the contexts around it.

    Returns (left_context, right_context) as tuples of up to `window` tokens
    each; a mask at the very start or end simply yields a shorter (possibly
    empty) context. Returns (None, None) when the mask token is absent.
    """
    try:
        position = tokens.index(mask_token)
    except ValueError:
        # Mask token does not appear in this sentence.
        return None, None
    before = tokens[max(0, position - window):position]
    after = tokens[position + 1:position + window + 1]
    return tuple(before), tuple(after)

def predict_masked_word(sentence, context_dict, left_index, fallback_word, mask_token="masked", window=2):
    """
    Predict the masked word in a sentence.

    1. Exact match: if the (left_context, right_context) pair around the mask
       was seen in training, return the word seen most often in that context.
    2. Relaxed match: otherwise, look only at training contexts that left_index
       associates with a token from the test left context. Each such context is
       scored by the number of distinct tokens it shares with the test left and
       right contexts, and every word seen in it receives score * frequency.
       The word with the highest total wins.
    3. Return fallback_word when the mask token is missing or nothing matches.

    Ties in the relaxed match are broken alphabetically. Candidate contexts are
    gathered in a set, whose iteration order can change between interpreter
    runs, so relying on insertion order would make tied predictions
    irreproducible.
    """
    tokens = tokenize(sentence)
    left_context, right_context = get_mask_context(tokens, mask_token, window)

    # If mask token not found, return fallback.
    if left_context is None or right_context is None:
        return fallback_word

    # 1. Exact match (an empty candidate table falls through to relaxed matching).
    exact_candidates = context_dict.get((left_context, right_context))
    if exact_candidates:
        return max(exact_candidates, key=exact_candidates.get)

    # 2. Relaxed matching: limit the search to keys indexed under a left-context token.
    # The query-side sets are loop invariants, so build them once.
    left_tokens = set(left_context)
    right_tokens = set(right_context)

    candidate_keys = set()
    for token in left_tokens:
        candidate_keys.update(left_index.get(token, ()))

    candidate_scores = Counter()
    for train_key in candidate_keys:
        train_left, train_right = train_key
        # Score = number of distinct overlapping tokens on the left plus on the right.
        score = len(left_tokens.intersection(train_left)) + len(right_tokens.intersection(train_right))
        if score > 0:
            for word, freq in context_dict[train_key].items():
                candidate_scores[word] += score * freq

    if candidate_scores:
        # Highest score first; equal scores resolved by the word itself for determinism.
        best_word, _best_score = min(candidate_scores.items(), key=lambda item: (-item[1], item[0]))
        return best_word

    return fallback_word

# Main execution block: train on one CSV, predict masked words for another,
# print a few debug samples, and write the submission CSV.
if __name__ == "__main__":
    # Load CSV files (update paths if needed)
    train_file = 'train_set_f.csv'
    test_file = 'test_set_f.csv'

    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)

    # Build context dictionary from training sentences.
    # Ensure the column name 'SENTENCES' matches your CSV header.
    # Empty cells are read as NaN (a float, not a string); drop them so only
    # real text is tokenized.
    train_sentences = train_df['SENTENCES'].dropna().astype(str).tolist()
    context_dict = build_context_dict(train_sentences, window=2)

    # Build the left index for faster relaxed matching.
    left_index = build_left_index(context_dict)

    # Compute the most common word from training data as a fallback.
    fallback_word = get_most_common_word(train_sentences)
    print("Fallback word:", fallback_word)

    # Test rows must stay aligned with their IDS, so missing sentences are
    # replaced by an empty string (which yields the fallback word) rather than dropped.
    test_sentences = test_df['MASKED SENTENCES'].fillna('').astype(str).tolist()

    # Show at most five samples; never index past the end of a small test set.
    sample_sentences = test_sentences[:5]

    # Debug: Print a few test contexts.
    print("\n--- Debug: Sample Test Contexts ---")
    for number, sentence in enumerate(sample_sentences, start=1):
        tokens = tokenize(sentence)
        left_context, right_context = get_mask_context(tokens, mask_token="masked", window=2)
        print(f"Sentence {number}: {sentence}")
        print("Tokens:", tokens)
        print("Left Context:", left_context)
        print("Right Context:", right_context)
        print("---")

    # Predict the masked word for each test sentence.
    predictions = [
        predict_masked_word(sentence, context_dict, left_index, fallback_word, mask_token="masked", window=2)
        for sentence in test_sentences
    ]

    # Debug: Print a few predictions.
    print("\n--- Debug: Sample Predictions ---")
    for sentence, predicted in zip(sample_sentences, predictions):
        print("Test Sentence:", sentence)
        print("Predicted Word:", predicted)
        print("---")

    # Generate and save the submission file.
    submission_df = pd.DataFrame({
        'IDS': test_df['IDS'],  # Ensure this matches your CSV header for identifiers.
        'PREDICTED WORDS': predictions
    })
    submission_df.to_csv('sample_submission2.csv', index=False)
    print("\nSubmission file created: sample_submission2.csv")


Fallback word: the

--- Debug: Sample Test Contexts ---
Sentence 1: The sweat stood upon it in <MASKED> .
Tokens: ['the', 'sweat', 'stood', 'upon', 'it', 'in', 'masked']
Left Context: ('it', 'in')
Right Context: ()
---
Sentence 2: The city was named for Judge <MASKED> R McKee .
Tokens: ['the', 'city', 'was', 'named', 'for', 'judge', 'masked', 'r', 'mckee']
Left Context: ('for', 'judge')
Right Context: ('r', 'mckee')
---
Sentence 3: A <MASKED> of girls are cheering .
Tokens: ['a', 'masked', 'of', 'girls', 'are', 'cheering']
Left Context: ('a',)
Right Context: ('of', 'girls')
---
Sentence 4: Tom resigned as he wasnt <MASKED> valued at work .
Tokens: ['tom', 'resigned', 'as', 'he', 'wasnt', 'masked', 'valued', 'at', 'work']
Left Context: ('he', 'wasnt')
Right Context: ('valued', 'at')
---
Sentence 5: In the disastrous days that followed Maurice was subject to Fredericks <MASKED> .
Tokens: ['in', 'the', 'disastrous', 'days', 'that', 'followed', 'maurice', 'was', 'subject', 'to', 'frederick