In [None]:
from datasets import load_dataset, concatenate_datasets
import pandas as pd

# Load the 'snli' dataset; the result is indexed by split name below
snli = load_dataset('snli')

# Bind each of the three splits to its own name
train_data, validation_data, test_data = (
    snli[split_name] for split_name in ('train', 'validation', 'test')
)

# Sanity check: show the first record of every split, one per line
print(train_data[0], validation_data[0], test_data[0], sep='\n')

In [None]:
import re

# Define temporal keywords
temporal_keywords = [
    # Explicit references
    'today', 'tomorrow', 'yesterday', 'now', 'soon', 'later', 'before', 'after',
    'day', 'week', 'month', 'year', 'hour', 'minute', 'second', 'morning',
    'evening', 'night', 'noon', 'midnight', 'anniversary',
    
    # Days of the week
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
    
    # Months
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',

    # Seasons
    'spring', 'summer', 'fall', 'autumn', 'winter',

    # Periods & Eras
    'decade', 'century', 'millennium', 'epoch', 'era', 

    # General terms
    'annual', 'biannual', 'quarterly', 'hourly', 'daily', 'weekly', 'quarter',
    'monthly', 'fortnight', 'biweekly', 'bimonthly', 'semester', 'trimester',
    
    # Relative terms
    'past', 'future', 'current', 'upcoming', 'recent', 'lately', 'ago', 'in advance', 'later',
    'previous', 'next', 'moment', 'time', 'when', 'while', 'duration', 'period', 'early', 'earlier',

    # Implicit/temporal actions
    'wait', 'postpone', 'delay', 'reschedule', 'expire', 'due', 'schedule',
    'begin', 'start', 'end', 'finish', 'commence', 'conclude', 'last', 'extend',

    # Temporal transitions & connectors
    'until', 'by the time', 'as soon as', 'whenever', 'since', 'during', 'whilst',

    # Other temporal entities
    'sunset', 'sunrise', 'dusk', 'dawn', 'midday', 'eve', 'annually', 'eventually',
    'seldom', 'often', 'always', 'never', 'sometimes', 'usually', 'frequently', 
    'occasionally', 'rarely', 'just', 'once', 'still'
]

def contains_temporal_keyword(sentence, keywords=None):
    """Return True if the sentence contains any temporal keyword, else False.

    Matching is case-insensitive and limited to whole words/phrases (the
    keyword must be delimited by regex word boundaries). Case-insensitivity is
    required because the keyword list mixes lowercase entries with capitalized
    ones such as 'Monday' or 'January'; comparing those verbatim against a
    lowercased sentence would never match.

    Args:
        sentence: Text to scan. ``None`` or an empty string yields False.
        keywords: Optional iterable of keyword strings to look for. When
            omitted, the module-level ``temporal_keywords`` list is used.

    Returns:
        bool: True if at least one keyword occurs in ``sentence``.
    """
    if not sentence:
        return False
    if keywords is None:
        keywords = temporal_keywords
    # NOTE(review): ignoring case means the month keywords 'May' and 'March'
    # also match the ordinary words "may" and "march".
    # re.escape keeps every keyword literal, so a keyword containing regex
    # metacharacters cannot change the pattern's meaning.
    return any(
        re.search(r'\b' + re.escape(keyword) + r'\b', sentence, re.IGNORECASE)
        for keyword in keywords
    )

def is_temporal(example):
    """Tell whether an example's premise or hypothesis contains a temporal keyword.

    ``example`` is indexed with the keys 'premise' and 'hypothesis'; the
    premise is checked first and the hypothesis only if the premise has no hit.
    """
    texts = (example['premise'], example['hypothesis'])
    return any(contains_temporal_keyword(text) for text in texts)

# Keep only the SNLI examples that is_temporal accepts, one filtered dataset per split
train_temporal, validation_temporal, test_temporal = (
    snli[split_name].filter(is_temporal)
    for split_name in ('train', 'validation', 'test')
)

# Show the first surviving example of every split, one per line
print(train_temporal[0], validation_temporal[0], test_temporal[0], sep='\n')

In [None]:
# Merge the three filtered splits into a single dataset
all_temporal = concatenate_datasets(
    [train_temporal, validation_temporal, test_temporal]
)

# Materialise one plain dict per example, keeping only the three fields used downstream
all_temporal_list = [
    {"premise": premise, "hypothesis": hypothesis, "label": label}
    for premise, hypothesis, label in zip(
        all_temporal["premise"],
        all_temporal["hypothesis"],
        all_temporal["label"],
    )
]

def format_example(example):
    """Convert one NLI example into a three-option multiple-choice record.

    Args:
        example: Mapping with the keys "premise", "hypothesis" and "label",
            where "label" is an integer.

    Returns:
        dict: The keys "Premise", "Hypothesis", "Option A", "Option B",
        "Option C" and "Answer", in that order. The options are always
        "entailment", "neutral" and "contradiction"; "Answer" is the letter
        whose position matches the label (0 -> "A", 1 -> "B", 2 -> "C").

    Note:
        A label of -1 produces the answer "@" (the character just before "A").
        The last cell of this notebook drops rows whose answer contains "@".
        Presumably -1 marks examples without a gold label - TODO confirm
        against the dataset cards.
    """
    return {
        "Premise": example["premise"],
        "Hypothesis": example["hypothesis"],
        "Option A": "entailment",
        "Option B": "neutral",
        "Option C": "contradiction",
        # Offset from "A" by the integer label: 0 -> A, 1 -> B, 2 -> C
        "Answer": chr(ord("A") + example["label"]),
    }

# Turn every example into a multiple-choice record, then tabulate the records
formatted_data = list(map(format_example, all_temporal_list))
temporal_snli_df = pd.DataFrame(data=formatted_data)

In [None]:
# Load the 'mnli' configuration of the 'glue' dataset
mnli = load_dataset('glue', 'mnli')

# Bind the training split and both validation splits to their own names.
# NOTE: this rebinds train_data, which referred to the SNLI training split until now.
train_data, validation_matched_data, validation_mismatched_data = (
    mnli[split_name]
    for split_name in ('train', 'validation_matched', 'validation_mismatched')
)

# Sanity check: show the first record of every split, one per line
print(
    train_data[0],
    validation_matched_data[0],
    validation_mismatched_data[0],
    sep='\n',
)

In [None]:
# Keep only the MNLI examples that is_temporal accepts, one filtered dataset per split.
# NOTE: this rebinds train_temporal, which held the filtered SNLI training split until now.
train_temporal, validation_matched_temporal, validation_mismatched_temporal = (
    mnli[split_name].filter(is_temporal)
    for split_name in ('train', 'validation_matched', 'validation_mismatched')
)

# Show the first surviving example of every split, one per line
print(
    train_temporal[0],
    validation_matched_temporal[0],
    validation_mismatched_temporal[0],
    sep='\n',
)

In [None]:
# Merge the filtered training split and both filtered validation splits into one dataset
all_temporal = concatenate_datasets(
    [train_temporal, validation_matched_temporal, validation_mismatched_temporal]
)

# Materialise one plain dict per example, keeping only the three fields used downstream
all_temporal_list = [
    {"premise": premise, "hypothesis": hypothesis, "label": label}
    for premise, hypothesis, label in zip(
        all_temporal["premise"],
        all_temporal["hypothesis"],
        all_temporal["label"],
    )
]

# Turn every example into a multiple-choice record, then tabulate the records
formatted_data = list(map(format_example, all_temporal_list))
temporal_mnli_df = pd.DataFrame(data=formatted_data)

In [None]:
# Tag every row with the dataset it came from so the origin survives the merge
temporal_snli_df["Category"] = "SNLI"
temporal_mnli_df["Category"] = "MNLI"

In [None]:
# Stack the SNLI frame on top of the MNLI frame and renumber the rows from 0
df_temporal_nli = pd.concat(
    objs=[temporal_snli_df, temporal_mnli_df],
    ignore_index=True,
)

In [None]:
# Keep only the first row for each (Premise, Hypothesis) pair; later repeats are dropped.
# The surviving rows keep their existing index labels.
df_temporal_nli = df_temporal_nli.drop_duplicates(
    subset=["Premise", "Hypothesis"],
    keep="first",
)

In [None]:
# Drop rows whose Answer contains "@": format_example offsets the label from "A",
# so a label of -1 yields "@" instead of one of the option letters A/B/C.
df_temporal_nli = df_temporal_nli.loc[
    ~df_temporal_nli["Answer"].str.contains("@")
]