In [None]:
import pandas as pd
import spacy
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the small English spaCy pipeline once; is_temporal_story calls `nlp`
# on every sentence it inspects.
nlp = spacy.load("en_core_web_sm")

In [None]:
# generate wrong endings
!pip install transformers

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# GPT-2 large tokenizer and LM-head model; the model is moved onto `device`.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
model = GPT2LMHeadModel.from_pretrained("gpt2-large").to(device)

# Frame whose 'story' column is fed to generate_wrong_ending.
data = pd.read_csv("temporal_roc.csv")

def generate_wrong_ending(story, prompt_for_wrong_ending):
    """Generate one sentence that continues ``story + prompt_for_wrong_ending``.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        story: Story text used as the start of the model input.
        prompt_for_wrong_ending: Text appended directly to ``story`` (no
            separator is inserted) before generation.

    Returns:
        The first sentence of the generated continuation, i.e. the text up to
        the first ``'.'`` with a ``'.'`` appended. When that sentence has fewer
        than four space-separated pieces and a non-blank second sentence
        exists, the second sentence is returned instead. Leading whitespace
        produced by the model is kept.
    """
    input_text = story + prompt_for_wrong_ending
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    # NOTE(review): max_length presumably counts the prompt tokens as well, so
    # very long stories leave little or no room for a continuation - confirm.
    # NOTE(review): do_sample is not set, so temperature presumably has no
    # effect here - confirm against the installed transformers version.
    output = model.generate(input_ids, max_length=120, num_return_sequences=1, temperature=1.0, pad_token_id=tokenizer.eos_token_id)

    # Decode only the tokens that follow the prompt. Slicing the decoded string
    # by len(input_text) is fragile: it is off whenever decoding does not
    # reproduce the prompt text character for character.
    prompt_len = input_ids.shape[-1]
    continuation = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)

    sentences = continuation.split('.')
    first_sentence = sentences[0] + '.'

    # Fall back to the second sentence only when it really exists; previously a
    # continuation with no second segment raised IndexError and an empty second
    # segment produced a bare '.'.
    too_short = len(first_sentence.split(' ')) < 4
    if too_short and len(sentences) > 1 and sentences[1].strip():
        first_sentence = sentences[1] + '.'

    return first_sentence

# Suffix appended to every story before the model continues it.
prompt_for_wrong_ending = "Unexpectedly,"

# One generated ending per story, stored in a new 'wrong_ending' column.
data['wrong_ending'] = data['story'].apply(generate_wrong_ending, args=(prompt_for_wrong_ending,))

In [None]:
# Read the three CSV files (v1, v2, v3) into separate frames.
df1, df2, df3 = (pd.read_csv(csv_path) for csv_path in ("v1.csv", "v2.csv", "v3.csv"))

In [None]:
# Stack the three frames row-wise and give the result a fresh 0..n-1 index.
df = pd.concat((df1, df2, df3), axis=0, ignore_index=True)

In [None]:
# Keep only the first row seen for each InputStoryid.
is_repeat = df.duplicated(subset=['InputStoryid'])
df = df[~is_repeat]

In [None]:
# Time-related words, compared with each token's lower-cased surface text.
_TEMPORAL_KEYWORDS = frozenset([
    'before', 'after', 'recently', 'now', 'then',
    'yesterday', 'tomorrow', 'earlier', 'later',
    'today', 'tonight', 'soon', 'nowadays',
    'currently', 'presently', 'eventually', 'ultimately',
    'suddenly', 'immediately', 'momentarily', 'previously',
    'formerly', 'periodically', 'seasonally', 'monthly',
    'weekly', 'daily', 'hourly', 'annually', 'biennially',
    'century', 'decade', 'millennium', 'future', 'past',
    'time', 'minute', 'hour', 'day', 'week', 'month', 'year',
    'morning', 'noon', 'evening', 'night', 'moment', 'instant',
    'duration', 'temporarily', 'intermittently', 'frequently',
    'always', 'never', 'sometimes', 'often', 'rarely', 'usually',
])

# Verbs about starting, stopping or lasting, compared with each token's
# lower-cased lemma. The inflected forms are kept from the original list.
_TEMPORAL_VERBS = frozenset([
    'begin', 'begins', 'began',
    'end', 'ends', 'ended',
    'start', 'starts', 'started',
    'finish', 'finishes', 'finished',
    'last', 'lasts', 'lasted',
    'continue', 'continues', 'continued',
    'commence', 'commences', 'commenced',
    'cease', 'ceases', 'ceased',
    'resume', 'resumes', 'resumed',
    'expire', 'expires', 'expired',
    'linger', 'lingers', 'lingered',
    'elapse', 'elapses', 'elapsed',
    'postpone', 'postpones', 'postponed',
    'procrastinate', 'procrastinates', 'procrastinated',
])

# Named-entity labels that count as temporal.
_TEMPORAL_ENTITY_LABELS = frozenset(["DATE", "TIME"])


def is_temporal_story(story):
    """Return True if any sentence of ``story`` contains a temporal cue.

    Each sentence is parsed with the module-level ``nlp`` pipeline and is
    considered temporal when at least one of the following holds:

    * a token's lower-cased text is a temporal keyword,
    * a token's lower-cased lemma is a temporal verb,
    * the parsed sentence has a named entity labelled ``DATE`` or ``TIME``.

    Args:
        story: An iterable of sentences (for example a DataFrame row holding
            the sentence columns) or a single string, which is treated as one
            sentence. Items that are not strings (e.g. NaN from an empty CSV
            cell, or None) are skipped.

    Returns:
        bool: True as soon as one sentence matches, otherwise False.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(story, str):
        story = [story]

    for sentence in story:
        # Missing cells are not text and cannot be parsed; ignore them.
        if not isinstance(sentence, str):
            continue

        doc = nlp(sentence)

        # Check for temporal keywords and verbs in the sentence
        if any(token.text.lower() in _TEMPORAL_KEYWORDS for token in doc):
            return True
        if any(token.lemma_.lower() in _TEMPORAL_VERBS for token in doc):
            return True

        # Check for temporal named entities
        if any(ent.label_ in _TEMPORAL_ENTITY_LABELS for ent in doc.ents):
            return True

    return False

In [None]:
# Keep the rows for which is_temporal_story flags at least one of the four
# input sentences.
sentence_columns = ['InputSentence1', 'InputSentence2', 'InputSentence3', 'InputSentence4']
temporal_mask = df[sentence_columns].apply(is_temporal_story, axis=1)
temporal_stories = df[temporal_mask]

In [None]:
def capitalize_first_letter(text):
    """Strip surrounding whitespace from ``text`` and capitalise its first character.

    Args:
        text: Value to normalise. Only ``str`` values are modified.

    Returns:
        The stripped string with its first character capitalised and the
        remaining characters unchanged. A string that is empty after stripping
        is returned as ``""``. Non-string values (``None``, NaN, numbers) are
        returned unchanged.
    """
    if not isinstance(text, str):
        return text

    stripped = text.strip()
    if not stripped:
        # Whitespace-only input is truthy before stripping; indexing
        # stripped[0] on it used to raise IndexError.
        return stripped

    return stripped[0].capitalize() + stripped[1:]

# Normalise every generated ending: strip whitespace and capitalise the first letter.
# NOTE(review): `concatenated_df` is not defined in any cell shown here - confirm
# it is created before this cell runs on a fresh kernel.
concatenated_df['wrong_ending'] = concatenated_df['wrong_ending'].apply(capitalize_first_letter)