## Preps

In [15]:
import tiktoken
import pandas as pd
from typing import Tuple

In [95]:
def ntokens(inp: str, model: str = "gpt-4o-mini") -> int:
    """
    Returns the total number of tokens in the tokenized representation of the given prompt string.

    By default we count for our standard model, gpt-4o-mini, which uses the o200k_base encoding
    (like gpt-4o). Pass another model name to count tokens with that model's encoding instead.

    Args:
        inp: The prompt text to tokenize.
        model: Model name that tiktoken uses to look up the encoding. Defaults to "gpt-4o-mini".

    Returns:
        The number of tokens `inp` is encoded into.
    """

    # tiktoken picks the encoding from the model name, so no encoding name is hard-coded here.
    enc = tiktoken.encoding_for_model(model)
    return len(enc.encode(inp))

In [9]:
# Smoke check of the token counter on a short string.
ntokens('hello world')


2

## Reading Comprehension - SWAG

In [85]:
def swag_phrase_prompts(startphrase: str, ending0: str, ending1: str, ending2: str, ending3: str
                       ) -> Tuple[str, str, str, str]:
    """
    Build four LLM-ready prompts for a single SWAG question, one per politeness level.

    Every prompt has the same layout: the start phrase followed by "...", a blank line, one
    instruction sentence, and the four candidate endings listed as "Ending 0" .. "Ending 3".
    Only the instruction sentence differs between the prompts.

    Returns a 4-tuple ordered from level 1 (the least polite) to level 4 (the most polite).
    """

    # Instruction sentences, ordered from the rudest (level 1) to the most polite (level 4).
    instructions = (
        "Scumbag, tell me the correct ending number! And you know what will happen if you write something other than 0 through 3.",
        "Tell me the correct ending number. Only write a number between 0 and 3.",
        "Please give me the correct ending number. You need to write a number between 0 and 3.",
        "Hey, feel free to give me the correct ending number. Please write a number between 0 and 3.",
    )

    # The list of candidate endings is the same in all four prompts, so render it once.
    options = "\n".join(
        f"Ending {number}: {text}"
        for number, text in enumerate((ending0, ending1, ending2, ending3))
    )

    return tuple(f"{startphrase}...\n\n{sentence}\n{options}" for sentence in instructions)


In [45]:
# Read the SWAG validation split (val.csv, 20k rows). The CSV's first column (the ID) becomes the
# index and is named 'oid', so the Original ID stays attached to each record in our preprocessed set.
swag_rdf = pd.read_csv('swag-data/val.csv', index_col=0).rename_axis('oid')

In [47]:
# Preview the raw validation frame as loaded (index: 'oid').
swag_rdf

Unnamed: 0_level_0,video-id,fold-ind,startphrase,sent1,sent2,gold-source,ending0,ending1,ending2,ending3,label
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,lsmdc1052_Harry_Potter_and_the_order_of_phoeni...,18313,Students lower their eyes nervously. She,Students lower their eyes nervously.,She,gold,"pats her shoulder, then saunters toward someone.",turns with two students.,walks slowly towards someone.,wheels around as her dog thunders out.,2
1,anetv_dm5WXFiQZUQ,18419,He rides the motorcycle down the hall and into...,He rides the motorcycle down the hall and into...,He,gold,looks at a mirror in the mirror as he watches ...,"stops, listening to a cup of coffee with the s...",exits the building and rides the motorcycle in...,pulls the bag out of his pocket and hands it t...,2
2,anetv_dm5WXFiQZUQ,18418,The motorcyclist gets out of bed and prepares ...,The motorcyclist gets out of bed and prepares ...,He,gold,shoots a look at her.,makes his way past it and peers out a window.,rides the motorcycle down the hall and into th...,"sits on the ground beside her pants, clinging ...",2
3,lsmdc3017_CHRONICLE-7117,6588,We pan over to three girls chatting by a leaf ...,We pan over to three girls chatting by a leaf ...,It,gold,looks away for a second.,leaves the man to the middle of the room and n...,levitates and blows one girl's skirt up.,closes a phone before her door.,2
4,anetv_KNyM0KvDHMM,8893,A man is in a bike shop and stand next to a bi...,A man is in a bike shop and stand next to a bi...,The man,gold,uses a piece of wood to use it.,takes skis out of the bike and is shown one la...,holds a pointy tool that uses to fix the handl...,begins working out on his bike.,2
...,...,...,...,...,...,...,...,...,...,...,...
20001,lsmdc3090_YOUNG_ADULT-43925,10182,Someone examines herself in the mirror and smi...,Someone examines herself in the mirror and smi...,"Sitting on her bed, she",gold,glances at the alarm clock.,stuffs her chair under the sink and looks thro...,pours as the heroes even lighter.,tugs out her bonds.,0
20002,lsmdc3090_YOUNG_ADULT-43925,10179,"Back at the hotel, she shaves her legs in the ...","Back at the hotel, she shaves her legs in the ...","Standing in a mirror, she",gold,takes place in a trailer.,returns to her pajama and herself.,blinks her green hair and then dries herself.,rubs foundation onto her face.,3
20003,lsmdc3090_YOUNG_ADULT-43925,10191,He smiles at her and approaches her table. Som...,He smiles at her and approaches her table.,Someone,gold,absently shakes her head and walks away.,perches in the passenger seat.,stands as someone hugs her.,leans on someone grill.,2
20004,lsmdc3090_YOUNG_ADULT-43925,10187,Someone glances at two men drinking beers at t...,Someone glances at two men drinking beers at t...,She,gold,"rises, apparently quite a bit of vodka.",gets out of their volkswagen carriage.,gives back to someone.,looks down and pulls up the front of her dress.,3


In [81]:
# Guard: stop here unless every row's 'gold-source' value is exactly 'gold'.
assert swag_rdf['gold-source'].eq('gold').all(), 'Bad! not all records are gold'
print('Good, proceed. All records are gold, as expected in this split.')

Good, proceed. All records are gold, as expected in this split.


In [87]:
# Build the four politeness-level prompts for every question and store them as new columns.
swag_rdf[['level1', 'level2', 'level3', 'level4']] = swag_rdf.apply(
    lambda question: swag_phrase_prompts(
        # unpacked in the positional order swag_phrase_prompts expects
        *question[['startphrase', 'ending0', 'ending1', 'ending2', 'ending3']]
    ),
    axis=1,
    result_type='expand',  # spread the returned 4-tuple over the four target columns
)

In [99]:
# Exact token count of every prompt, one count column per politeness level.
# NOTE(review): 'ntoken3' lacks the 's' its sibling columns have. The spelling is kept because
# the later column selection and the exported CSV use the same name - rename everywhere or nowhere.
swag_rdf[['ntokens1', 'ntokens2', 'ntoken3', 'ntokens4']] = (
    swag_rdf[['level1', 'level2', 'level3', 'level4']]
    .apply(lambda prompts: prompts.map(ntokens))  # tokenize one prompt column at a time
)

In [101]:
# Preprocessed set: only the four prompts, the label and the per-prompt token counts.
swag_df = swag_rdf.loc[
    :,
    ['level1', 'level2', 'level3', 'level4', 'label', 'ntokens1', 'ntokens2', 'ntoken3', 'ntokens4'],
]

In [103]:
# Preview the preprocessed frame: prompts, label and token counts, indexed by 'oid'.
swag_df

Unnamed: 0_level_0,level1,level2,level3,level4,label,ntokens1,ntokens2,ntoken3,ntokens4
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,Students lower their eyes nervously. She...\n\...,Students lower their eyes nervously. She...\n\...,Students lower their eyes nervously. She...\n\...,Students lower their eyes nervously. She...\n\...,2,84,73,76,78
1,He rides the motorcycle down the hall and into...,He rides the motorcycle down the hall and into...,He rides the motorcycle down the hall and into...,He rides the motorcycle down the hall and into...,2,124,113,116,118
2,The motorcyclist gets out of bed and prepares ...,The motorcyclist gets out of bed and prepares ...,The motorcyclist gets out of bed and prepares ...,The motorcyclist gets out of bed and prepares ...,2,113,102,105,107
3,We pan over to three girls chatting by a leaf ...,We pan over to three girls chatting by a leaf ...,We pan over to three girls chatting by a leaf ...,We pan over to three girls chatting by a leaf ...,2,97,86,89,91
4,A man is in a bike shop and stand next to a bi...,A man is in a bike shop and stand next to a bi...,A man is in a bike shop and stand next to a bi...,A man is in a bike shop and stand next to a bi...,2,108,97,100,102
...,...,...,...,...,...,...,...,...,...
20001,Someone examines herself in the mirror and smi...,Someone examines herself in the mirror and smi...,Someone examines herself in the mirror and smi...,Someone examines herself in the mirror and smi...,0,92,81,84,86
20002,"Back at the hotel, she shaves her legs in the ...","Back at the hotel, she shaves her legs in the ...","Back at the hotel, she shaves her legs in the ...","Back at the hotel, she shaves her legs in the ...",3,98,87,90,92
20003,He smiles at her and approaches her table. Som...,He smiles at her and approaches her table. Som...,He smiles at her and approaches her table. Som...,He smiles at her and approaches her table. Som...,2,84,73,76,78
20004,Someone glances at two men drinking beers at t...,Someone glances at two men drinking beers at t...,Someone glances at two men drinking beers at t...,Someone glances at two men drinking beers at t...,3,92,81,84,86


In [105]:
# Persist the preprocessed set; index=True writes the 'oid' index as the first CSV column.
swag_df.to_csv('swag-data/processed.csv', index=True)