In [3]:
# Download the WiC dataset archive.
# -nc (no-clobber) skips the download when WiC_dataset.zip already exists, so
# re-running this cell does not leave a second copy named WiC_dataset.zip.1.

!wget -nc https://pilehvar.github.io/wic/package/WiC_dataset.zip

--2023-01-30 11:00:30--  https://pilehvar.github.io/wic/package/WiC_dataset.zip
Resolving pilehvar.github.io (pilehvar.github.io)... 185.199.111.153, 185.199.110.153, 185.199.108.153, ...
Connecting to pilehvar.github.io (pilehvar.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 275513 (269K) [application/zip]
Saving to: ‘WiC_dataset.zip’


2023-01-30 11:00:30 (8.09 MB/s) - ‘WiC_dataset.zip’ saved [275513/275513]



In [4]:
# Extract the archive into the working directory.
# -o overwrites existing files without asking; a plain `unzip` would stop at an
# interactive "replace?" prompt when the cell is re-run, and a `!` shell cell
# cannot answer it.
!unzip -o WiC_dataset.zip

Archive:  WiC_dataset.zip
   creating: dev/
  inflating: dev/dev.data.txt        
  inflating: dev/dev.gold.txt        
   creating: test/
  inflating: test/test.data.txt      
   creating: train/
  inflating: train/train.data.txt    
  inflating: train/train.gold.txt    
  inflating: README.txt              


In [5]:
import pandas as pd
from pathlib import Path

# Column names for the header-less, tab-separated WiC data file.
WIC_COLUMNS = ['target_word', 'PoS', 'index1-index2', 'example_1', 'example_2']

# Training examples: one row per (target word, sentence pair).
df_train = pd.read_csv("train/train.data.txt", sep='\t', header=None, names=WIC_COLUMNS)

# Gold labels are stored in a separate file, one label per line; they are
# attached to the rows positionally.
with Path("train/train.gold.txt").open() as gold_file:
  gold_labels = [line.rstrip('\n') for line in gold_file]

df_train['label'] = gold_labels

df_train.head()

Unnamed: 0,target_word,PoS,index1-index2,example_1,example_2,label
0,carry,V,2-1,You must carry your camping gear .,Sound carries well over water .,F
1,go,V,2-6,Messages must go through diplomatic channels .,Do you think the sofa will go through the door ?,F
2,break,V,0-2,Break an alibi .,The wholesaler broke the container loads into ...,F
3,cup,N,8-4,He wore a jock strap with a metal cup .,Bees filled the waxen cups with honey .,T
4,academy,N,1-2,The Academy of Music .,The French Academy .,F


In [6]:
# Print the column dtypes and non-null counts of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5428 entries, 0 to 5427
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   target_word    5428 non-null   object
 1   PoS            5428 non-null   object
 2   index1-index2  5428 non-null   object
 3   example_1      5428 non-null   object
 4   example_2      5428 non-null   object
 5   label          5428 non-null   object
dtypes: object(6)
memory usage: 254.6+ KB


In [7]:
# Column names for the header-less, tab-separated WiC data file.
WIC_COLUMNS = ['target_word', 'PoS', 'index1-index2', 'example_1', 'example_2']

# Development examples: one row per (target word, sentence pair).
df_dev = pd.read_csv("dev/dev.data.txt", sep='\t', header=None, names=WIC_COLUMNS)

# Gold labels are stored in a separate file, one label per line; they are
# attached to the rows positionally.
with Path("dev/dev.gold.txt").open() as gold_file:
  gold_labels = [line.rstrip('\n') for line in gold_file]

df_dev['label'] = gold_labels

df_dev.head()

Unnamed: 0,target_word,PoS,index1-index2,example_1,example_2,label
0,board,N,2-2,Room and board .,He nailed boards across the windows .,F
1,circulate,V,0-4,Circulate a rumor .,This letter is being circulated among the facu...,F
2,hook,V,0-1,Hook a fish .,"He hooked a snake accidentally , and was so sc...",T
3,recreation,N,1-9,For recreation he wrote poetry and solved cros...,Drug abuse is often regarded as a form of recr...,T
4,domesticity,N,4-6,Making a hobby of domesticity .,A royal family living in unpretentious domesti...,F


In [8]:
# Print the column dtypes and non-null counts of the development frame.
df_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 638 entries, 0 to 637
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   target_word    638 non-null    object
 1   PoS            638 non-null    object
 2   index1-index2  638 non-null    object
 3   example_1      638 non-null    object
 4   example_2      638 non-null    object
 5   label          638 non-null    object
dtypes: object(6)
memory usage: 30.0+ KB


In [9]:
import numpy as np

# Column names for the header-less, tab-separated WiC data file.
WIC_COLUMNS = ['target_word', 'PoS', 'index1-index2', 'example_1', 'example_2']

# Test examples: one row per (target word, sentence pair).
df_test = pd.read_csv("test/test.data.txt", sep='\t', header=None, names=WIC_COLUMNS)

# No gold file is read for the test split, so the label column is filled with
# NaN to keep the same columns as the train and dev frames.
df_test['label'] = np.nan

df_test.head()

Unnamed: 0,target_word,PoS,index1-index2,example_1,example_2,label
0,defeat,N,4-4,It was a narrow defeat .,The army 's only defeat .,
1,groom,V,0-1,Groom the dogs .,Sheila groomed the horse .,
2,penetration,N,1-1,The penetration of upper management by women .,"Any penetration , however slight , is sufficie...",
3,hit,V,1-3,We hit Detroit at one in the morning but kept ...,An interesting idea hit her .,
4,deliberation,N,6-2,He was a man of judicial deliberation .,A little deliberation would have deterred them .,


In [10]:
# Print the column dtypes and non-null counts of the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   target_word    1400 non-null   object 
 1   PoS            1400 non-null   object 
 2   index1-index2  1400 non-null   object 
 3   example_1      1400 non-null   object 
 4   example_2      1400 non-null   object 
 5   label          0 non-null      float64
dtypes: float64(1), object(5)
memory usage: 65.8+ KB


In [11]:
# Tag every row with the split it came from so the combined frame can be
# filtered back into train / dev / test later.
df_train['split'] = 'train'
df_dev['split'] = 'dev'
df_test['split'] = 'test'

# ignore_index=True gives the combined frame a fresh 0..N-1 index. Without it
# each split keeps its own RangeIndex, so labels such as 0 appear three times;
# label-based lookups then match several rows and the exported row-id column
# is not unique.
df_all = pd.concat([df_train, df_dev, df_test], ignore_index=True)

In [12]:
# Maps the dataset's one-letter part-of-speech tags to the word used in prompts.
POS_MAPPER = {'N': 'noun', 'V': 'verb'}


def make_prompt(sentence_1: str, sentence_2: str, target_word: str, pos: str) -> str:
  """Build the same-sense question for one pair of example sentences.

  Args:
    sentence_1: First example sentence containing the target word.
    sentence_2: Second example sentence containing the target word.
    target_word: The word whose sense is being compared.
    pos: Part-of-speech tag; must be a key of ``POS_MAPPER``.

  Returns:
    The prompt text: the question followed by the instruction to answer
    with "T" or "F".

  Raises:
    KeyError: If ``pos`` is not a key of ``POS_MAPPER``.
  """
  pos_name = POS_MAPPER[pos]
  question = (
      f'Are the given texts expressing the same sense of the {pos_name} {target_word}: '
      f'“{sentence_1}” and “{sentence_2}”? '
  )
  instruction = 'Return your answer as a letter: "T" if the sense is the same or "F" if it’s not the same.'
  return question + instruction

In [13]:
# Build one prompt per row, in row order, from the columns make_prompt needs.
prompt_inputs = zip(
    df_all['example_1'],
    df_all['example_2'],
    df_all['target_word'],
    df_all['PoS'],
)
prompts = [
    make_prompt(first, second, word, pos)
    for first, second, word, pos in prompt_inputs
]

# The list is assigned positionally; chatgpt_answer starts out as NaN in every row.
df_all['prompt'] = prompts
df_all['chatgpt_answer'] = np.nan

In [14]:
# Show the first rows of the combined frame, including the new prompt column.
df_all.head()

Unnamed: 0,target_word,PoS,index1-index2,example_1,example_2,label,split,prompt,chatgpt_answer
0,carry,V,2-1,You must carry your camping gear .,Sound carries well over water .,F,train,Are the given texts expressing the same sense ...,
1,go,V,2-6,Messages must go through diplomatic channels .,Do you think the sofa will go through the door ?,F,train,Are the given texts expressing the same sense ...,
2,break,V,0-2,Break an alibi .,The wholesaler broke the container loads into ...,F,train,Are the given texts expressing the same sense ...,
3,cup,N,8-4,He wore a jock strap with a metal cup .,Bees filled the waxen cups with honey .,T,train,Are the given texts expressing the same sense ...,
4,academy,N,1-2,The Academy of Music .,The French Academy .,F,train,Are the given texts expressing the same sense ...,


In [15]:
# Write the combined frame to a tab-separated file (the index is written as the first column).
df_all.to_csv("wic_prompts.tsv", sep="\t")

In [16]:
# Extra sentence appended to every prompt for the "explain" variant.
EXPLAIN_SUFFIX = ' Explain your answer.'

# New frame with the same rows and columns as df_all, where each prompt also
# asks for an explanation; df_all itself is left unchanged.
df_all_explain = df_all.assign(
    prompt=df_all['prompt'].map(lambda text: text + EXPLAIN_SUFFIX)
)

df_all_explain.head()

Unnamed: 0,target_word,PoS,index1-index2,example_1,example_2,label,split,prompt,chatgpt_answer
0,carry,V,2-1,You must carry your camping gear .,Sound carries well over water .,F,train,Are the given texts expressing the same sense ...,
1,go,V,2-6,Messages must go through diplomatic channels .,Do you think the sofa will go through the door ?,F,train,Are the given texts expressing the same sense ...,
2,break,V,0-2,Break an alibi .,The wholesaler broke the container loads into ...,F,train,Are the given texts expressing the same sense ...,
3,cup,N,8-4,He wore a jock strap with a metal cup .,Bees filled the waxen cups with honey .,T,train,Are the given texts expressing the same sense ...,
4,academy,N,1-2,The Academy of Music .,The French Academy .,F,train,Are the given texts expressing the same sense ...,


In [17]:
# Write the "explain" variant to a tab-separated file (the index is written as the first column).
df_all_explain.to_csv("wic_prompts_explain.tsv", sep="\t")