In [50]:
import pandas as pd

data: str = '../../data/raw/raw_extended_data/wordle_old.jsonl'
save_path: str = '../../data/processed/processed_extended_data/wordle_old_processed.jsonl'
df = pd.read_json(data, lines=True)

# This works for wordle and wordle with clue

## Filter all conversations that are successful and have an invalid format inside the assistant text

In [51]:
# Define the text to filter out
specific_text = 'INVALID_FORMAT'

# Function to check if specific_text is in any of the dictionaries in the list
def contains_specific_text(row):
      return any(specific_text in d['content'] for d in row)

In [52]:
# Filter the DataFrame
filtered_data = df[~df['chat'].apply(contains_specific_text)]

In [53]:
len(df), len(filtered_data)

(1480, 1369)

# Clean Up all episodes with repeated prompting

In [54]:
def filter_repeated_interaction(chat: list) -> list:
      cleaned_chat = []
      skip_next = False
      for turn in chat:
            if skip_next:
                  skip_next = False
                  continue
            try:
                  if turn['has_error']:
                        skip_next = True
                  else:
                     cleaned_chat.append({
                           'role': turn['role'],
                           'content': turn['content'],
                     })
            except KeyError:
                  cleaned_chat.append(
                        {
                              'role': turn['role'],
                              'content': turn['content'],
                        }
                  )
                      
      return cleaned_chat

len(filtered_data.chat.iloc[0]), len(filter_repeated_interaction(filtered_data.chat.iloc[0]))

(14, 12)

In [55]:
filtered_data.chat = filtered_data.chat.apply(filter_repeated_interaction)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.chat = filtered_data.chat.apply(filter_repeated_interaction)


## Finding:
Wordle With clue and critic has a lot of INVALID_FORMAT in its explanations. When this is used to train the model to deliver content, 
it will learn to produce only the INVALID_CONTENT token. This does not help the game to be better played

In [56]:
filtered_data = filtered_data.rename(columns={'target_word': 'target'})
filtered_data = filtered_data.drop(['target_word_difficulty', 'target_word_clue'], axis=1)
filtered_data['player'] = "Player 1"

In [57]:
filtered_data.to_json(save_path, orient='records', lines=True)

In [58]:
filtered_data.head()

Unnamed: 0,game,game_id,model,benchmark_version,experiment,episode,Aborted,Lose,Success,chat,target,main_score,player
0,wordle,1,claude-v1.3-t0.0--claude-v1.3-t0.0,v0.9,0_high_frequency_words_no_clue_no_critic,episode_0,0,1,0,"[{'role': 'user', 'content': 'You are a langua...",acute,0.0,Player 1
1,wordle,2,claude-v1.3-t0.0--claude-v1.3-t0.0,v0.9,0_high_frequency_words_no_clue_no_critic,episode_1,0,1,0,"[{'role': 'user', 'content': 'You are a langua...",blade,0.0,Player 1
2,wordle,3,claude-v1.3-t0.0--claude-v1.3-t0.0,v0.9,0_high_frequency_words_no_clue_no_critic,episode_2,0,1,0,"[{'role': 'user', 'content': 'You are a langua...",wacky,0.0,Player 1
3,wordle,4,claude-v1.3-t0.0--claude-v1.3-t0.0,v0.9,0_high_frequency_words_no_clue_no_critic,episode_3,0,1,0,"[{'role': 'user', 'content': 'You are a langua...",extol,0.0,Player 1
4,wordle,5,claude-v1.3-t0.0--claude-v1.3-t0.0,v0.9,0_high_frequency_words_no_clue_no_critic,episode_4,0,1,0,"[{'role': 'user', 'content': 'You are a langua...",polka,0.0,Player 1


In [59]:
df.head()

Unnamed: 0,game,game_id,model,benchmark_version,experiment,episode,Aborted,Lose,Success,chat,target_word,target_word_difficulty,target_word_clue,main_score
0,wordle,1,claude-v1.3-t0.0--claude-v1.3-t0.0,v0.9,0_high_frequency_words_no_clue_no_critic,episode_0,0,1,0,"[{'role': 'user', 'content': 'You are a langua...",acute,high_frequency,sharp,0.0
1,wordle,2,claude-v1.3-t0.0--claude-v1.3-t0.0,v0.9,0_high_frequency_words_no_clue_no_critic,episode_1,0,1,0,"[{'role': 'user', 'content': 'You are a langua...",blade,high_frequency,plow part,0.0
2,wordle,3,claude-v1.3-t0.0--claude-v1.3-t0.0,v0.9,0_high_frequency_words_no_clue_no_critic,episode_2,0,1,0,"[{'role': 'user', 'content': 'You are a langua...",wacky,high_frequency,amusingly eccentric,0.0
3,wordle,4,claude-v1.3-t0.0--claude-v1.3-t0.0,v0.9,0_high_frequency_words_no_clue_no_critic,episode_3,0,1,0,"[{'role': 'user', 'content': 'You are a langua...",extol,high_frequency,eulogize,0.0
4,wordle,5,claude-v1.3-t0.0--claude-v1.3-t0.0,v0.9,0_high_frequency_words_no_clue_no_critic,episode_4,0,1,0,"[{'role': 'user', 'content': 'You are a langua...",polka,high_frequency,lively dance,0.0
