In [1]:
import pandas as pd

data: str = '../../data/raw/raw_data_all/wordle_withcritic_all.jsonl'
save_path: str = '../../data/processed/processed_data_all/wordle_withcritic_all_processed.jsonl'
df = pd.read_json(data, lines=True)

In [2]:
columns_to_keep_p1: list = ['game', 'benchmark_version', 'game_id', 'model', 'experiment', 'episode', 'Aborted', 'Lose', 'Success', 'chat_p1', 'target', 'main_score']

columns_to_keep_p2: list = ['game', 'benchmark_version', 'game_id', 'model', 'experiment', 'episode', 'Aborted', 'Lose', 'Success', 'chat_p2', 'target', 'main_score']

df['target'] = df.target_word

In [3]:
# Create a new DataFrame with duplicated entries
df_player1 = df[columns_to_keep_p1].rename(columns={'chat_p1': 'chat'})
df_player1['player'] = 'player 1'

df_player2 = df[columns_to_keep_p2].rename(columns={'chat_p2': 'chat'})
df_player2['player'] = 'player 2'

# Concatenate both DataFrames
result_df = pd.concat([df_player1, df_player2], ignore_index=True)

# Show the resulting 
len(result_df)

10832

## Filter all conversations that are successful and have an invalid format inside the assistant text

In [4]:
# Define the text to filter out
specific_text = 'INVALID_FORMAT'

# Function to check if specific_text is in any of the dictionaries in the list
def contains_specific_text(row):
      return any(specific_text in d['content'] for d in row)

In [5]:
# Filter the DataFrame
filtered_data = result_df[~result_df['chat'].apply(contains_specific_text)]

In [6]:
def filter_repeated_interaction(chat: list) -> list:
      cleaned_chat = []
      skip_next = False
      for turn in chat:
            if skip_next:
                  skip_next = False
                  continue
            try:
                  if turn['has_error']:
                        skip_next = True
                  else:
                        cleaned_chat.append({
                              'role': turn['role'],
                              'content': turn['content'],
                        })
            except KeyError:
                  cleaned_chat.append(
                        {
                              'role': turn['role'],
                              'content': turn['content'],
                        }
                  )

      return cleaned_chat

In [7]:
filtered_data.chat = filtered_data.chat.apply(filter_repeated_interaction)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.chat = filtered_data.chat.apply(filter_repeated_interaction)


In [8]:
len(result_df), len(filtered_data)

(10832, 10310)

## Finding:
Wordle With clue and critic has a lot of INVALID_FORMAT in its explanations. When this is used to train the model to deliver content, 
it will learn to produce only the INVALID_CONTENT token. This does not help the game to be better played

In [9]:
filtered_data.to_json(save_path, orient='records', lines=True)

In [10]:
filtered_data.columns

Index(['game', 'benchmark_version', 'game_id', 'model', 'experiment',
       'episode', 'Aborted', 'Lose', 'Success', 'chat', 'target', 'main_score',
       'player'],
      dtype='object')