In [1]:
import os
from collections import defaultdict
import re
import csv

import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

In [5]:
# Directory containing the raw CSV files read by the cells below
# (relative to the notebook's working directory).
csv_dir = "../raw_data"
# os.listdir returns entries in arbitrary, filesystem-dependent order; sort the
# listing so every cell that iterates over `files` visits them in a stable order.
files = sorted(os.listdir(csv_dir))

In [6]:
# Character inventory of the raw data.
#
# For every character that appears in an answer (a_*) or a clue (q_*), tally the
# number of ROWS that contain it at least once -- not the number of occurrences --
# accumulated over all files. `a_chars` / `q_chars` hold the distinct characters seen.
a_char_counts = defaultdict(int)
q_char_counts = defaultdict(int)
a_chars = set()
q_chars = set()
for file in tqdm(files):
    df = (
        pd.read_csv(os.path.join(csv_dir, file))
        .drop(columns=["definition", "clue_number","puzzle_date","puzzle_name","source_url","source"])
        .dropna(how="any", ignore_index=True)
    )

    # Cast once and reuse the same string view for both the character set and
    # the counting, so non-string cells are handled the same way in both places.
    answers = df["answer"].astype(str)
    clues = df["clue"].astype(str)

    file_a_chars = set(''.join(answers))
    file_q_chars = set(''.join(clues))
    a_chars.update(file_a_chars)
    q_chars.update(file_q_chars)

    # Only characters present in this file can contribute a non-zero count, so
    # there is no need to re-scan for every character seen in earlier files.
    # regex=False does a plain substring test, so no escaping is required.
    for char in file_a_chars:
        a_char_counts[char] += answers.str.contains(char, regex=False).sum()

    for char in file_q_chars:
        q_char_counts[char] += clues.str.contains(char, regex=False).sum()

100%|██████████| 664/664 [00:27<00:00, 23.75it/s]


In [7]:
# Convert each {character: count} mapping into a two-column table
# ("Character", "Count") and write it to a fully-quoted CSV file.
a_char_counts_df = (
    pd.DataFrame.from_dict(a_char_counts, orient="index", columns=["Count"])
    .reset_index()
    .rename(columns={"index": "Character"})
)
q_char_counts_df = (
    pd.DataFrame.from_dict(q_char_counts, orient="index", columns=["Count"])
    .reset_index()
    .rename(columns={"index": "Character"})
)

# QUOTE_ALL wraps every field in quotes when the tables are written out.
for counts_table, out_path in (
    (a_char_counts_df, "char_counts_a.csv"),
    (q_char_counts_df, "char_counts_q.csv"),
):
    counts_table.to_csv(out_path, index=False, quoting=csv.QUOTE_ALL)

In [17]:
# Build the train / eval / test CSVs.
#
# A row is kept when:
#   * its answer consists solely of the letters A-Z,
#   * its clue consists solely of the whitelisted characters below,
#   * the clue is at most MAX_CLUE_LEN and the answer at most MAX_ANSWER_LEN
#     characters long.
# The surviving rows are shuffled with a fixed seed and split into a test set
# (first HOLDOUT_SIZE rows), an eval set (next HOLDOUT_SIZE rows) and a training
# set (everything else).
MAX_CLUE_LEN = 96
MAX_ANSWER_LEN = 32
HOLDOUT_SIZE = 50_000
SPLIT_SEED = 42  # fixed so that the shuffle, and therefore the split, is reproducible

valid = []
# Iterate in sorted order: together with the fixed seed this makes the split
# independent of the order in which the file names happen to be listed.
for file in tqdm(sorted(files)):
    raw = (
        pd.read_csv(os.path.join(csv_dir, file))
        .drop(columns=["definition", "clue_number","puzzle_date","puzzle_name","source_url","source"])
        .dropna(how="any", ignore_index=True)
    )

    answer = raw["answer"]
    clue = raw["clue"]
    # fullmatch requires the whole string to match; `match` with a trailing `$`
    # would also accept a value that ends in a newline. na=False treats any
    # non-string cell as invalid instead of producing a NaN in the mask.
    keep = (
        answer.str.fullmatch(r"[A-Z]+", na=False)
        & clue.str.fullmatch(r"[a-zA-Z0-9\s\(\),\.\?'‘’“”:\-!]+", na=False)
        & (clue.str.len() <= MAX_CLUE_LEN)
        & (answer.str.len() <= MAX_ANSWER_LEN)
    )
    valid.append(raw[keep].copy())

df = pd.concat(valid, ignore_index=True)
df = df.drop(columns=['rowid'])
# frac=1 returns every row in random order, i.e. a shuffle.
df = df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

test_df = df.iloc[:HOLDOUT_SIZE]
eval_df = df.iloc[HOLDOUT_SIZE:2 * HOLDOUT_SIZE]
train_df = df.iloc[2 * HOLDOUT_SIZE:]
train_df.to_csv("../train.csv", index=False, quoting=csv.QUOTE_ALL)
eval_df.to_csv("../eval.csv", index=False, quoting=csv.QUOTE_ALL)
test_df.to_csv("../test.csv", index=False, quoting=csv.QUOTE_ALL)

100%|██████████| 664/664 [00:02<00:00, 256.47it/s]


In [None]:
%matplotlib inline
lengths = df["clue"].str.len().tolist()
from collections import Counter
counter = Counter(lengths)

plt.bar(counter.keys(), counter.values())
plt.show()