In [1]:
import pandas as pd
import unicodedata
import re

# 1. Dictionary words

In [2]:
# Read the raw word list, one entry per line, straight into a Series.
# Plain file reading replaces the earlier pd.read_csv call: the CSV parser's
# default NA handling turns real words such as "null" and "nan" into missing
# values, which the following .dropna() then silently threw away.
# "utf-8-sig" reads ordinary UTF-8 and also swallows a leading byte-order mark.
with open("datasets/Words.txt", encoding="utf-8-sig") as words_file:
    word_list = pd.Series([line.rstrip("\n") for line in words_file], dtype="object")
# Blank lines fail both checks below, so they are filtered out as well.
word_list = word_list[
    word_list.str.islower() # delete proper nouns
    & word_list.str.isalpha() # delete things with punctuation
]

In [3]:
# Peek at ten randomly chosen words that survived the filters.
word_list.sample(n=10)

124069           englyns
309350           predoom
286338        passometer
445850           vagally
357860     semiexecutive
299851       planetogeny
69211              choko
380574           squaddy
106928       diplocardia
90437     cryobiological
Name: 0, dtype: object

In [7]:
# Gather every distinct character used anywhere in the word list, then show
# them as a single sorted string.
char_set = set("".join(word_list))
"".join(sorted(char_set))

'abcdefghijklmnopqrstuvwxyz'

In [43]:
# Write the cleaned words one per line, with no header row and no index column.
# header=False is the documented way to suppress the header; the previous
# header=None only worked because None happens to be falsy.
word_list.to_csv("datasets/cleaned/words.csv", header=False, index=False)

# 2. Baseball players

In [9]:
# Keep only the columns needed from the people table, discard any row with a
# missing value in one of them, and switch to snake_case column names.
cols = ["nameFirst", "nameLast", "birthYear", "birthCountry", "playerID"]
renamed = ["first_name", "last_name", "birth_year", "birth_country", "id"]
column_map = dict(zip(cols, renamed))
people_raw = pd.read_csv("datasets/People.csv")
players = people_raw[cols].dropna().rename(columns=column_map)

In [10]:
def replace_diacritics(name_ser):
    """Strip diacritical marks from every string in a Series.

    Names often contain accented letters (e.g. an e with an accent) that are a
    nuisance for the model, so each one is reduced to its unaccented base
    letter. Every value is decomposed with Unicode NFD normalisation, which
    splits an accented letter into the base letter plus combining marks, and
    every character in category "Mn" (nonspacing mark) is then dropped.
    Letters that have no NFD decomposition are returned unchanged.

    Parameters
    ----------
    name_ser : pandas.Series
        Series of strings. Missing values (NaN / None) are allowed.

    Returns
    -------
    pandas.Series
        Series with the same index as the input. Missing values are passed
        through untouched rather than raising TypeError.
    """
    def strip_marks(text):
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(c for c in decomposed if unicodedata.category(c) != "Mn")

    # na_action="ignore" skips missing entries so one NaN cannot abort the run.
    return name_ser.map(strip_marks, na_action="ignore")

In [11]:
# Add accent-free, lower-cased copies of both name columns to `players`
# as "edited_first" and "edited_last".
for part in ("first", "last"):
    cleaned = replace_diacritics(players[f"{part}_name"])
    players[f"edited_{part}"] = cleaned.str.lower()

In [12]:
# Show every distinct character used in the cleaned player names as one
# sorted string. Each column is glued together with str.join (linear time)
# instead of DataFrame.sum(), which concatenates the strings pairwise and so
# re-copies the growing result for every row.
name_chars = set()
for name_col in ("edited_first", "edited_last"):
    name_chars.update("".join(players[name_col]))
"".join(sorted(name_chars))

" '-.abcdefghijklmnopqrstuvwxyz"

In [14]:
# Persist the player table (original and edited name columns) without the index.
players_csv_path = "datasets/cleaned/players.csv"
players.to_csv(players_csv_path, index=False)

# 3. Star Wars characters

In [70]:
# Load the character names and strip accents up front.
raw_names = pd.read_csv("datasets/full_sw_names.csv")["name"]
names = replace_diacritics(raw_names)
allowed_chars = " '-.abcdefghijklmnopqrstuvwxyz"


def _only_allowed(text):
    """Return True when every character of `text` appears in allowed_chars."""
    return set(text).issubset(allowed_chars)


# Step 1: keep names whose lower-cased form uses only the allowed characters.
is_legal = names.str.lower().apply(_only_allowed)
legal_names = names[is_legal] # i.e., no numbers etc

# Step 2: drop names containing a word that starts with a lower-case letter.
has_lowercase_word = legal_names.str.contains(r' [a-z]', regex=True)
proper_names = legal_names[~has_lowercase_word] # i.e., nothing like "King of X"

# Step 3: drop placeholder entries whose text begins with "Unidentified".
is_unidentified = proper_names.str[:len("Unidentified")] == "Unidentified"
identified = proper_names[~is_unidentified]

In [87]:
# Peek at ten randomly chosen names that passed all three filters.
identified.sample(n=10)

19522    Zubindi Ebsuk
42588            Tenek
10806     Luha Kellaro
29445       Falco Sang
18361       Jaden Dala
45947     Rachel Gutek
18903       Emf Diddar
25459       Mankuskett
37674       Crank Flat
38266         Eradicus
Name: name, dtype: object

In [90]:
# Lower-case the surviving names. They come from `names`, which was already
# passed through replace_diacritics when it was loaded, so the diacritics
# pass here is expected to leave them unchanged.
identified = replace_diacritics(identified)
identified = identified.str.lower()

In [92]:
# Persist the cleaned character names without the index.
characters_csv_path = "datasets/cleaned/characters.csv"
identified.to_csv(characters_csv_path, index=False)

# 4. Get more info about players

In [115]:
import pybaseball
import numpy as np

In [96]:
# Switch on pybaseball's cache before issuing the query, then request Statcast
# data for the whole 2025 calendar year.
pybaseball.cache.enable()
season_window = {"start_dt": "2025-01-01", "end_dt": "2025-12-31"}
stats = pybaseball.statcast(**season_window)

This is a large query, it may take a moment to complete
Skipping offseason dates
Skipping offseason dates


  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
100%|█████████████████████████████████████████| 246/246 [00:56<00:00,  4.36it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


In [131]:
def _teams_by_role(pitch_log, role):
    """Return the distinct (player, team) pairs for `role` ("batter" or "pitcher").

    value_counts() collapses repeated pairs into one row each (most frequent
    first); the count itself is then dropped and the columns are renamed to
    the shared "player" / "team" schema.
    """
    team_col = f"{role}_team"
    pairs = pitch_log[[role, team_col]].value_counts().reset_index()
    return pairs.drop("count", axis=1).rename(columns={role: "player", team_col: "team"})


# In the "Top" half of an inning the batter is assigned the away team and the
# pitcher the home team; in every other row the assignment is reversed.
is_top_half = stats["inning_topbot"] == "Top"
stats["batter_team"] = np.where(is_top_half, stats["away_team"], stats["home_team"])
stats["pitcher_team"] = np.where(is_top_half, stats["home_team"], stats["away_team"])

batters = _teams_by_role(stats, "batter")
pitchers = _teams_by_role(stats, "pitcher")

# Stack both roles into one player/team table with a fresh index.
total = pd.concat([batters, pitchers], ignore_index=True)

In [132]:
# Look up each player id (treated as an MLBAM key) in pybaseball's id table.
player_ids = total["player"]
lookup_df = pybaseball.playerid_reverse_lookup(player_ids, key_type="mlbam")

In [137]:
# Attach the lookup-table columns to each player/team pair and drop exact
# duplicate rows. The join key is named explicitly: merge() without `on`
# joins on every column name the two frames share, so an extra shared column
# would silently change the join.
# NOTE(review): assumes "key_mlbam" is the only column the two frames have in
# common today, so the result is unchanged — confirm.
combined = (
    total.rename(columns={"player": "key_mlbam"})
    .merge(lookup_df, on="key_mlbam")
    .drop_duplicates()
)

In [139]:
# Build "first last" full names and store them, accent-free, in a new column.
full_names = combined["name_first"] + " " + combined["name_last"]
combined["name"] = replace_diacritics(full_names)

In [141]:
# Persist the player/team table with its lookup details, without the index.
team_info_csv_path = "datasets/cleaned/team_info.csv"
combined.to_csv(team_info_csv_path, index=False)