In [None]:
# Minimal dependencies for the cleaning pipeline
!pip -q install pandas==2.3.2 numpy==2.3.2 tqdm==4.67.1 nltk==3.9.1


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m82.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.2 which is incompatible.
tensorflow 2.19.0 requires numpy<2.2.0,>=1.26.0, but you have numpy 2.3.2 which is incompatible.
cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.2 which is incompatible.
cupy-cuda12x 13.3.0 requires numpy<2.3,>=1.22, b

In [None]:
from getpass import getpass
import os

from google.colab import drive
from huggingface_hub import login

# Make the Drive folder that holds the project data available under /content/drive.
drive.mount('/content/drive')

# --------------- Hugging Face token ---------------
# Never paste the token into the notebook: a literal value ends up in the saved
# file and its history, and would also overwrite a real HF_TOKEN that is already
# configured for the runtime. Reuse the environment variable when present,
# otherwise ask for the token interactively (input is not echoed).
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")
login(os.environ["HF_TOKEN"])


Mounted at /content/drive


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
import os
import numpy as np
import pandas as pd
import json
import random
import collections
from tqdm import tqdm

import nltk

# Download the WordNet and OMW 1.4 corpora before the corpus reader is used.
for corpus_name in ("wordnet", "omw-1.4"):
    nltk.download(corpus_name)

from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

# Single shared lemmatizer instance used by the cleaning functions below.
lemmatizer = WordNetLemmatizer()

# ---------- Project root on Drive; it must contain the `data/` folder ----------
BASE_PATH = r"/content/drive/My Drive/associations-ANLP"

# Path helper: everything is addressed relative to BASE_PATH
def P(*parts):
    """Return the path obtained by joining ``BASE_PATH`` with ``parts``."""
    segments = (BASE_PATH,) + parts
    return os.path.join(*segments)

# Create the output folder under BASE_PATH/data if it is missing; exist_ok=True
# makes re-running this cell harmless. Only this folder is created here
# (no summary_tables directory).
os.makedirs(P("data", "intermediate_preprocess_dataset_using_LWOW_code"), exist_ok=True)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# ========== Only the functions actually used by the cleaning pipeline ==========

def loadSimplifiedSWOW():
    """Read the SWOW-EN R100 CSV and return a simplified, sorted DataFrame.

    Reads ``data/original_dataset/SWOW-EN.R100.csv`` and
    ``data/mapping_tables/country.csv`` (both resolved through ``P``).

    Returns:
        A DataFrame with the columns ``age, gender, nativeLanguage, country,
        education, cue, R1, R2, R3`` where
        - gender codes ``Fe``/``Ma``/``X`` are expanded to labels,
        - education codes 1.0-5.0 are expanded to labels, missing -> 'Unknown',
        - nativeLanguage is collapsed to 'English' / 'Not English',
        - countries absent from the mapping table's ``value`` column -> 'Unknown',
        - cues are converted with ``str`` (so a missing cue becomes 'nan'),
        - missing responses are empty strings,
        sorted by cue and then the demographic columns.
    """
    raw = pd.read_csv(P('data', 'original_dataset', 'SWOW-EN.R100.csv'))
    known_countries = pd.read_csv(P('data', 'mapping_tables', 'country.csv'))

    swow = raw.copy()

    # Gender: expand the abbreviated codes
    gender_labels = {'Fe': 'Female', 'Ma': 'Male', 'X': 'Unknown'}
    swow = swow.replace({'gender': gender_labels})

    # Education: fill gaps first, then translate the numeric codes
    education_labels = {
        1.0: 'No education',
        2.0: 'Elementary school',
        3.0: 'High school',
        4.0: 'Bachelor degree',
        5.0: 'Master degree',
    }
    swow['education'] = swow['education'].fillna('Unknown')
    swow = swow.replace({'education': education_labels})

    # Native language: binary English / Not English flag
    english_values = [
        'Australia', 'Canada', 'Ireland', 'New Zealand',
        'United Kingdom', 'United States', 'Other_English',
    ]
    speaks_english = swow['nativeLanguage'].isin(english_values)
    swow['nativeLanguage'] = np.where(speaks_english, 'English', 'Not English')

    # Country: anything not listed in the mapping table becomes 'Unknown'
    country_is_known = swow['country'].isin(known_countries.value.values)
    swow['country'] = np.where(country_is_known, swow['country'], 'Unknown')

    # Cue: force every value to a string
    swow['cue'] = list(map(str, swow['cue'].values))

    # Responses: missing values become blanks
    for response_col in ('R1', 'R2', 'R3'):
        swow[response_col] = swow[response_col].fillna('')

    # Keep only the columns used downstream, then sort (kept for parity)
    demographic_cols = ['age', 'gender', 'nativeLanguage', 'country', 'education']
    swow = swow[demographic_cols + ['cue', 'R1', 'R2', 'R3']]
    return swow.sort_values(by=['cue'] + demographic_cols)

def loadTextFile(filename, encoding='utf-8'):
    """Read a text file into a list with one entry per line.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding.

    Returns:
        A list in file order. Each line is stripped of surrounding whitespace;
        a line that still contains a tab becomes a list of its tab-separated
        fields, any other line (including a blank one) stays a plain string.
    """
    out = []
    with open(filename, 'r', encoding=encoding) as f:
        # Iterate the handle directly instead of materialising all lines first.
        for line in f:
            line = line.strip()
            out.append(line.split('\t') if '\t' in line else line)
    return out

# ---------- Cleaning stack ----------
def cue100(df1, unqCues):
    """Force every cue in ``unqCues`` to have exactly 100 rows.

    Steps:
      1. Rows whose cue is not in ``unqCues`` are discarded.
      2. For a cue with more than 100 rows, the surplus rows are removed by a
         seeded random sample of that cue's index labels.
      3. For a cue with fewer than 100 rows, rows with empty ``R1``/``R2``/``R3``
         are appended to reach 100.
      4. For a cue in ``unqCues`` with no rows at all, 100 such rows are appended.

    Args:
        df1: DataFrame with at least the columns ``cue``, ``R1``, ``R2``, ``R3``.
        unqCues: Iterable of cue values to keep.

    Returns:
        A new DataFrame. Appended rows only carry ``cue``/``R1``/``R2``/``R3``;
        any other column is NaN for them. When rows are appended the index is
        reset to 0..n-1; otherwise the surviving index labels are kept.
    """
    target = 100
    # A private generator with the same seed yields the same sample as
    # `random.seed(30)` did, without reseeding the module-level generator that
    # the rest of the notebook may be using.
    rng = random.Random(30)

    df1 = df1[df1['cue'].isin(list(unqCues))]  # Only keep cues in original
    cue_counts = collections.Counter(list(df1['cue']))

    # Remove surplus rows for cues that appear more than 100 times
    rows_to_remove = []
    for cue, count in cue_counts.items():
        if count > target:
            cue_index = df1.index[df1['cue'] == cue].tolist()
            rows_to_remove.extend(rng.sample(cue_index, count - target))
    df = df1.drop(rows_to_remove) if rows_to_remove else df1.copy()

    # Collect the cue of every blank row to append, then concatenate once:
    # calling pd.concat per cue re-copies the whole frame on each iteration.
    pad_cues = []
    for cue, count in cue_counts.items():
        if count < target:
            pad_cues.extend([cue] * (target - count))

    # Cues completely missing from the data, in the order given by `unqCues`
    # (set iteration order would vary between runs).
    present = set(df['cue'])
    for cue in dict.fromkeys(unqCues):
        if cue not in present:
            pad_cues.extend([cue] * target)

    if pad_cues:
        padding = pd.DataFrame({'cue': pad_cues, 'R1': '', 'R2': '', 'R3': ''})
        df = pd.concat([df, padding], ignore_index=True)
    return df

def NA2Blank(df1):
    """Return a copy where every non-string cue/response value is replaced by ''."""
    blanked = {
        name: [value if isinstance(value, str) else '' for value in df1[name]]
        for name in ('cue', 'R1', 'R2', 'R3')
    }
    return df1.assign(**blanked)

def Lowercase(df1):
    """Return a copy with the cue and response columns lower-cased.

    Every value in those columns must be a string.
    """
    lowered = {
        name: [text.lower() for text in df1[name]]
        for name in ('cue', 'R1', 'R2', 'R3')
    }
    return df1.assign(**lowered)

def RemoveUnderscore(df1):
    """Return a copy where underscores in cues and responses become spaces.

    Every value in those columns must be a string.
    """
    spaced = {
        name: [text.replace('_', ' ') for text in df1[name]]
        for name in ('cue', 'R1', 'R2', 'R3')
    }
    return df1.assign(**spaced)

def RemoveRespArticles(df1, unqCues):
    """Strip a leading 'a ', 'an ', 'the ' or 'to ' from responses.

    The prefixes are tried in that order for each of ``R1``/``R2``/``R3``. A
    response is left alone for a given prefix when its current value is itself
    one of ``unqCues``. Because the check is repeated per prefix, a response can
    lose more than one prefix (e.g. 'a the dog' -> 'the dog' -> 'dog').

    Returns a modified copy; ``df1`` is not changed.
    """
    result = df1.copy()
    leading_words = ('a ', 'an ', 'the ', 'to ')
    for response_col in ('R1', 'R2', 'R3'):
        for word in leading_words:
            # Re-read the column each time: the previous prefix may have changed it.
            current = result[response_col]
            has_prefix = current.str.startswith(word)
            is_cue = current.isin(unqCues)
            to_strip = has_prefix & ~is_cue
            result.loc[to_strip, response_col] = (
                result.loc[to_strip, response_col].str.slice(start=len(word))
            )
    return result

def AddSpaceOrHyphen(df1, missingDict):
    """Replace cue/response values that are keys of ``missingDict``.

    A value found in ``missingDict`` is swapped for its mapped form; every other
    value is kept as it was. Returns a new DataFrame.
    """
    remapped = {}
    for name in ('cue', 'R1', 'R2', 'R3'):
        original = df1[name]
        # map() yields NaN for values without an entry; fall back to the original.
        remapped[name] = original.map(missingDict).fillna(original)
    return df1.assign(**remapped)

def Spelling(df1, spelling_dict):
    """Apply the ``spelling_dict`` lookup to the cue and response columns.

    Values that are keys of ``spelling_dict`` are replaced by their mapped
    spelling; all other values pass through unchanged. Returns a new DataFrame.
    """
    corrected = df1.copy()
    for name in ('cue', 'R1', 'R2', 'R3'):
        looked_up = corrected[name].map(spelling_dict)
        # Entries without a mapping come back as NaN; keep the original there.
        corrected[name] = looked_up.fillna(corrected[name])
    return corrected

def Lemmatization(df1, lemmatize_fn=None):
    """Lemmatize the cue and response columns and apply two fixed overrides.

    Args:
        df1: DataFrame whose ``cue``, ``R1``, ``R2`` and ``R3`` columns hold strings.
        lemmatize_fn: Optional callable mapping one string to its lemma. When
            omitted, the module-level ``lemmatizer.lemmatize`` is used.

    Returns:
        A copy of ``df1`` in which every cue/response has been lemmatized and an
        entry that is exactly 'men' or 'hands' afterwards is replaced by 'man' /
        'hand'.
    """
    if lemmatize_fn is None:
        lemmatize_fn = lemmatizer.lemmatize

    # Whole-entry overrides. These used to be applied with str.replace, which
    # rewrites the letters anywhere inside a word and corrupted unrelated
    # entries ('moment' -> 'momant', 'cement' -> 'cemant', 'handsome' ->
    # 'handome'). Matching the full entry changes only the intended words.
    overrides = {'men': 'man', 'hands': 'hand'}

    df = df1.copy()
    for col in ['cue', 'R1', 'R2', 'R3']:
        lemmas = [lemmatize_fn(x) for x in df[col]]
        df[col] = [overrides.get(x, x) for x in lemmas]
    return df

def RemoveCueResp(df1):
    """Blank out any response that is identical to the cue of its own row."""
    cue = df1['cue']
    blanked = {
        name: np.where(df1[name] == cue, '', df1[name])
        for name in ('R1', 'R2', 'R3')
    }
    return df1.assign(**blanked)

def RemoveDupeResp(df1):
    """Blank out repeated responses within a row, keeping the first occurrence.

    ``R3`` is cleared when it equals ``R1`` or ``R2``; ``R2`` is cleared when it
    equals ``R1``. Returns a new DataFrame.
    """
    first, second, third = df1['R1'], df1['R2'], df1['R3']
    third_is_repeat = (third == first) | (third == second)
    second_is_repeat = second == first
    return df1.assign(
        R3=np.where(third_is_repeat, '', third),
        R2=np.where(second_is_repeat, '', second),
    )

def ShiftResp(df1):
    """Shift responses left so that blanks only trail filled slots.

    Four moves are applied in a fixed order, each followed by clearing the slot
    the value was taken from:

        _ _ X -> X _ _
        _ X _ -> X _ _
        _ X X -> X _ X
        X _ X -> X X _

    The clearing step compares the two slots on every row, so a source slot is
    also blanked on rows where it already equalled the target slot.

    Returns a modified copy; ``df1`` is not changed.
    """
    out = df1.copy()

    def blank(col):
        return out[col] == ''

    def filled(col):
        return out[col] != ''

    def pull(source, target, rows):
        # Copy source into target on the selected rows, then empty the source
        # wherever the two slots now hold the same value.
        out[target] = np.where(rows, out[source], out[target])
        out[source] = np.where(out[target] == out[source], '', out[source])

    pull('R3', 'R1', blank('R1') & blank('R2') & filled('R3'))    # _ _ X -> X _ _
    pull('R2', 'R1', blank('R1') & filled('R2') & blank('R3'))    # _ X _ -> X _ _
    pull('R2', 'R1', blank('R1') & filled('R2') & filled('R3'))   # _ X X -> X _ X
    pull('R3', 'R2', filled('R1') & blank('R2') & filled('R3'))   # X _ X -> X X _
    return out

def SortColumns(df1):
    """Keep only ``cue``, ``R1``, ``R2``, ``R3`` (in that order) and sort rows by them."""
    ordered = ['cue', 'R1', 'R2', 'R3']
    return df1.copy()[ordered].sort_values(by=ordered)

def cleaningPipeline(df1, unqCues, missingDict, spelling_dict, name):
    """Run the full cleaning sequence on a cue/response DataFrame.

    Args:
        df1: Input frame with ``cue``, ``R1``, ``R2`` and ``R3`` columns.
        unqCues: Cue values to keep; forwarded to ``RemoveRespArticles`` and ``cue100``.
        missingDict: Lookup forwarded to ``AddSpaceOrHyphen``.
        spelling_dict: Lookup forwarded to ``Spelling``.
        name: Accepted for the caller's convenience; not used by this function.

    Returns:
        The frame produced by applying, in order: NA2Blank, Lowercase,
        RemoveUnderscore, RemoveRespArticles, AddSpaceOrHyphen, Spelling,
        Lemmatization, cue100, RemoveCueResp, RemoveDupeResp, ShiftResp and
        SortColumns.
    """
    stages = (
        NA2Blank,
        Lowercase,
        RemoveUnderscore,
        lambda frame: RemoveRespArticles(frame, unqCues),
        lambda frame: AddSpaceOrHyphen(frame, missingDict),
        lambda frame: Spelling(frame, spelling_dict),
        Lemmatization,
        lambda frame: cue100(frame, unqCues),
        RemoveCueResp,
        RemoveDupeResp,
        ShiftResp,
        SortColumns,
    )
    cleaned = df1.copy()
    for stage in stages:
        cleaned = stage(cleaned)
    return cleaned


In [None]:
# 1) Load spelling lookup
spelling_path = P('data', 'intermediate_preprocess_dataset_using_LWOW_code', 'mapping_tables', 'EnglishCustomDict.txt')
spelling = loadTextFile(spelling_path)
# loadTextFile returns a plain string for any line without a tab (e.g. a blank
# line). Unpacking such a string either raises or, for a two-character line,
# silently maps its first character to its second. Only tab-split rows are used;
# a row with a number of fields other than two still raises, as before.
spelling_dict = {
    a.lower(): b.lower()
    for a, b in (row for row in spelling if isinstance(row, list))
}

# 2) Load SWOW (Humans only) and build cue set
FA_SWOW = loadSimplifiedSWOW()
FA_SWOW = FA_SWOW.loc[FA_SWOW['cue'] != 'nan']  # drop rows whose cue is the literal string 'nan'

# Normalise each string cue in one pass: lowercase -> spelling lookup ->
# lemmatize, with 'men' mapped straight to 'man' instead of being lemmatized.
OrigCues = [
    'man' if fixed == 'men' else lemmatizer.lemmatize(fixed)
    for fixed in (
        spelling_dict.get(lowered, lowered)
        for lowered in (raw.lower() for raw in FA_SWOW['cue'] if isinstance(raw, str))
    )
]
unqCues = list(set(OrigCues))

# 3) Build missingDict (spaces/hyphens) from WordNet
# Lemma names of every synset, kept grouped per synset, then flattened and lower-cased.
wnWords = []
for syn in wn.all_synsets():
    wnWords.append(list(map(str, (lemma.name() for lemma in syn.lemmas()))))
wnWordsFlat = [lemma_name for group in wnWords for lemma_name in group]
wnWordsLower = list(map(str.lower, wnWordsFlat))

# Squashed spelling -> WordNet spelling (underscores removed as key, shown as
# spaces in the value; hyphens removed as key, original lemma as value).
noSpacesDict   = {word.replace('_', ''): word.replace('_', ' ') for word in wnWordsLower}
noHyphensDict  = {word.replace('-', ''): word for word in wnWordsLower}

# Keep only squashed spellings that are not WordNet lemma names themselves.
onlyNoSpaces   = list(set(noSpacesDict)  - set(wnWordsLower))
onlyNoHyphens  = list(set(noHyphensDict) - set(wnWordsLower))
onlyNoSpacesDict  = {key: noSpacesDict[key]  for key in onlyNoSpaces}
onlyNoHyphensDict = {key: noHyphensDict[key] for key in onlyNoHyphens}

# Hyphen entries take precedence when a key occurs in both lookups.
missingDict = {**onlyNoSpacesDict, **onlyNoHyphensDict}

# 4) Clean Humans (SWOW) only and save CSV
out_path = P('data', 'intermediate_preprocess_dataset_using_LWOW_code', 'FA_Humans.csv')

clean_humans = cleaningPipeline(
    df1=FA_SWOW,
    unqCues=unqCues,
    missingDict=missingDict,
    spelling_dict=spelling_dict,
    name='Humans',
)
clean_humans.to_csv(out_path, index=False)

# Report where the file went and how large the cleaned table is.
print(
    f"Saved Humans-only cleaned CSV to:\n  {out_path}",
    f"Cleaned shape: {clean_humans.shape}",
    sep="\n",
)


Saved Humans-only cleaned CSV to:
  /content/drive/MyDrive/ANLP_project_final/data/intermediate_preprocess_dataset_using_LWOW_code/FA_Humans.csv
Cleaned shape: (1154600, 4)
