In [None]:
!pip install -q \
    "numpy==2.2.6" \
    "pandas==2.3.1" \
    "openpyxl==3.1.5" \
    "et-xmlfile==2.0.0" \
    "python-dateutil==2.9.0.post0" \
    "pytz==2025.2" \
    "tzdata==2025.2" \
    "six==1.17.0"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m122.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m137.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.1 which is incompatible.
tensorflow 2.19.0 requires numpy<2.2.0,>=1.26.0, but you have numpy 2.2.6 which is incompatible.
cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.1 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.6 which is incompatible.

In [None]:
import os
from getpass import getpass

from google.colab import drive
from huggingface_hub import login

# Mount Google Drive at /content/drive so files stored there are reachable from this runtime.
drive.mount('/content/drive')

# --------------- Hugging Face token ---------------
# Never paste a real token into the notebook source: it would be saved with the file
# and leak when the notebook is shared. Use HF_TOKEN from the environment when it is
# already set; otherwise prompt for it without echoing the input.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
# Keep the environment variable populated, as the original cell did, so any later
# code that reads os.environ["HF_TOKEN"] keeps working.
os.environ["HF_TOKEN"] = hf_token
login(hf_token)


Mounted at /content/drive


Note: Environment variable `HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
import os
import pandas as pd
import numpy as np
import random

SEED = 42


def _count_unique_responses(frame: pd.DataFrame) -> int:
    """Return the number of distinct values pooled across the R1, R2 and R3 columns."""
    return len(pd.unique(frame[["R1", "R2", "R3"]].values.ravel()))


def _print_rows_per_cue(cue_counts: pd.Series) -> None:
    """Print min / max / mean / median of a rows-per-cue count series."""
    print(
        f"Rows per cue - min: {cue_counts.min()}, max: {cue_counts.max()}, "
        f"mean: {cue_counts.mean():.2f}, median: {cue_counts.median()}"
    )


def process(
    file_path: str,
    output_xlsx: str = None,
    seed: int = SEED,
    n_per_cue: int = 80,
) -> None:
    """Clean a cue/response CSV, print summary statistics and downsample it.

    Steps:
      1. Read ``file_path`` with ``pd.read_csv`` and lowercase the text in the
         ``cue``, ``R1``, ``R2`` and ``R3`` columns.
      2. Drop every row that has a missing value in ``R1``, ``R2`` or ``R3``.
      3. Print row/cue/response counts before and after the drop, plus how many
         cues have at least 10, 20, ..., 100 rows.
      4. Keep only cues with at least ``n_per_cue`` rows and sample exactly
         ``n_per_cue`` rows from each of them (``random_state=seed``).
      5. Lowercase every non-missing cell of the sampled table (values are
         converted with ``str`` first), optionally write it to ``output_xlsx``
         and print statistics for the downsampled table.

    Args:
        file_path: Path of the input CSV. It must contain the columns
            ``cue``, ``R1``, ``R2`` and ``R3``.
        output_xlsx: Destination Excel file. When ``None`` nothing is written.
            Missing parent directories are created.
        seed: Seed used for the per-cue sampling. It is also applied to the
            global ``numpy.random`` and ``random`` generators (side effect).
        n_per_cue: Number of rows to keep per cue; cues with fewer rows after
            cleaning are discarded. Defaults to 80.

    Raises:
        ValueError: If ``n_per_cue`` is smaller than 1.
        KeyError: If one of the required columns is missing from the CSV.
    """
    if n_per_cue < 1:
        raise ValueError(f"n_per_cue must be a positive integer, got {n_per_cue}")

    # ---- Reproducibility ----
    # Sampling below is seeded explicitly through random_state; the global seeds
    # are still set because callers may rely on that side effect.
    np.random.seed(seed)
    random.seed(seed)

    # ---- Load CSV (pandas defaults) ----
    df = pd.read_csv(file_path)

    text_columns = ["cue", "R1", "R2", "R3"]
    missing_columns = [col for col in text_columns if col not in df.columns]
    if missing_columns:
        # Everything below needs all four columns, so fail early with a clear message.
        raise KeyError(f"Input CSV is missing required column(s): {missing_columns}")

    # ---- Lowercase cue and response columns ----
    for col in text_columns:
        # The .str accessor raises on non-text columns (e.g. a column that is
        # entirely empty is read as float), so only touch text columns here.
        if pd.api.types.is_string_dtype(df[col].dtype):
            df[col] = df[col].str.lower()

    # ---- Original stats ----
    original_row_count = len(df)
    original_unique_cues = df["cue"].nunique()

    # ---- Drop rows with any missing R1/R2/R3 ----
    cleaned_df = df.dropna(subset=["R1", "R2", "R3"])
    new_row_count = len(cleaned_df)
    new_unique_cues = cleaned_df["cue"].nunique()

    # ---- Unique responses across R1-R3 ----
    num_unique_responses = _count_unique_responses(cleaned_df)

    # ---- Rows available per cue after cleaning ----
    cue_counts = cleaned_df["cue"].value_counts()

    # ---- Percentage of rows removed (0 for an input without data rows) ----
    if original_row_count:
        percent_removed = 100 * (original_row_count - new_row_count) / original_row_count
    else:
        percent_removed = 0.0

    # ---- Print main stats ----
    if output_xlsx is not None:
        # NOTE(review): this line is printed before anything is written, and the
        # only file written below is the downsampled table at the same path.
        print(f"Cleaned data saved to: {output_xlsx}")
    print(f"Original number of different cues: {original_unique_cues}")
    print(f"New number of different cues: {new_unique_cues}")
    print(f"Original number of rows: {original_row_count}")
    print(f"New number of rows: {new_row_count}")
    print(f"Percentage of rows removed: {percent_removed:.2f}%")
    print(f"Number of different responses (R1, R2, R3): {num_unique_responses}")
    _print_rows_per_cue(cue_counts)

    print("\nCues with at least N rows:")
    for threshold in range(10, 101, 10):
        count = (cue_counts >= threshold).sum()
        print(f" ≥ {threshold}: {count} cues")

    # ---- Filter to cues with at least n_per_cue rows ----
    valid_cues = cue_counts[cue_counts >= n_per_cue].index
    filtered_df = cleaned_df[cleaned_df["cue"].isin(valid_cues)]

    # ---- Sample exactly n_per_cue rows per cue (deterministic) ----
    sampled_df = (
        filtered_df.groupby("cue", group_keys=False)
        .sample(n=n_per_cue, random_state=seed)
        .reset_index(drop=True)
    )

    # ---- Lowercase everything in the final table ----
    def _lower_or_keep(value):
        # Missing values stay as they are; everything else becomes a lowercase string.
        return value if pd.isna(value) else str(value).lower()

    # Column-wise Series.map is the element-wise equivalent of DataFrame.map
    # and does not depend on that newer DataFrame method being available.
    sampled_df = sampled_df.apply(lambda column: column.map(_lower_or_keep))

    # ---- Save outputs ----
    if output_xlsx is not None:
        output_dir = os.path.dirname(output_xlsx)
        if output_dir:
            # os.makedirs("") raises, so skip it for a bare file name.
            os.makedirs(output_dir, exist_ok=True)
        sampled_df.to_excel(output_xlsx, index=False)  # openpyxl backend
        print(f"\nDownsampled dataset ({n_per_cue} rows per cue) saved to: {output_xlsx}")
    else:
        print(f"\nDownsampled dataset ({n_per_cue} rows per cue) not saved (no output path given)")

    # ---- Downsampled stats ----
    downsampled_row_count = len(sampled_df)
    downsampled_unique_cues = sampled_df["cue"].nunique()
    num_downsampled_unique_responses = _count_unique_responses(sampled_df)
    downsampled_cue_counts = sampled_df["cue"].value_counts()

    print(f"Downsampled number of different cues: {downsampled_unique_cues}")
    print(f"Downsampled number of rows: {downsampled_row_count}")
    print(f"Number of different responses (R1, R2, R3): {num_downsampled_unique_responses}")
    _print_rows_per_cue(downsampled_cue_counts)


In [None]:
# Project folder on the mounted Google Drive.
BASE_PATH = "/content/drive/My Drive/associations-ANLP"

# Input CSV and the Excel workbook that process() writes, both located under BASE_PATH.
INPUT_CSV = os.path.join(
    BASE_PATH,
    "data/intermediate_preprocess_dataset_using_LWOW_code/FA_Humans.csv",
)
OUTPUT_XLSX = os.path.join(
    BASE_PATH,
    "data/final_processed_SWOW_data/cleaned_data_FA_Humans.xlsx",
)

# Run the cleaning / downsampling pipeline with the default seed.
process(file_path=INPUT_CSV, output_xlsx=OUTPUT_XLSX)

✅ Cleaned data saved to: /content/drive/MyDrive/ANLP_project_final/data/final_processed_SWOW_data/cleaned_data_FA_Humans.xlsx
Original number of different cues: 11546
New number of different cues: 11393
Original number of rows: 1154600
New number of rows: 976223
Percentage of rows removed: 15.45%
Number of different responses (R1, R2, R3): 110485
Rows per cue - min: 32, max: 100, mean: 85.69, median: 88.0

📊 Cues with at least N rows:
 ≥ 10: 11393 cues
 ≥ 20: 11393 cues
 ≥ 30: 11393 cues
 ≥ 40: 11388 cues
 ≥ 50: 11368 cues
 ≥ 60: 11277 cues
 ≥ 70: 10834 cues
 ≥ 80: 8992 cues
 ≥ 90: 4468 cues
 ≥ 100: 13 cues

✅ Downsampled dataset (80 rows per cue) saved to: /content/drive/MyDrive/ANLP_project_final/data/final_processed_SWOW_data/cleaned_data_FA_Humans.xlsx
Downsampled number of different cues: 8992
Downsampled number of rows: 719360
Number of different responses (R1, R2, R3): 89815
Rows per cue - min: 80, max: 80, mean: 80.00, median: 80.0
