# Filtering based on initial research + additions

In [13]:
import pandas as pd
import numpy as np

In [14]:
# Load the preprocessed dataset; the filtering steps below all start from `df`.
df = pd.read_csv(filepath_or_buffer='ous_data/data_preprocessed.csv')


## Filtering

In [15]:
# Exclusion pipeline.
# Every stage is stored under its own name so that the per-step sample
# sizes can be reported afterwards.

# Step 1 - complete responses: drop rows with a missing value in any
# column whose name begins with 'IB' or 'IH'.
ib_ih_cols = [name for name in df.columns if name.startswith(('IB', 'IH'))]
ous_complete = df.dropna(subset=ib_ih_cols)

# Step 2 - careless-response checks: the three check items must have been
# answered with 2, 2 and 1 respectively.
ous_correct = ous_complete.loc[
    ous_complete['careless_1'].eq(2)
    & ous_complete['careless_2'].eq(2)
    & ous_complete['careless_3'].eq(1)
]

# Step 3 - not confused by the materials: a confusion rating must be
# present and strictly below 3.
ous_confuse = ous_correct.loc[
    ous_correct['confusion'].notna()
    & ous_correct['confusion'].lt(3)
]

# Step 4 - no technical problems (rows where the column equals 1).
ous_technical = ous_confuse.loc[ous_confuse['technical_problems'].eq(1)]

# Step 5 - native-language speakers (rows where the column equals 1).
ous_native = ous_technical.loc[ous_technical['native_language'].eq(1)]

# Step 6 - keep only the questionnaire languages that met configural
# invariance.
valid_languages = [
    "EN", "FR", "IT", "RU", "SK", "TR",
    "ES-ARG", "CHI", "COL-ESP", "ES-EC", "ESP-ESP",
    "ES-MX", "ES-PE", "ESP-SAL", "ES-ES",
]

# Numeric code for each questionnaire language. Note that TR is 7, because
# 6 is shared by all Spanish variants.
language_mapping = {
    "EN": 1, "FR": 2, "IT": 3, "RU": 4, "SK": 5, "TR": 7,
    # every Spanish variant collapses to the single code 6
    "ES-ARG": 6, "CHI": 6, "COL-ESP": 6, "ES-EC": 6, "ESP-ESP": 6,
    "ES-MX": 6, "ES-PE": 6, "ESP-SAL": 6, "ES-ES": 6,
}

# Columns kept in the final data set: the IB and IH items plus the numeric
# language code. (A 'continent' column was stubbed out here and is
# currently disabled.)
final_cols = [
    'IB1', 'IB2', 'IB3', 'IB4', 'IB5',
    'IH1', 'IH2', 'IH3', 'IH4',
    'lang',
]

# Restrict to the valid languages, attach the numeric `lang` code derived
# from `Q_Lang`, and keep only the final columns.
ous_align = (
    ous_native.loc[ous_native['Q_Lang'].isin(valid_languages)]
    .assign(lang=lambda frame: frame['Q_Lang'].map(language_mapping))
    .loc[:, final_cols]
)

## Report sample sizes and export to CSV

In [16]:
# Optional verification: report how many rows remain after each exclusion step.
print(
    f"Initial sample size: {len(df)}",
    f"Complete responses: {len(ous_complete)}",
    f"Correct careless checks: {len(ous_correct)}",
    f"Not confused: {len(ous_confuse)}",
    f"No technical problems: {len(ous_technical)}",
    f"Native speakers: {len(ous_native)}",
    f"Final aligned sample: {len(ous_align)}",
    sep="\n",
)

# Row counts per numeric language code, ordered by code.
print(
    "\nLanguage distribution:",
    ous_align['lang'].value_counts().sort_index(),
    sep="\n",
)

# Save the final sample as CSV: the header row IS written, the row index is not.
ous_align.to_csv('ous_data/ous_align2.csv', header=True, index=False)

Initial sample size: 27590
Complete responses: 27590
Correct careless checks: 27067
Not confused: 24092
No technical problems: 21988
Native speakers: 20579
Final aligned sample: 10677

Language distribution:
lang
1    6325
2    1096
3     401
4     414
5     461
6     869
7    1111
Name: count, dtype: int64
