# Filtering based on initial research + additions

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the preprocessed survey data; `df` is the unfiltered frame that every
# later filtering step starts from.
df = pd.read_csv(filepath_or_buffer='ous_data/data_preprocessed.csv')


## Filtering

In [None]:
# Exclusion criteria, applied one after another. Each intermediate frame keeps
# its own name so the sample size remaining after every step can be reported.

# Step 1 - complete responses: drop rows with a missing value in any column
# whose name starts with 'IB' or 'IH'.
ib_ih_cols = [
    name for name in df.columns
    if name.startswith('IB') or name.startswith('IH')
]
ous_complete = df.dropna(subset=ib_ih_cols)

# Step 2 - careless-response checks: the three check items must equal
# 2, 2 and 1 respectively.
ous_correct = ous_complete.loc[
    ous_complete['careless_1'].eq(2)
    & ous_complete['careless_2'].eq(2)
    & ous_complete['careless_3'].eq(1)
]

# Step 3 - not confused by the materials: confusion must be answered and
# strictly below 3 (a missing answer also fails the `< 3` comparison).
ous_confuse = ous_correct.loc[
    ous_correct['confusion'].lt(3)
    & ous_correct['confusion'].notna()
]

# Step 4 - no technical problems (coded as 1).
ous_technical = ous_confuse.loc[ous_confuse['technical_problems'].eq(1)]

# Step 5 - native speakers only (coded as 1).
ous_native = ous_technical.loc[ous_technical['native_language'].eq(1)]

# Numeric language codes. Every Spanish variant shares code 6; note that
# Turkish is coded 7, so the codes do not follow the listing order.
# NOTE(review): "CHI" is grouped with the Spanish variants - presumably the
# Chilean version of the survey; confirm against the data dictionary.
spanish_variants = (
    "ES-ARG", "CHI", "COL-ESP", "ES-EC", "ESP-ESP",
    "ES-MX", "ES-PE", "ESP-SAL", "ES-ES",
)
language_mapping = {"EN": 1, "FR": 2, "IT": 3, "RU": 4, "SK": 5, "TR": 7}
language_mapping.update(dict.fromkeys(spanish_variants, 6))

# Languages that met configural invariance: exactly the codes that have a
# numeric mapping, in the same order.
valid_languages = list(language_mapping)

# Keep only those languages; copy so that columns can be added safely below.
keep_language = ous_native['Q_Lang'].isin(valid_languages)
ous_align = ous_native.loc[keep_language].copy()

# Numeric language variable derived from the survey-language code.
ous_align['lang'] = ous_align['Q_Lang'].map(language_mapping)



In [None]:
# Country code -> continent lookup, written grouped by continent and then
# flattened into the code-keyed dictionary used for mapping.
# NOTE(review): 'RUS' and 'TUR' are classified as Asia here; both are
# transcontinental, so confirm this matches the intended analysis grouping.
continent_members = {
    'Asia': (
        'IND', 'CHN', 'ARE', 'MYS', 'LBN', 'THA', 'PAK',
        'IRN', 'JPN', 'TUR', 'PHL', 'RUS', 'KAZ', 'SGP',
    ),
    'North America': ('USA', 'CAN', 'MEX'),
    'Europe': (
        'GBR', 'FRA', 'MKD', 'HUN', 'SVK', 'CZE', 'SRB',
        'DEU', 'AUT', 'POL', 'DNK', 'ITA', 'PRT', 'GRC',
        'ESP', 'BGR', 'NLD', 'HRV', 'ROU', 'CHE',
    ),
    'South America': ('COL', 'ARG', 'ECU', 'CHL', 'PER', 'BRA'),
    'Oceania': ('AUS', 'NZL'),
}
country_to_continent = {}
for continent_name, codes in continent_members.items():
    for code in codes:
        country_to_continent[code] = continent_name

# Demographic columns copied from the unfiltered frame under analysis-friendly
# names (new name -> source column in `df`). Assignment aligns on the row
# index, so only the rows still present in `ous_align` receive values.
demographic_sources = {
    'education_level': 'education_leve',
    'sex': 'sex',
    'country_origin': 'countr_origin_1',
    'country': 'country3',
    'age': 'Age',
}
for new_name, source_name in demographic_sources.items():
    ous_align[new_name] = df[source_name]

# Continent derived from the country code; codes missing from the lookup
# end up as NaN.
ous_align['continent'] = df['country3'].map(country_to_continent)

# Final variable selection: the nine scale items followed by the language code
# and the demographic columns, in export order.
final_cols = [
    'IB1', 'IB2', 'IB3', 'IB4', 'IB5',
    'IH1', 'IH2', 'IH3', 'IH4',
    'lang', 'education_level', 'sex', 'country_origin',
    'country', 'age', 'continent',
]
ous_align = ous_align[final_cols]


## Export to CSV

In [None]:
# Optional verification: report how many rows remain after each filtering step.
stages = [
    ("Initial sample size", df),
    ("Complete responses", ous_complete),
    ("Correct careless checks", ous_correct),
    ("Not confused", ous_confuse),
    ("No technical problems", ous_technical),
    ("Native speakers", ous_native),
    ("Final aligned sample", ous_align),
]
for label, frame in stages:
    print(f"{label}: {len(frame)}")

# Number of rows per numeric language code, ordered by code.
print("\nLanguage distribution:")
language_counts = ous_align['lang'].value_counts().sort_index()
print(language_counts)

# Write the final sample to CSV: the header row is included, the row index
# is not.
ous_align.to_csv('ous_data/ous_align2.csv', header=True, index=False)