In [22]:
"""
To be deleted when everything is done. I'm just using it to check the data types and the NaN values in all csv files,
print the markdown table, and run some other checks.
"""
import json
from pathlib import Path

import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

# Single example files, all for reader0 / text b0.
# NOTE(review): presumably kept for spot checks - confirm before removing.
PATHS = [
    # 'stimuli/aoi_texts/aoi/b0.ias',
    'stimuli/word_features/word_features_b0.csv',
    'participants/participant_data.csv',
    'eyetracking_data/fixations/reader0_b0_fixations.csv',
    'eyetracking_data/reader_rm_wf/reader0_b0_merged.csv',
    'eyetracking_data/reading_measures/reader0_b0_rm.csv',
    'eyetracking_data/scanpaths/reader0_b0_scanpath.csv',
    'eyetracking_data/scanpaths_reader_rm_wf/reader0_b0_merged_sp_rm.csv',
]

# Folders whose files are scanned by the checking cells further down.
PATHS_FOLDERS = [
    'stimuli/aoi_texts/aoi/',
    'stimuli/word_features/',
    'eyetracking_data/fixations/',
    'eyetracking_data/reader_rm_wf/',
    'eyetracking_data/reading_measures/',
    'eyetracking_data/scanpaths/',
    'eyetracking_data/scanpaths_reader_rm_wf/',
    'stimuli/texts_and_questions/',
    'participants/',
]

# Word-feature columns that are handled as float64 (inspected as floats in the
# distribution check and cast to float64 when the word features are merged).
floats = [
    'type_length_chars',
    'lemma_length_chars',
    'type_length_syllables',
    'annotated_type_frequency_normalized',
    'type_frequency_normalized',
    'lemma_frequency_normalized',
    'familiarity_normalized',
    'regularity_normalized',
    'document_frequency_normalized',
    'sentence_frequency_normalized',
    'cumulative_syllable_corpus_frequency_normalized',
    'cumulative_syllable_lexicon_frequency_normalized',
    'cumulative_character_corpus_frequency_normalized',
    'cumulative_character_lexicon_frequency_normalized',
    'cumulative_character_bigram_corpus_frequency_normalized',
    'cumulative_character_bigram_lexicon_frequency_normalized',
    'cumulative_character_trigram_corpus_frequency_normalized',
    'cumulative_character_trigram_lexicon_frequency_normalized',
    'initial_letter_frequency_normalized',
    'initial_bigram_frequency_normalized',
    'initial_trigram_frequency_normalized',
    'avg_cond_prob_in_bigrams',
    'avg_cond_prob_in_trigrams',
    'neighbors_coltheart_higher_freq_cum_freq_normalized',
    'neighbors_coltheart_higher_freq_count_normalized',
    'neighbors_coltheart_all_cum_freq_normalized',
    'neighbors_coltheart_all_count_normalized',
    'neighbors_levenshtein_higher_freq_cum_freq_normalized',
    'neighbors_levenshtein_higher_freq_count_normalized',
    'neighbors_levenshtein_all_cum_freq_normalized',
    'neighbors_levenshtein_all_count_normalized',
]

In [29]:
# Build one codebook table (TSV) per data folder, summarising every column
# found in that folder's files.


def _describe_values(values, dtypes):
    """Summarise the values observed for one column.

    Parameters
    ----------
    values : list
        All cell values of the column, concatenated over every file of a folder.
    dtypes : list
        The distinct dtypes the column was parsed as across those files.

    Returns
    -------
    str, set or list
        * more than one dtype -> a string naming all of them
        * object column       -> the set of values if fewer than 10 distinct, else 'object'
        * float64 column      -> a string with min / max / mean / std
        * int64 column        -> the set of values if fewer than 10 distinct, else the interval
        * anything else       -> the dtype list itself
    """
    if len(dtypes) > 1:
        return 'more than one dtype: ' + str(dtypes)

    dtype = dtypes[0]

    if dtype == 'object':
        value_set = set(values)
        return value_set if len(value_set) < 10 else 'object'

    if dtype == 'float64':
        # pandas reductions skip NaN; the builtin min()/max() give
        # order-dependent results as soon as a NaN is in the list.
        series = pd.Series(values)
        return f"min: {series.min()}, max: {series.max()}, mean: {series.mean()}, std: {series.std()}"

    if dtype == 'int64':
        value_set = set(values)
        if len(value_set) < 10:
            return value_set
        return f"Interval: {min(values)}-{max(values)}"

    return dtypes


# every column name seen in any file of any folder
all_cols = set()

output_dir = Path('codebook_tables')
output_dir.mkdir(parents=True, exist_ok=True)

for folder in PATHS_FOLDERS:
    # column name -> accumulated values, dtypes and missing-value count for this folder
    cols = {}

    # the AOI folder holds .ias files, every other folder .tsv files
    suffix = '.ias' if folder == 'stimuli/aoi_texts/aoi/' else '.tsv'
    for file in Path(folder).glob(f'*{suffix}'):

        tsv = pd.read_csv(file, sep='\t')
        nans = tsv.isnull().sum()

        all_cols.update(tsv.columns)

        for c in tsv.columns:
            entry = cols.setdefault(c, {'values': [], 'dtypes': [], 'missing_values': 0})
            entry['values'] += tsv[c].tolist()
            entry['dtypes'].append(tsv[c].dtype)
            # sum the missing values over ALL files, not only the last one read
            entry['missing_values'] += int(nans[c])

    for v in cols.values():
        v['dtypes'] = list(set(v['dtypes']))
        v['poss_values'] = [_describe_values(v['values'], v['dtypes'])]

    records = [
        {
            'Column name': k,
            'Possible values': v['poss_values'][0],
            'Missing value': f"Number of missing values: {v['missing_values']}",
            # left empty on purpose: to be filled in by hand in the codebook
            'Description': pd.NA,
            'Source': pd.NA,
        }
        for k, v in cols.items()
    ]

    # explicit columns so that a folder without matching files still yields a header-only table
    df = pd.DataFrame(
        records,
        columns=['Column name', 'Possible values', 'Missing value', 'Description', 'Source'],
    )
    df.to_csv(output_dir / f'{Path(folder).stem}.tsv', sep='\t', index=False)


In [None]:
## investigate col values and distribution

# Strings that are parsed as missing values. Together with keep_default_na=False
# this replaces pandas' default list; 'NULL' and 'null' are not in it, so they
# are read as literal strings.
na_strings = ['#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND',
              '1.#QNAN', '<NA>', 'N/A', 'NA', 'NaN', 'None', 'n/a', 'nan', '']

# folder -> column name -> list of all values found in that folder
values = {}
for folder in PATHS_FOLDERS:
    values[folder] = {}

    # the AOI folder holds .ias files, every other folder .tsv files
    suffix = '.ias' if folder == 'stimuli/aoi_texts/aoi/' else '.tsv'
    frames = [
        pd.read_csv(path, sep='\t', keep_default_na=False, na_values=na_strings)
        for path in tqdm(Path(folder).glob(f'*{suffix}'), desc=f'Checking files in {folder}')
    ]

    if not frames:
        # nothing to inspect; without this guard the previous folder's frame would be reused
        print(f'No {suffix} files found in {folder}')
        continue

    # concatenate once instead of growing the frame file by file
    all_files = pd.concat(frames, ignore_index=True)

    print(len(all_files))
    # columns still in this list after the loop are neither float nor int
    cols = all_files.columns.tolist()
    for col in all_files.columns:
        column = all_files[col]
        values[folder][col] = column.tolist()

        if column.dtype == 'float64' or col in floats:
            print(col)
            cols.remove(col)
            try:
                print(f'min: {column.min()} max: {column.max()} mean: {column.mean()} std: {column.std()}')
                column.astype('float64').plot.kde()
                plt.show()

            except (TypeError, ValueError):
                # the column holds non-numeric entries: print them
                # (comparing or casting strings raises TypeError or ValueError)
                print([v for v in column.values if not isinstance(v, float)])

        elif column.dtype == 'int64':
            print(col)
            cols.remove(col)
            counts = column.value_counts()
            print(counts.to_dict())
            column.plot.hist()
            plt.show()

    print(cols)
    print(all_files.dtypes)
    # only the word-feature based folders have this column
    if 'type_length_chars' in all_files.columns:
        print(all_files['type_length_chars'].value_counts())

{'FD',
 'FFD',
 'FPF',
 'FPRT',
 'FPReg',
 'FRT',
 'Fix',
 'LP',
 'PoS_tag',
 'RBRT',
 'RPD_exc',
 'RPD_inc',
 'RR',
 'RRT',
 'SFD',
 'SL_in',
 'SL_out',
 'STTS_PoS_tag',
 'STTS_punctuation_after',
 'STTS_punctuation_before',
 'TFT',
 'TRC_in',
 'TRC_out',
 'acc_bq_1',
 'acc_bq_2',
 'acc_bq_3',
 'acc_tq_1',
 'acc_tq_2',
 'acc_tq_3',
 'age',
 'alcohol',
 'annotated_type_frequency_normalized',
 'avg_cond_prob_in_bigrams',
 'avg_cond_prob_in_trigrams',
 'bq_1',
 'bq_1_option1',
 'bq_1_option2',
 'bq_1_option3',
 'bq_1_option4',
 'bq_2',
 'bq_2_option1',
 'bq_2_option2',
 'bq_2_option3',
 'bq_2_option4',
 'bq_3',
 'bq_3_option1',
 'bq_3_option2',
 'bq_3_option3',
 'bq_3_option4',
 'char_index_in_line',
 'char_index_in_text',
 'character',
 'contains_abbreviation',
 'contains_hyphen',
 'contains_symbol',
 'correct_ans_bq_1',
 'correct_ans_bq_2',
 'correct_ans_bq_3',
 'correct_ans_tq_1',
 'correct_ans_tq_2',
 'correct_ans_tq_3',
 'cumulative_character_bigram_corpus_frequency_normalized',
 'c

In [45]:
## rename col names for all files

# load the mapping as csv and convert to dict with old value as key and new value as value
mapping = pd.read_csv('new_col_mapping.csv')
mapping = mapping.fillna('')
mapping = dict(zip(mapping['Actual name'], mapping['New name']))

# Column renaming is switched off: the files are only re-saved with a .tsv suffix.
# Set to True to rename every column that has an entry in `mapping`; an empty
# new name means "use the lower-cased old name".
APPLY_COLUMN_MAPPING = False

# Strings that are parsed as missing values (used together with keep_default_na=False).
na_strings = ['#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND',
              '1.#QNAN', '<NA>', 'N/A', 'NA', 'NaN', 'None', 'n/a', 'nan', '']

for folder in PATHS_FOLDERS:
    # materialise the listing first: files are created and removed while looping
    csv_paths = sorted(Path(folder).glob('*.csv'))
    for path in tqdm(csv_paths, desc=f'Renaming files in {folder}'):

        csv = pd.read_csv(path, sep='\t', keep_default_na=False, na_values=na_strings)

        if APPLY_COLUMN_MAPPING:
            csv = csv.rename(columns={
                col: (mapping[col] or col.lower())
                for col in csv.columns if col in mapping
            })

        # Write the frame to the new .tsv file and only then drop the old .csv.
        # Renaming first and writing to `path` afterwards would re-create the .csv.
        tsv_path = path.with_suffix('.tsv')
        csv.to_csv(tsv_path, sep='\t', index=False)
        path.unlink()

Renaming files in stimuli/word_features/: 12it [00:00, 159.50it/s]
Renaming files in eyetracking_data/fixations/: 900it [00:01, 460.42it/s]
Renaming files in eyetracking_data/reader_rm_wf/: 900it [00:04, 182.18it/s]
Renaming files in eyetracking_data/reading_measures/: 900it [00:01, 473.14it/s]
Renaming files in eyetracking_data/scanpaths/: 900it [00:02, 372.90it/s]
Renaming files in eyetracking_data/scanpaths_reader_rm_wf/: 900it [00:11, 79.41it/s]


In [None]:
## check duplicate columns

# The three lists are read in parallel: entry i of each list describes one
# comparison (folder, first column, second column).
all_paths = ['stimuli/word_features']
cols1 = ['PoS_tag']
cols2 = ['STTS_PoS_tag']

for path, col1, col2 in zip(all_paths, cols1, cols2):

    files = Path(path).glob('*.csv')
    report_name = f'stuff_to_check/mismatch_{Path(path).stem}_{col1}_{col2}.txt'

    with open(report_name, 'w', encoding='utf8') as f:

        for csv_path in files:

            # na_filter=False keeps every cell exactly as written in the file
            csv = pd.read_csv(csv_path, sep='\t', na_filter=False)

            rows = zip(csv[col1], csv[col2], csv['word'])

            for line_no, (v1, v2, w) in enumerate(rows, start=1):
                # case-insensitive comparison; matching rows are not reported
                if v1.lower() == v2.lower():
                    continue
                f.write(
                    f'{csv_path}\n'
                    f'Line: {line_no}\n'
                    f'{col1}: {v1}\t{col2}: {v2}\tword:{w}\n\n'
                )

In [26]:
## merge text tags and word features

# Columns removed after the merge: the two suffixed copies of 'word', the line
# index, and the individual tag columns listed below.
merge_drop_columns = [
    'word_y', 'word_x', 'line_index', 'PRELS', 'PRELAT', 'PPOSAT', 'PPER', 'PPOSS', 'PIDAT',
    'PIAT', 'PIS', 'PDAT', 'PDS', 'NN', 'NE', 'KOKOM',
    'KON', 'KOUS', 'KOUI', 'ITJ', 'FM', 'CARD', 'ART', 'APZR', 'APPO', 'APPRART', 'APPR',
    'ADV', 'ADJD', 'ADJA',
    'XY', 'VMPP', 'VMINF', 'VMFIN', 'VAPP', 'VAFIN', 'VVPP', 'VVIZU', 'VVINF', 'VVIMP',
    'VVFIN', 'TRUNC', 'PTKA',
    'PTKANT', 'PTKVZ', 'PTKNEG', 'PTKZU', 'PAV', 'PWAV', 'PWAT', 'PWS', 'PRF', 'STTS-tag',
]

# Columns (and their order) of the merged word-feature files.
merged_column_order = [
    'word', 'word_index_in_text', 'word_index_in_sent', 'sent_index_in_text',
    'word_limit_char_indices', 'text_id', 'is_technical_term',
    'word_length',
    'STTS_punctuation_before',
    'STTS_punctuation_after', 'is_in_quote',
    'is_in_parentheses', 'is_clause_beginning', 'is_sent_beginning', 'is_abbreviation',
    'contains_symbol', 'contains_hyphen',
    'contains_abbreviation', 'STTS_PoS_tag',
    'type', 'type_length_chars', 'PoS_tag', 'lemma', 'lemma_length_chars', 'syllables',
    'type_length_syllables', 'annotated_type_frequency_normalized', 'type_frequency_normalized',
    'lemma_frequency_normalized', 'familiarity_normalized', 'regularity_normalized',
    'document_frequency_normalized', 'sentence_frequency_normalized',
    'cumulative_syllable_corpus_frequency_normalized',
    'cumulative_syllable_lexicon_frequency_normalized',
    'cumulative_character_corpus_frequency_normalized',
    'cumulative_character_lexicon_frequency_normalized',
    'cumulative_character_bigram_corpus_frequency_normalized',
    'cumulative_character_bigram_lexicon_frequency_normalized',
    'cumulative_character_trigram_corpus_frequency_normalized',
    'cumulative_character_trigram_lexicon_frequency_normalized',
    'initial_letter_frequency_normalized',
    'initial_bigram_frequency_normalized', 'initial_trigram_frequency_normalized',
    'avg_cond_prob_in_bigrams',
    'avg_cond_prob_in_trigrams', 'neighbors_coltheart_higher_freq_cum_freq_normalized',
    'neighbors_coltheart_higher_freq_count_normalized',
    'neighbors_coltheart_all_cum_freq_normalized',
    'neighbors_coltheart_all_count_normalized',
    'neighbors_levenshtein_higher_freq_cum_freq_normalized',
    'neighbors_levenshtein_higher_freq_count_normalized',
    'neighbors_levenshtein_all_cum_freq_normalized',
    'neighbors_levenshtein_all_count_normalized',
]

for tag_path in tqdm(Path('stimuli/OLD/text_tags/').glob('*.csv'), desc='Merging text tags and word features'):
    filename = tag_path.stem
    word_features = pd.read_csv(f'stimuli/OLD/word_features/word_features_{filename}.csv', sep='\t')
    text_tags = pd.read_csv(tag_path, sep='\t')

    merged = (
        # row-wise merge on the index: row i of the word features belongs to row i of the tags
        pd.merge(word_features, text_tags, left_index=True, right_index=True)
        # rename text to word and drop word_y and word_x
        .rename(columns={'text': 'word'})
        .drop(columns=merge_drop_columns)
        [merged_column_order]
        # `floats` is the column list defined in the setup cell at the top of the notebook
        .astype({fl: 'float64' for fl in floats})
    )

    # an index merge silently drops rows if the two files differ in length
    assert len(merged) == len(word_features) == len(text_tags), (
        f'row count mismatch for {filename}: '
        f'{len(word_features)} word-feature rows vs {len(text_tags)} text-tag rows'
    )

    print(merged.dtypes)

    merged.to_csv(f'stimuli/word_features/word_features_{filename}.csv', sep='\t', index=False)

Merging text tags and word features: 12it [00:00, 75.31it/s]

word                                                          object
word_index_in_text                                             int64
word_index_in_sent                                             int64
sent_index_in_text                                             int64
word_limit_char_indices                                       object
text_id                                                       object
is_technical_term                                              int64
word_length                                                    int64
STTS_punctuation_before                                       object
STTS_punctuation_after                                        object
is_in_quote                                                    int64
is_in_parentheses                                              int64
is_clause_beginning                                            int64
is_sent_beginning                                              int64
is_abbreviation                   




In [None]:
# Convert the participant table from comma-separated to tab-separated, in place.
participants_path = Path('participants/participant_data.csv')

# Look at the header line first so that the cell can be re-run safely: reading
# an already converted (tab-separated) file with sep=',' would collapse it into
# a single column and write that back.
with participants_path.open(encoding='utf8') as handle:
    header = handle.readline()

if '\t' in header:
    # already tab-separated: just load it, nothing to rewrite
    part = pd.read_csv(participants_path, sep='\t')
else:
    part = pd.read_csv(participants_path, sep=',')
    part.to_csv(participants_path, sep='\t', index=False)