# Formatting the LPP txt files

In this notebook, we go from the raw LPP txt files to word-based TSV files (one row per word), through the following steps:
- Tokenizing the text into words
- Removing the blank space between a word and the `:` that follows it
- Capitalizing the first word of each sentence
- Removing the blank space between a dialogue dash `-` and the word that follows it

Different versions will be built:

- A first one with 300 ms words + 50 ms black screen, and an extra end-of-sentence delay of 200 ms.

- A second one with 250 ms words + 50 ms black screen, an extra end-of-sentence delay of 500 ms, and an additional 2 s pause at the end of each chapter.

### Bash commands preprocessing

In [None]:
# Punctuation clean-up, edited in place on every .txt file of the current directory:
#   s/ :/:/g   removes the blank before a colon
#   s/- /-/g   glues a dialogue dash to the word that follows it
# Both substitutions run in a single perl pass so that the .bak files keep the
# untouched originals; two consecutive `-pi.bak` passes would overwrite the
# first backup with the half-processed text.
!perl -pi.bak -e 's/ :/:/g; s/- /-/g' *.txt

# Word tokenizing: every blank becomes a line break and empty lines are dropped,
# giving one word per line in new_test_run<N>.txt.
# `tr` is used rather than sed 's/ /\n/g' because BSD/macOS sed does not expand
# \n in the replacement text.
!for f in `seq 1 9` ; do tr ' ' '\n' < text_french_run$f.txt | awk 'length($0) > 0 ' > new_test_run$f.txt; done


### Python commands tokenizing

In [4]:
import pandas as pd
import numpy as np
import copy
import string

## First version

In [9]:
# Timing parameters of the first version (all in seconds).
black_screen = 0.05
word_duration = 0.30
word_bs = black_screen + word_duration  # onset-to-onset interval between two words
first_onset = 0.7  # onset of the first word of every run
end_of_sentence_delay = 0.2

for i in range(1, 10):
    with open(f'./text_lpp/new_test_run{i}.txt') as temp_file:
        lpp = temp_file.read().splitlines()

    words = pd.Series(lpp, dtype=object)

    # A word closes a sentence when it carries '.', '?' or '!'. The test is
    # done on the word itself, not on the repr of a whole DataFrame row.
    ends_sentence = words.str.contains(r'[.?!]').astype(bool)
    # The first word of the run and every word following a sentence end open a sentence.
    starts_sentence = ends_sentence.shift(1, fill_value=True).astype(bool)

    # Upper-case the first letter of sentence-opening words. Leading punctuation
    # (dialogue dash, quote) is skipped and the rest of the word is left as is;
    # str.capitalize() would lower-case the remainder and would leave words
    # such as '-bonjour' untouched.
    capitalized = words.str.replace(
        r'^(\W*)(\w)',
        lambda match: match.group(1) + match.group(2).upper(),
        regex=True,
    )
    words = words.where(~starts_sentence, capitalized)

    # Regularly spaced onsets. Built from an integer range because np.arange
    # with a float step may return one element more than expected.
    n_words = len(words)
    df = pd.DataFrame({
        'word': words,
        'onset': first_onset + np.arange(n_words) * word_bs,
        'duration': np.full(n_words, word_duration),
    })

    df.to_csv(f'./txt_clean/run{i}_clean.tsv', sep='\t', index=False)

    # Same events with a longer black screen after each sentence: every word is
    # pushed back by one delay per sentence end located strictly before it.
    sentence_ends_before = ends_sentence.cumsum() - ends_sentence.astype(int)
    df_sentence_end = df.copy()
    df_sentence_end['onset'] = df['onset'] + sentence_ends_before * end_of_sentence_delay

    df_sentence_end.to_csv(f'./v1/run{i}_v1_word_0.3_end_sentence_0.2.tsv',sep='\t',index=False)

In [10]:
# Decoding event files: the rows of the clean TSV plus a `trial_type` column
# holding one dict per word, {'kind': 'word', 'word': <word without ASCII punctuation>}.
# NOTE(review): this reads ./txt_clean (regularly spaced onsets, no end-of-sentence
# delay), and that file is rewritten by the second-version cell with its own
# durations -- confirm this is the intended input for the v1 decoding files.
strip_punctuation = str.maketrans('', '', string.punctuation)

for i in np.arange(1,10):
    df_clean = pd.read_csv(f'./txt_clean/run{i}_clean.tsv', sep='\t')
    df_clean['trial_type'] = [
        {'kind': 'word', 'word': str(token).translate(strip_punctuation)}
        for token in df_clean['word']
    ]
    df_clean.to_csv(f'./decoding_tsv_v1/run{i}_v1.tsv', sep='\t', index=False)

## Second version

In [12]:
# Timing parameters of the second version (all in seconds).
black_screen = 0.05
word_duration = 0.25
end_of_sentence_delay = 0.5
word_bs = black_screen + word_duration  # onset-to-onset interval between two words
end_of_chapter_duration = 2
first_onset = 0.7  # onset of the first word of every run

# Run number -> 1-based line numbers, in new_test_run<run>.txt, of the words
# that come right after a chapter break. The word on the line just before is
# the one followed by the extra chapter pause.
dict_end_chapter = {
    1:[433,1087],
    2:[737,1400],
    3:[710,1345],
    4:[1090,1357],
    5:[753],
    6:[716,951,1271,1555],
    7:[200,1278],
    8:[95,703],
    9:[1357],
}

for i in range(1, 10):
    with open(f'./text_lpp/new_test_run{i}.txt') as temp_file:
        lpp = temp_file.read().splitlines()

    words = pd.Series(lpp, dtype=object)

    # A word closes a sentence when it carries '.', '?' or '!'. The test is
    # done on the word itself, not on the repr of a whole DataFrame row.
    ends_sentence = words.str.contains(r'[.?!]').astype(bool)
    # The first word of the run and every word following a sentence end open a sentence.
    starts_sentence = ends_sentence.shift(1, fill_value=True).astype(bool)

    # Upper-case the first letter of sentence-opening words. Leading punctuation
    # (dialogue dash, quote) is skipped and the rest of the word is left as is;
    # str.capitalize() would lower-case the remainder and would leave words
    # such as '-bonjour' untouched.
    capitalized = words.str.replace(
        r'^(\W*)(\w)',
        lambda match: match.group(1) + match.group(2).upper(),
        regex=True,
    )
    words = words.where(~starts_sentence, capitalized)

    # Regularly spaced onsets. Built from an integer range because np.arange
    # with a float step may return one element more than expected.
    n_words = len(words)
    df = pd.DataFrame({
        'word': words,
        'onset': first_onset + np.arange(n_words) * word_bs,
        'duration': np.full(n_words, word_duration),
    })

    # NOTE(review): same path as the clean file written by the first-version
    # cell, so this overwrites it with the 0.25 s durations.
    df.to_csv(f'./txt_clean/run{i}_clean.tsv', sep='\t', index=False)

    # Word at 0-based position p ends a chapter when line number p + 2 (the
    # 1-based line of the next word) is listed for this run.
    ends_chapter = pd.Series(
        np.isin(np.arange(n_words) + 2, dict_end_chapter[i]),
        index=df.index,
    )
    for chapter_last_word in df.loc[ends_chapter, 'word']:
        print(f'Adding 2s after the word {chapter_last_word} \n')

    # Every word is pushed back by one sentence delay per sentence end, and one
    # chapter pause per chapter end, located strictly before it.
    sentence_ends_before = ends_sentence.cumsum() - ends_sentence.astype(int)
    chapter_ends_before = ends_chapter.cumsum() - ends_chapter.astype(int)
    df_sentence_end = df.copy()
    df_sentence_end['onset'] = (
        df['onset']
        + sentence_ends_before * end_of_sentence_delay
        + chapter_ends_before * end_of_chapter_duration
    )

    df_sentence_end.to_csv(f'./v2/run{i}_v2_0.25_0.5.tsv',sep='\t',index=False)

# Decoding event files: the rows of the v2 TSV plus a `trial_type` column
# holding one dict per word, {'kind': 'word', 'word': <word without ASCII punctuation>}.
strip_punctuation = str.maketrans('', '', string.punctuation)

for i in range(1, 10):
    df_clean = pd.read_csv(f'./v2/run{i}_v2_0.25_0.5.tsv',sep='\t')
    df_clean['trial_type'] = [
        {'kind': 'word', 'word': str(token).translate(strip_punctuation)}
        for token in df_clean['word']
    ]
    df_clean.to_csv(f'./decoding_tsv_v2/run{i}_v2.tsv',sep='\t',index=False)

In [8]:
# Chapter-pause check: rebuilds the word events of every run with ONLY the
# chapter pauses applied (no end-of-sentence delay) and prints the word that
# precedes each pause. Nothing is written to disk; `df` and `df_sentence_end`
# of the last run stay in the namespace.
black_screen = 0.05
word_duration = 0.25
end_of_sentence_delay = 0.5  # not used in this cell
word_bs = black_screen + word_duration  # onset-to-onset interval between two words
end_of_chapter_duration = 2
first_onset = 0.7  # onset of the first word of every run

# Run number -> 1-based line numbers, in new_test_run<run>.txt, of the words
# that come right after a chapter break. The word on the line just before is
# the one followed by the extra chapter pause.
dict_end_chapter = {
    1:[433,1087],
    2:[737,1400],
    3:[710,1345],
    4:[1090,1357],
    5:[753],
    6:[716,951,1271,1555],
    7:[200,1278],
    8:[95,703],
    9:[1357]
}

for i in range(1, 10):
    with open(f'./text_lpp/new_test_run{i}.txt') as temp_file:
        lpp = temp_file.read().splitlines()

    words = pd.Series(lpp, dtype=object)

    # A word closes a sentence when it carries '.', '?' or '!'. The test is
    # done on the word itself, not on the repr of a whole DataFrame row.
    ends_sentence = words.str.contains(r'[.?!]').astype(bool)
    # The first word of the run and every word following a sentence end open a sentence.
    starts_sentence = ends_sentence.shift(1, fill_value=True).astype(bool)

    # Upper-case the first letter of sentence-opening words. Leading punctuation
    # (dialogue dash, quote) is skipped and the rest of the word is left as is;
    # str.capitalize() would lower-case the remainder and would leave words
    # such as '-bonjour' untouched.
    capitalized = words.str.replace(
        r'^(\W*)(\w)',
        lambda match: match.group(1) + match.group(2).upper(),
        regex=True,
    )
    words = words.where(~starts_sentence, capitalized)

    # Regularly spaced onsets. Built from an integer range because np.arange
    # with a float step may return one element more than expected.
    n_words = len(words)
    df = pd.DataFrame({
        'word': words,
        'onset': first_onset + np.arange(n_words) * word_bs,
        'duration': np.full(n_words, word_duration),
    })

    # Word at 0-based position p ends a chapter when line number p + 2 (the
    # 1-based line of the next word) is listed for this run.
    ends_chapter = pd.Series(
        np.isin(np.arange(n_words) + 2, dict_end_chapter[i]),
        index=df.index,
    )
    for chapter_last_word in df.loc[ends_chapter, 'word']:
        print(f'Adding 2s after the word {chapter_last_word} \n')

    # Every word is pushed back by one chapter pause per chapter end located
    # strictly before it.
    chapter_ends_before = ends_chapter.cumsum() - ends_chapter.astype(int)
    df_sentence_end = df.copy()
    df_sentence_end['onset'] = df['onset'] + chapter_ends_before * end_of_chapter_duration
            

Adding 2s after the word raisonnable... 

Adding 2s after the word prince. 

Adding 2s after the word vieillir. 

Adding 2s after the word l'urgence. 

Adding 2s after the word larmes! 

Adding 2s after the word l'aimer." 

Adding 2s after the word voyage. 

Adding 2s after the word voyage. 

Adding 2s after the word voyage. 

Adding 2s after the word fleur. 

Adding 2s after the word an. 

Adding 2s after the word turent. 

Adding 2s after the word fleur. 

Adding 2s after the word pleura. 

Adding 2s after the word souvenir. 

Adding 2s after the word fontaine..." 

Adding 2s after the word jour. 

Adding 2s after the word sable. 



In [13]:
# Rows 1350-1399 (by position) of the events table left by the last processed run.
df_sentence_end.iloc[1350:1400]

Unnamed: 0,word,onset,duration
1350,de,488.7,0.25
1351,"bruit,",489.0,0.25
1352,à,489.3,0.25
1353,cause,489.6,0.25
1354,du,489.9,0.25
1355,sable.,490.2,0.25
1356,Et,493.0,0.25
1357,maintenant,493.3,0.25
1358,bien,493.6,0.25
1359,"sûr,",493.9,0.25
