In [183]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import cmudict
from nltk.stem import WordNetLemmatizer

___
- Download wordnet for lemmatization
- Download cmudict for syllable count
___

In [184]:
# One-time setup: uncomment and run these two lines on a machine where the
# NLTK corpora have not been downloaded yet (wordnet is needed by the
# lemmatizer, cmudict by the syllable counter), then comment them out again.
# nltk.download('wordnet')
# nltk.download('cmudict')

___
- Variable information
- Path of input csv files
- Path of output csv file
___

In [185]:
# Single root folder holding this project's input and output files. It is the
# only machine-specific value: change it here when running somewhere else.
# The trailing slash is required because paths are built by string concatenation.
project_dir = '/Users/farhan/Desktop/Baycrest Documents/Aphasia_Study/Aphasia_STM_stim_generation/Syllable_Project/'

# Input: SUBTLEX-US frequency list (provides the Word and Zipfvalue columns used below).
subtlex_path = project_dir + 'SUBTLEX-US_frequency_list.csv'
# Input: "goodwords" list; presumably carries CMU-based syllable counts, judging
# by the file name -- confirm against the file itself.
goodwords_path = project_dir + 'goodwords_cmusyllables.csv'
# Folder that receives the generated csv files (file names are appended to it).
output_directory = project_dir

___
- Read in the required csv files
___

In [186]:
# Load both input tables with pandas' default csv parsing.
# Frequency list first, then the goodwords list.
subtlex = pd.read_csv(filepath_or_buffer=subtlex_path)
goodwords = pd.read_csv(filepath_or_buffer=goodwords_path)

___
- Function to get syllable count of word from cmudict
- returns -1 if word not in cmudict
___

In [187]:
# Load the CMU Pronouncing Dictionary once and reuse it for every lookup.
# If the corpus has not been downloaded on this machine the first attempt
# raises LookupError; fetch the corpus and retry instead of failing the run.
try:
    cdict = cmudict.dict()
except LookupError:
    nltk.download('cmudict')
    cdict = cmudict.dict()

def syllable_count(word, pronunciations=None):
    """Return the syllable count of ``word`` from a CMUdict-style dictionary.

    The lookup is case-insensitive. When a word has several pronunciations,
    only the first one listed is used. A syllable is counted for every phoneme
    whose last character is a digit (in CMUdict these are the stress-marked
    vowels, e.g. ``'AH0'``, ``'OW1'``).

    Parameters
    ----------
    word : str
        The word to look up. Anything that is not a string (for example the
        float NaN that pandas produces for cells it treats as missing) is
        reported as not found instead of raising.
    pronunciations : dict, optional
        Mapping of lower-case word -> list of pronunciations, each being a
        list of phoneme strings. Defaults to the module-level ``cdict``.

    Returns
    -------
    int
        Number of syllables, or -1 if the word is not a string, is absent
        from the dictionary, or has no pronunciation listed.
    """
    if not isinstance(word, str):
        return -1

    lookup = cdict if pronunciations is None else pronunciations
    entries = lookup.get(word.lower())
    if not entries:
        return -1

    # Only the first pronunciation matters; no need to count the others.
    # The [-1:] slice keeps an (unexpected) empty phoneme from raising.
    return sum(1 for phoneme in entries[0] if phoneme[-1:].isdigit())

___
- Step 1
- Filtering by frequency
- Keep rows with 4 < Zipfvalue < 5
___

In [188]:
# Step 1: keep only words whose Zipf value lies strictly between 4 and 5,
# then renumber the rows from 0.
subtlex = subtlex.loc[
    (subtlex.Zipfvalue > 4) & (subtlex.Zipfvalue < 5)
].reset_index(drop=True)

___
- Step 2a
- Add column with syllable count
- call syllable_count function on each word
- Apply the function to each word through pandas (this is still one Python call per word, not true vectorization)
___

In [189]:
# Step 2a: look up the syllable count of every word (-1 marks words that
# syllable_count could not resolve). Mapping over the Word column avoids
# building a Series per row, as the row-wise DataFrame.apply did, and still
# yields a proper column when the frame is empty.
subtlex["syllables"] = subtlex["Word"].map(syllable_count)

___
- Step 2b
- Add column with lemma of the words
- Need to use lemmatizer from nltk
___

In [190]:
## Create an instance of lemmatizer
lemmatizer = WordNetLemmatizer()

## Step 2b: lemmatize every word with the lemmatizer's default settings.
## Mapping over the Word column calls lemmatize once per word without
## materialising each row as a Series, as the row-wise DataFrame.apply did.
subtlex['lemma'] = subtlex['Word'].map(lemmatizer.lemmatize)

___
- Step 3
- Filtering by syllable count
- Keep rows with syllables <= 2
- Also, drop rows with unfound syllable count
___

In [191]:
# Number of words with no syllable count (marked -1 by syllable_count).
# Counted before filtering; the result is the same as counting after the
# "<= 2" cut, because -1 passes that cut.
unfound_count = int((subtlex.syllables == -1).sum())

# Keep words with a positive syllable count of at most 2; this drops the
# unfound (-1) rows and any zero-count rows in the same pass.
subtlex = subtlex.loc[
    (subtlex.syllables > 0) & (subtlex.syllables <= 2)
].reset_index(drop=True)

___
- Steps 1, 2, 3 done
- Output the modified csv to run through SOS
___

In [193]:
# Write the filtered table for SOS, then read it back to inspect the column
# names that actually ended up in the file.
sos_input_path = output_directory + 'sos_input.csv'
subtlex.to_csv(sos_input_path)
kikos = pd.read_csv(sos_input_path)
# NOTE(review): to_csv also writes the row index, which comes back as the
# 'Unnamed: 0' column; pass index=False above if SOS should not see it -- confirm.
kikos.columns.to_list()

['Unnamed: 0',
 'Word',
 'FREQcount',
 'CDcount',
 'FREQlow',
 'Cdlow',
 'SUBTLWF',
 'Lg10WF',
 'SUBTLCD',
 'Lg10CD',
 'Dom_PoS_SUBTLEX',
 'Freq_dom_PoS_SUBTLEX',
 'Percentage_dom_PoS',
 'All_PoS_SUBTLEX',
 'All_freqs_SUBTLEX',
 'Zipfvalue',
 'syllables',
 'lemma']