In [134]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer 

___
- Download wordnet for lemmatization
- Download cmudict for syllable count
___

In [135]:
# NOTE(review): these wildcard imports pull every public name of the two nltk
# corpus-reader modules into the notebook namespace; no code visible in this
# notebook appears to rely on them — confirm before removing.
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
# Fetch the nltk data packages used below: 'wordnet' for lemmatization and
# 'cmudict' for syllable counts.
nltk.download('wordnet')
nltk.download('cmudict')

[nltk_data] Downloading package wordnet to /Users/farhan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package cmudict to /Users/farhan/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

___
- Variable information
- Path of input csv files
- Path of output csv file
___

In [136]:
# Folder that holds both the input CSV files and the generated output file.
# NOTE(review): this is an absolute, machine-specific path — change it here (once)
# when running the notebook on another machine.
project_directory = '/Users/farhan/Desktop/Baycrest Documents/Aphasia_Study/Aphasia_STM_stim_generation/Syllable_Project/'

# Input CSV files.
subtlex_path = project_directory + 'SUBTLEX-US_frequency_list.csv'
goodwords_path = project_directory + 'goodwords_cmusyllables.csv'

# Output folder. The trailing slash is kept because the output file name is
# appended to this string directly.
output_directory = project_directory

___
- Read in the required csv files
___

In [137]:
# Load the SUBTLEX-US frequency list into a DataFrame.
# NOTE(review): with read_csv's default NA handling, cells whose text is e.g.
# "null" or "nan" are parsed as missing values, which would also apply to
# entries of the Word column — confirm this is acceptable for the word list.
subtlex = pd.read_csv(subtlex_path)

___
- Function to get syllable count of word from cmudict
- returns -1 if word not in cmudict
___

In [138]:
from nltk.corpus import cmudict
# Load the CMU pronouncing dictionary once as a plain dict so that
# syllable_count can do direct key lookups. As used below, each key maps to a
# sequence of pronunciations, each pronunciation being a sequence of phoneme strings.
cdict = cmudict.dict()

def syllable_count(word, pronunciations=None):
    """Return the syllable count of ``word`` based on its first listed pronunciation.

    A syllable is counted for every phoneme whose last character is a digit
    (in CMUdict-style transcriptions the vowel phonemes carry a stress digit).

    Args:
        word: The word to look up. The lookup is case-insensitive (the word is
            lower-cased first).
        pronunciations: Optional mapping from lower-case word to a sequence of
            pronunciations, each a sequence of phoneme strings. Defaults to the
            module-level ``cdict``.

    Returns:
        The number of syllables as an int, or -1 when the count cannot be
        determined: the word is not a string (e.g. a NaN from a missing CSV
        cell), it is not in the dictionary, or it has no pronunciation listed.
    """
    if pronunciations is None:
        pronunciations = cdict

    # Missing cells arrive from pandas as float NaN; treat them as "not found"
    # instead of failing on ``.lower()``.
    if not isinstance(word, str):
        return -1

    try:
        # Only the first pronunciation is used, so only that one is counted.
        first_pronunciation = pronunciations[word.lower()][0]
    except (KeyError, IndexError):
        return -1

    # ``[-1:]`` keeps an empty phoneme string from raising IndexError.
    return sum(1 for phoneme in first_pronunciation if phoneme[-1:].isdigit())

___
- Step 1
- Filtering by frequency
- Keep rows with 4 < Zipfvalue < 5
___

In [139]:
# Step 1: keep only the rows whose Zipf value lies strictly between 4 and 5,
# then renumber the index from 0.
zipf = subtlex.Zipfvalue
in_zipf_band = (zipf > 4) & (zipf < 5)
subtlex = subtlex.loc[in_zipf_band].reset_index(drop=True)

___
- Step 2a
- Add column with syllable count
- call syllable_count function on each word
- apply the function to every value of the `Word` column (this is an element-wise Python loop, not true vectorization)
___

In [140]:
# Step 2a: look up the syllable count of every word (-1 when the word is not in
# cmudict). Series.map calls syllable_count once per value of the Word column,
# which avoids building a row Series for every record as
# DataFrame.apply(axis=1) does.
subtlex["syllables"] = subtlex["Word"].map(syllable_count)

___
- Step 2b
- Replace each word in the `Word` column with its lemma
- Need to use lemmatizer from nltk
___

In [141]:
## Create an instance of lemmatizer
lemmatizer = WordNetLemmatizer()

## Replace every word in the Word column with its lemma.
## Series.map calls lemmatizer.lemmatize once per word, avoiding the per-row
## Series construction of DataFrame.apply(axis=1).
# NOTE(review): the 'syllables' column was computed from the original word forms
# before this step, so a count may not match the lemma now stored in 'Word'
# (and different words can collapse to the same lemma) — confirm this is intended.
subtlex['Word'] = subtlex['Word'].map(lemmatizer.lemmatize)

___
- Filtering by syllable count
- Keep rows with syllables <= 2
- Also, drop rows with unfound syllable count
___

In [142]:
# Number of words whose syllable count could not be found (marked with -1).
unfound_count = int((subtlex.syllables == -1).sum())

# Keep only words with a known, positive syllable count of at most 2, then
# renumber the index from 0.
has_known_count = subtlex.syllables > 0
at_most_two = subtlex.syllables <= 2
subtlex = subtlex.loc[has_known_count & at_most_two].reset_index(drop=True)

___
Rename the dataframe columns to the format SOS accepts by appending a `|s` or `|f` suffix to each column name
___

In [143]:
# Old column name -> SOS column name (original name plus a '|s' or '|f' suffix).
sos_column_names = {
    'Word': 'Word|s',
    'FREQcount': 'FREQcount|f',
    'CDcount': 'CDcount|f',
    'FREQlow': 'FREQlow|f',
    'Cdlow': 'Cdlow|f',
    'SUBTLWF': 'SUBTLWF|f',
    'Lg10WF': 'Lg10WF|f',
    'SUBTLCD': 'SUBTLCD|f',
    'Lg10CD': 'Lg10CD|f',
    'Dom_PoS_SUBTLEX': 'Dom_PoS_SUBTLEX|s',
    'Freq_dom_PoS_SUBTLEX': 'Freq_dom_PoS_SUBTLEX|f',
    'Percentage_dom_PoS': 'Percentage_dom_PoS|f',
    'All_PoS_SUBTLEX': 'All_PoS_SUBTLEX|s',
    'All_freqs_SUBTLEX': 'All_freqs_SUBTLEX|f',
    'Zipfvalue': 'Zipfvalue|f',
    'syllables': 'syllables|f',
}

# errors="raise" makes the rename fail loudly if any expected column is missing.
subtlex = subtlex.rename(columns=sos_column_names, errors="raise")

___
- Steps 1, 2, 3 done
- Write the modified table as a tab-separated text file (`sos_input.txt`) to run through SOS
___

In [144]:
# Write the table as a tab-separated text file, without the index column.
sos_input_path = output_directory + 'sos_input.txt'
subtlex.to_csv(sos_input_path, sep='\t', index=False)