In [23]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer 

___
- Download wordnet for lemmatization
- Download cmudict for syllable count
___

In [24]:
import nltk
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *

import nltk
import ssl

## Work around certificate errors during nltk.download by making the default
## HTTPS context an unverified one, when this Python build exposes it.
## NOTE(review): this switches off TLS certificate verification for every
## HTTPS request made through ssl's default context in this kernel, not only
## for the NLTK downloads below -- confirm that is acceptable.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

## Corpora needed later: wordnet + omw-1.4 for lemmatization, cmudict for syllable counts
nltk.download('wordnet')
nltk.download('cmudict')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /Users/farhan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package cmudict to /Users/farhan/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/farhan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

___
- Variable information
- Path of input csv files
- Path of the output directory (the SOS input file is written there)
___

In [25]:
## Project folder: holds both input files and receives the generated SOS input file
output_directory = '/Users/farhan/Desktop/Baycrest Documents/Aphasia_Study/Aphasia_STM_stim_generation/Syllable_Project/'

## Both inputs sit directly inside the project folder
subtlex_path = output_directory + 'SUBTLEX-US_frequency_list.csv'
profane_path = output_directory + 'profaneWords.txt'

___
- Read in the required csv files
    - Read in the list of words from SUBTLEX
    - Read in the list of profane words to be used for cross checking
___

In [26]:
## Dataframe constructed from SUBTLEX
subtlex = pd.read_csv(subtlex_path)

## List of profane words (one entry per line of the text file).
## The context manager closes the file as soon as its lines have been read,
## instead of leaving the handle open for the rest of the session.
with open(profane_path, 'r') as profane:
    profane_words_temp = profane.readlines()

## Strip the newline and any other surrounding whitespace so that every entry
## can compare equal to a SUBTLEX word in the later isin() check; blank lines
## carry no word and are skipped.
profane_words = [words.strip() for words in profane_words_temp if words.strip()]

___
- Function to get syllable count of word from cmudict
- returns -1 if word not in cmudict
___

In [27]:
from nltk.corpus import cmudict
# Build the cmudict lookup once; syllable_count indexes it by lower-cased word
# and reads the pronunciations stored under that key.
cdict = cmudict.dict()

def syllable_count(word):
    """Return the number of syllables of ``word`` according to cmudict.

    The word is lower-cased and looked up in ``cdict``. Only the first
    pronunciation listed for the word is used; its syllable count is the
    number of phonemes whose last character is a digit (in cmudict those are
    the stress-marked vowel phonemes).

    Parameters
    ----------
    word : str
        Word to look up. Any non-string value (for example the NaN that
        pandas uses for a missing cell) is treated as "not found".

    Returns
    -------
    int
        Syllable count, or -1 if the word is not a string or is not in cmudict.
    """
    # A float NaN has no .lower(); report it as "not found" instead of raising.
    if not isinstance(word, str):
        return -1
    try:
        first_pronunciation = cdict[word.lower()][0]
    except KeyError:
        return -1
    # Count only the first pronunciation rather than computing every variant
    # and discarding all but the first.
    return sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())

___
- Step 1
- Filtering by frequency
- Keep rows with 4 < Zipfvalue < 5
___

In [28]:
## Keep only the rows whose Zipf value lies strictly between 4 and 5
above_lower_bound = subtlex.Zipfvalue > 4
below_upper_bound = subtlex.Zipfvalue < 5
subtlex = subtlex.loc[above_lower_bound & below_upper_bound].reset_index(drop=True)

___
- Step 2a
- Add column with syllable count
- call syllable_count function on each word
- the function is applied to each word in turn (a Python-level loop over the column, not true vectorization)
___

In [29]:
## Look up the syllable count of every word (-1 marks words missing from cmudict).
## Series.map calls syllable_count once per entry of the Word column, which
## avoids building a full row object for each call as DataFrame.apply(axis=1)
## does, and it still returns a Series when the frame has no rows.
subtlex["syllables"] = subtlex["Word"].map(syllable_count)

___
- Step 2b
- Replace each word in the `Word` column with its lemma (the original word form is overwritten)
- Need to use lemmatizer from nltk
___

In [30]:
## Create an instance of lemmatizer
lemmatizer = WordNetLemmatizer()

## Overwrite every entry of the Word column with its lemma. lemmatize is called
## with its default part-of-speech argument, exactly as before; Series.map
## applies it element-wise without the per-row overhead of DataFrame.apply(axis=1).
## NOTE(review): the syllables column was computed from the original word forms
## before this step, so it may no longer match the lemma, and several rows can
## end up with the same lemma -- confirm both are acceptable for the stimuli.
subtlex['Word'] = subtlex['Word'].map(lemmatizer.lemmatize)
subtlex

Unnamed: 0,Word,FREQcount,CDcount,FREQlow,Cdlow,SUBTLWF,Lg10WF,SUBTLCD,Lg10CD,Dom_PoS_SUBTLEX,Freq_dom_PoS_SUBTLEX,Percentage_dom_PoS,All_PoS_SUBTLEX,All_freqs_SUBTLEX,Zipfvalue,syllables
0,aah,2688,634,52,37,52.71,3.4296,7.56,2.8028,Interjection,2657.0,1.00,Interjection,2657,4.721425,-1
1,aaron,747,162,0,0,14.65,2.8739,1.93,2.2122,Name,744.0,1.00,Name,744,4.165736,2
2,abandoned,678,538,650,522,13.29,2.8319,6.41,2.7316,Verb,480.0,0.71,Verb.Adjective,480.200,4.123704,3
3,abby,637,102,0,0,12.49,2.8048,1.22,2.0128,Name,632.0,1.00,Name,632,4.096655,2
4,ability,980,679,974,673,19.22,2.9917,8.09,2.8325,Noun,975.0,1.00,Noun,975,4.283503,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3618,z,598,227,136,71,11.73,2.7774,2.71,2.3579,Letter,369.0,0.90,Letter.Name,369.43,4.069261,1
3619,zack,1056,100,0,0,20.71,3.0241,1.19,2.0043,Name,515.0,0.49,Name.Verb.Noun,515.416.116,4.315909,1
3620,zero,1094,651,694,519,21.45,3.0394,7.76,2.8142,Number,570.0,0.52,Number.Noun.Verb.Name,570.431.83.9,4.331248,2
3621,zone,1026,622,856,543,20.12,3.0116,7.42,2.7945,Noun,1023.0,1.00,Noun,1023,4.303405,1


___
- Filtering by syllable count
- Keep rows with syllables <= 2
- Also, drop rows with unfound syllable count
___

In [31]:
## Number of words that were not found in cmudict (syllable_count returned -1)
unfound_count = int((subtlex.syllables == -1).sum())

## Keep words with one or two syllables; the lower bound also removes the
## -1 "not found" rows counted above
has_known_count = subtlex.syllables > 0
at_most_two = subtlex.syllables <= 2
subtlex = subtlex.loc[has_known_count & at_most_two].reset_index(drop=True)
# subtlex

___
- Filter out profane words
- Filter out Names
- Filter out single-letter words (rows whose `All_PoS_SUBTLEX` contains "Letter")
___

In [32]:
## Remove rows whose word appears in the profanity list
is_profane = subtlex.Word.isin(profane_words)
subtlex = subtlex.loc[~is_profane].reset_index(drop = True)

## Remove rows whose dominant part of speech is "Name"
is_not_name = subtlex.Dom_PoS_SUBTLEX.ne('Name')
subtlex = subtlex.loc[is_not_name].reset_index(drop = True)

## Remove rows whose list of parts of speech mentions "Letter"
## NOTE(review): only rows where the check is exactly False are kept, so a row
## with a missing All_PoS_SUBTLEX value would presumably be dropped too -- confirm.
mentions_letter = subtlex.All_PoS_SUBTLEX.str.contains("Letter")
subtlex = subtlex.loc[mentions_letter.eq(False)].reset_index(drop = True)
# subtlex

___
Convert dataframe to SOS acceptable format
___

In [33]:
## Append the SOS type suffix to every column header: "|s" for the columns
## declared as strings, "|f" for the columns declared as numeric.
## NOTE(review): All_freqs_SUBTLEX is tagged "|f" although the displayed values
## look like dot-separated lists (e.g. 480.200) -- confirm SOS reads it as intended.
sos_string_columns = ['Word', 'Dom_PoS_SUBTLEX', 'All_PoS_SUBTLEX']
sos_numeric_columns = [
    'FREQcount', 'CDcount', 'FREQlow', 'Cdlow',
    'SUBTLWF', 'Lg10WF', 'SUBTLCD', 'Lg10CD',
    'Freq_dom_PoS_SUBTLEX', 'Percentage_dom_PoS',
    'All_freqs_SUBTLEX', 'Zipfvalue', 'syllables',
]

sos_headers = {}
for column in sos_string_columns:
    sos_headers[column] = column + '|s'
for column in sos_numeric_columns:
    sos_headers[column] = column + '|f'

## errors="raise" makes the rename fail loudly if any expected column is missing
subtlex = subtlex.rename(columns=sos_headers, errors="raise")

___
- Steps 1, 2, 3 done
- Output the modified table as a tab-separated text file to run through SOS
___

In [34]:
## Write the filtered table as a tab-separated file, without the index column
sos_input_path = f"{output_directory}sos_input.txt"
subtlex.to_csv(sos_input_path, sep='\t', index=False)
# df.rename(columns=lambda x: x.strip())