In [1]:
import numpy as np
import pandas as pd
from nltk import pos_tag
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

try:
    # sklearn.cross_validation was deprecated in 0.18 and removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Phoneme symbols used to name the feature columns built further down.
# NOTE(review): these look like ARPAbet symbols - confirm against the data source.
vowel_list = 'AA AE AH AO AW AY EH ER EY IH IY OW OY UH UW'.split()
consonant_list = (
    'P B CH D DH F G HH JH K L M N NG R S '
    'SH T TH V W Y Z ZH'
).split()

# Vowels first, then consonants.
syllable_list = list(vowel_list)
syllable_list.extend(consonant_list)


# vowel_char_combinations = ['A', 'AA', 'AE', 'AEA', 'AEE', 'AEO', 'AEU', 'AI', 'AIA', 'AIE', 'AII', 'AIO', 
#                            'AIU', 'AO', 'AOA', 'AOI', 'AOU', 'AOUE', 'AU', 'AUA', 'AUE', 'AUI', 'E', 'EA', 
#                            'EAU', 'EAUI', 'EE', 'EEA', 'EEI', 'EEU', 'EI', 'EIA', 'EIE', 'EO', 'EOA', 'EOI', 
#                            'EOU', 'EU', 'EUA', 'EUE', 'EUI', 'I', 'IA', 'IAA', 'IAE', 'IAI', 'IAO', 'IAU', 
#                            'IE', 'IEA', 'IEI', 'IEU', 'II', 'IIO', 'IO', 'IOA', 'IOI', 'IOU', 'IU', 'O', 
#                            'OA', 'OAI', 'OE', 'OEA', 'OEI', 'OEU', 'OI', 'OIA', 'OIE', 'OO', 'OOE', 'OOI',
#                            'OU', 'OUA', 'OUE', 'OUEI', 'OUI', 'U', 'UA', 'UAA', 'UAI', 'UAU', 'UE', 'UEA',
#                            'UEE', 'UEI', 'UEOU', 'UEU', 'UEUI', 'UI', 'UIA', 'UIE', 'UO', 'UOI', 'UOIA', 
#                            'UOU', 'UU']

# consonant_char_list = ['B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V',
#                        'W', 'X', 'Y', 'Z']

def is_vowel(v):
    """Return True when ``v`` contains at least one of the digits 0, 1 or 2.

    The rest of the file uses this to tell vowel phoneme tokens (which carry
    a trailing digit, e.g. ``'AH0'``) apart from consonant tokens.
    """
    return any(digit in v for digit in ('0', '1', '2'))

def has_v_as_ith_vowel_syllable(p, v, i):
    """Tell whether the i-th vowel token of pronunciation ``p`` is ``v``.

    Args:
        p: space-separated phoneme tokens.
        v: vowel symbol without its trailing digit.
        i: 1-based position among the vowel tokens of ``p``.

    Returns:
        False when ``p`` has fewer than ``i`` vowel tokens, otherwise whether
        the i-th vowel token, with its last character removed, equals ``v``.
    """
    stripped_vowels = []
    for token in p.split(' '):
        if is_vowel(token):
            # Drop the final character (the digit) to get the bare symbol.
            stripped_vowels.append(token[:-1])

    if len(stripped_vowels) < i:
        return False
    return stripped_vowels[i - 1] == v

def has_c_as_ith_vowel_char_combination(w, c, i):
    """Tell whether the i-th run of consecutive vowel letters in ``w`` is ``c``.

    A run is a maximal sequence of adjacent characters drawn from the
    upper-case letters A, E, I, O, U (lower-case letters are not matched).

    Args:
        w: the word to scan.
        c: the expected vowel run, e.g. ``'EA'``.
        i: 1-based index of the run to compare.

    Returns:
        True if ``w`` has at least ``i`` vowel runs and the i-th one equals
        ``c``; False otherwise, including when ``i`` is less than 1.
    """
    vowels = {'A', 'E', 'I', 'O', 'U'}
    groups = []
    current = []
    for ch in w:
        if ch in vowels:
            current.append(ch)
        elif current:
            # A non-vowel closes the run collected so far.
            groups.append(''.join(current))
            current = []
    # Flush a run that reaches the end of the word. The previous version
    # stopped one character early and split e.g. 'TEA' into 'E' and 'A'.
    if current:
        groups.append(''.join(current))

    if i < 1 or len(groups) < i:
        return False
    return groups[i - 1] == c

def n_syllable_before_ith_vowel_syllable(p, c, i, n):
    """Tell whether the token ``n`` places before the i-th vowel token is ``c``.

    Args:
        p: space-separated phoneme tokens.
        c: symbol to compare against (vowels without their trailing digit).
        i: 1-based position among the vowel tokens of ``p``.
        n: how many tokens to step back from the i-th vowel token.

    Returns:
        False when ``p`` has fewer than ``i`` vowel tokens or fewer than ``n``
        tokens precede the i-th vowel; otherwise whether the token found
        there (digit stripped if it is a vowel) equals ``c``.
    """
    tokens = p.split(' ')
    vowels_seen = 0
    for position, token in enumerate(tokens):
        if not is_vowel(token):
            continue
        vowels_seen += 1
        if vowels_seen != i:
            continue

        # Not enough tokens to the left of the i-th vowel.
        if position < n:
            return False

        neighbour = tokens[position - n]
        symbol = neighbour[:-1] if is_vowel(neighbour) else neighbour
        return symbol == c
    return False

def n_syllable_after_ith_vowel_syllable(p, c, i, n):
    """Tell whether the token ``n`` places after the i-th vowel token is ``c``.

    Args:
        p: space-separated phoneme tokens.
        c: symbol to compare against (vowels without their trailing digit).
        i: 1-based position among the vowel tokens of ``p``.
        n: how many tokens to step forward from the i-th vowel token.

    Returns:
        False when ``p`` has fewer than ``i`` vowel tokens or the target
        position falls past the end; otherwise whether the token found there
        (digit stripped if it is a vowel) equals ``c``.
    """
    tokens = p.split(' ')
    vowels_seen = 0
    for position, token in enumerate(tokens):
        if not is_vowel(token):
            continue
        vowels_seen += 1
        if vowels_seen != i:
            continue

        target = position + n
        # Stepping forward would run off the end of the pronunciation.
        if target >= len(tokens):
            return False

        neighbour = tokens[target]
        symbol = neighbour[:-1] if is_vowel(neighbour) else neighbour
        return symbol == c

    return False

def has_c_as_consonant_char_before_ith_vowel_char(w, c, i):
    """Tell whether ``c`` is the character right before the i-th vowel run.

    A vowel run is a maximal sequence of adjacent characters drawn from the
    upper-case letters A, E, I, O, U (lower-case letters are not matched).

    Args:
        w: the word to scan.
        c: the character expected immediately before the run.
        i: 1-based index of the vowel run.

    Returns:
        True if ``w`` has an i-th vowel run, that run does not start the
        word, and the preceding character equals ``c``; False otherwise.
        The previous version returned the truthy integer -1 when the run
        started the word; it now returns False so the result is always a bool.
    """
    vowels = {'A', 'E', 'I', 'O', 'U'}
    runs_seen = 0
    previous_was_vowel = False
    for index, ch in enumerate(w):
        current_is_vowel = ch in vowels
        # A run starts at a vowel that does not follow another vowel.
        if current_is_vowel and not previous_was_vowel:
            runs_seen += 1
            if runs_seen == i:
                # index == 0 means nothing precedes the run.
                return index > 0 and w[index - 1] == c
        previous_was_vowel = current_is_vowel

    return False

def has_c_as_consonant_char_after_ith_vowel_char(w, c, i):
    """Tell whether ``c`` is the character right after the i-th vowel run.

    A vowel run is a maximal sequence of adjacent characters drawn from the
    upper-case letters A, E, I, O, U.

    Args:
        w: the word to scan.
        c: the character expected immediately after the run.
        i: 1-based index of the vowel run.

    Returns:
        True if the i-th vowel run exists, is followed by another character,
        and that character equals ``c``; False otherwise. Scanning stops
        with False as soon as any vowel run reaches the end of the word.
    """
    vowels = {'A', 'E', 'I', 'O', 'U'}
    size = len(w)
    runs_seen = 0
    pos = 0
    while pos < size:
        if w[pos] not in vowels:
            pos += 1
            continue

        runs_seen += 1
        # Skip to the first character past the current vowel run.
        while pos < size and w[pos] in vowels:
            pos += 1
        if pos == size:
            # The run ends the word, so nothing follows it.
            return False

        if runs_seen == i:
            return w[pos] == c
    return False

def find_stress(p):
    """Return the 1-based position of the first vowel token containing '1'.

    Only vowel tokens of the space-separated pronunciation ``p`` are counted.
    Returns None when no vowel token contains the character '1'.
    """
    vowel_tokens = [token for token in p.split(' ') if is_vowel(token)]
    for position, token in enumerate(vowel_tokens, 1):
        if '1' in token:
            return position
    return None

def nb_of_vowel_char_combination(w, i):
    """Tell whether ``w`` contains exactly ``i`` runs of vowel letters.

    A run is a maximal sequence of adjacent characters drawn from the
    upper-case letters A, E, I, O, U (lower-case letters are not matched).

    Args:
        w: the word to scan.
        i: the expected number of vowel runs.

    Returns:
        True if the number of vowel runs in ``w`` equals ``i``.
    """
    vowels = {'A', 'E', 'I', 'O', 'U'}
    run_count = 0
    inside_run = False
    for ch in w:
        if ch in vowels:
            # Count a run once, at its first letter. The previous version
            # cut a word-final run one letter short and counted that last
            # letter as an extra run (e.g. 'TEA' counted as 2).
            if not inside_run:
                run_count += 1
            inside_run = True
        else:
            inside_run = False
    return i == run_count





# --- Load the training data: one "WORD:PHONEMES" record per line ---------
words, pronounciations = [], []
# The context manager closes the file once reading is done; the previous
# version left the handle open.
with open('asset/training_data.txt') as file:
    for line in file:
        w, p = line.split(':')
        p = p.rstrip('\n')
        words.append(w)
        pronounciations.append(p)

df = pd.DataFrame({'w': words, 'p': pronounciations}, columns=['w', 'p'])

# --- Feature columns ------------------------------------------------------
# Loop values are handed to the helpers through ``args`` instead of being
# captured by a lambda, which keeps each call explicit.

# "<vowel><i>": the i-th vowel token of the pronunciation is <vowel>.
for v in vowel_list:
    for i in range(1, 5):
        df[v+str(i)] = df['p'].apply(has_v_as_ith_vowel_syllable, args=(v, i))

# "<phoneme><j>before<i>vowel syllable" / "<phoneme><j>after<i>vowel syllable":
# the token j places before/after the i-th vowel token is <phoneme>.
# All 'before' columns are created first, then all 'after' columns, so the
# column order matches the original two separate loops.
for direction, feature_fn in (('before', n_syllable_before_ith_vowel_syllable),
                              ('after', n_syllable_after_ith_vowel_syllable)):
    for s in syllable_list:
        for i in range(1, 5):
            for j in range(1, 3):
                name = str(s) + str(j) + direction + str(i) + 'vowel syllable'
                df[name] = df['p'].apply(feature_fn, args=(s, i, j))

# for c in vowel_char_combinations:
#     for i in range(1, 5):
#         df[c+str(i)] = df['w'].apply(lambda w: has_c_as_ith_vowel_char_combination(w, c, i))

# for c in consonant_char_list:
#     for i in range(1, 5):
#         df[c+'before_'+str(i)+'_vowel_char'] = df['w'].apply(lambda w: has_c_as_consonant_char_before_ith_vowel_char(w, c, i))

# for c in consonant_char_list:
#     for i in range(1, 5):
#         df[c+'after_'+str(i)+'_vowel_char'] = df['w'].apply(lambda w: has_c_as_consonant_char_after_ith_vowel_char(w, c, i))


# Count the vowel tokens once and derive the three indicator columns from it.
vowel_counts = df['p'].apply(lambda p: len([t for t in p.split(' ') if is_vowel(t)]))
df['2 vowel syllables'] = vowel_counts == 2
df['3 vowel syllables'] = vowel_counts == 3
df['4 vowel syllables'] = vowel_counts == 4

# Label: 1-based position of the first vowel token containing '1'.
df['stress'] = df['p'].apply(find_stress)



In [11]:
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# import cell at the top. The duplicated LogisticRegression import is removed.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# Every column except the raw word, its pronunciation and the label is a feature.
features = list(df.columns)
for non_feature in ('w', 'p', 'stress'):
    features.remove(non_feature)

# Fixed seed so the 80/20 split, and therefore the scores printed below,
# can be reproduced on a re-run.
train, test = train_test_split(df, test_size=0.2, random_state=42)

X_train = train[features]
Y_train = train.stress

X_test = test[features]
Y_test = test.stress

clf = LogisticRegression()
clf.fit(X_train, Y_train)
prediction = clf.predict(X_test)

print(f1_score(Y_test, prediction, average='macro'))
print(classification_report(Y_test, prediction))

0.756217748207
             precision    recall  f1-score   support

          1       0.92      0.96      0.94      6965
          2       0.87      0.82      0.85      2462
          3       0.89      0.72      0.79       568
          4       0.50      0.40      0.44         5

avg / total       0.91      0.91      0.91     10000



In [12]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top.
import snowballstemmer
# Stemmer instance configured for English.
stemmer = snowballstemmer.stemmer("english")

In [28]:
# Spot-check the stemmer on a single inflected word.
stemmer.stemWord("oversupplied")

'oversuppli'

In [22]:
# Lower-case every word; this overwrites the 'w' column in place.
df['w'] = df['w'].map(str.lower)

In [26]:
# Show the word and pronunciation of every row whose stress label is 4.
df.loc[df['stress'] == 4, ['w', 'p']]

Unnamed: 0,w,p
327,laviolette,L AE2 V IY0 OW0 L EH1 T
1105,naivete,N AA0 IY2 V AH0 T EY1
1725,arbitrageurs,AA2 R B AH0 T R AA2 ZH ER1 Z
4253,natividad,N AH2 T IH0 V IH0 D AA1 D
5016,santistevan,S AA2 N T IY0 S T EY0 V AA1 N
5399,inopportune,IH2 N AA2 P ER0 T UW1 N
8411,bellefeuille,B EH2 L AH0 F IY0 UW1 L
8720,nitrosomines,N IH0 T R AA2 S AH0 M IY1 N Z
8918,misrepresents,M IH0 S R EH2 P R AH0 Z EH1 N T S
9162,azerbaijan,AA2 Z ER0 B AY0 JH AA1 N


In [30]:
# Number of feature columns: everything except 'w', 'p' and 'stress'.
df.shape[1] - 3

687

In [36]:
# Expected feature count, to compare with the number of columns actually built:
#   15 vowels x 4 vowel positions                                      = 60
#   39 phonemes x 16 (4 vowel positions x 2 offsets x before/after)    = 624
#   3 vowel-count indicator columns (2, 3 and 4 vowel syllables)       = 3
15 * 4 + (39 * 16) + 3

687