In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from nltk import pos_tag

vowel_list = ['AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'EH', 'ER', 'EY', 'IH', 'IY', 'OW', 'OY', 'UH', 'UW']
consonant_list = ['P', 'B', 'CH', 'D', 'DH', 'F', 'G', 'HH', 'JH', 'K', 'L', 'M', 'N', 'NG', 'R', 'S',
                  'SH', 'T', 'TH', 'V', 'W', 'Y', 'Z', 'ZH']

syllable_list = vowel_list + consonant_list


# vowel_char_combinations = ['A', 'AA', 'AE', 'AEA', 'AEE', 'AEO', 'AEU', 'AI', 'AIA', 'AIE', 'AII', 'AIO', 
#                            'AIU', 'AO', 'AOA', 'AOI', 'AOU', 'AOUE', 'AU', 'AUA', 'AUE', 'AUI', 'E', 'EA', 
#                            'EAU', 'EAUI', 'EE', 'EEA', 'EEI', 'EEU', 'EI', 'EIA', 'EIE', 'EO', 'EOA', 'EOI', 
#                            'EOU', 'EU', 'EUA', 'EUE', 'EUI', 'I', 'IA', 'IAA', 'IAE', 'IAI', 'IAO', 'IAU', 
#                            'IE', 'IEA', 'IEI', 'IEU', 'II', 'IIO', 'IO', 'IOA', 'IOI', 'IOU', 'IU', 'O', 
#                            'OA', 'OAI', 'OE', 'OEA', 'OEI', 'OEU', 'OI', 'OIA', 'OIE', 'OO', 'OOE', 'OOI',
#                            'OU', 'OUA', 'OUE', 'OUEI', 'OUI', 'U', 'UA', 'UAA', 'UAI', 'UAU', 'UE', 'UEA',
#                            'UEE', 'UEI', 'UEOU', 'UEU', 'UEUI', 'UI', 'UIA', 'UIE', 'UO', 'UOI', 'UOIA', 
#                            'UOU', 'UU']

# consonant_char_list = ['B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V'
#                        'W', 'X', 'Y', 'Z']

def is_vowel(v):
    return '0' in v or '1' in v or '2' in v

def has_v_as_ith_vowel_syllable(p, v, i):
    vowels_in_p = [j[:-1] for j in p.split(' ') if is_vowel(j)]
    if i > len(vowels_in_p):
        return False
    return v == vowels_in_p[i-1]

def has_c_as_ith_vowel_char_combination(w, c, i):
    vowels = {'A', 'E', 'I', 'O', 'U'}
    res = []
    j = 0
    while j < len(w):
        l = []
        if w[j] in vowels:
            while w[j] in vowels:
                l.append(w[j])
                j += 1
                if j >= len(w) - 1:
                    break
            res.append(''.join(l))
        else:
            j += 1
    
    if len(res) < i:
        return False
    return res[i-1] == c

def n_syllable_before_ith_vowel_syllable(p, c, i, n):
    l = p.split(' ')
    count = 0
    for j in range(len(l)):
        if is_vowel(l[j]):
            count += 1
            if count == i:
                if j <= n-1:
                    return False
                else:
                    ans = l[j-n]
                    if is_vowel(ans):
                        return ans[:-1] == c
                    else:
                        return ans == c
    return False

def n_syllable_after_ith_vowel_syllable(p, c, i, n):
    l = p.split(' ')
    count = 0
    for j in range(len(l)):
        if is_vowel(l[j]):
            count += 1
            if count == i:
                if j >= len(l) - n:
                    return False
                else:
                    ans = l[j+n]
                    if is_vowel(ans):
                        return ans[:-1] == c
                    else:
                        return ans == c
                
    return False

def has_c_as_consonant_char_before_ith_vowel_char(w, c, i):
    vowels = {'A', 'E', 'I', 'O', 'U'}
    count = 0
    j = 0
    while j < len(w):
        if w[j] in vowels:
            count += 1
            
            if count == i:
                if j == 0:
                    return -1
                else:
                    return w[j-1] == c
            
            
            while w[j] in vowels:
                j += 1
                if j > len(w) - 1:
                    return False
        else:
            j += 1
            
    return False

def has_c_as_consonant_char_after_ith_vowel_char(w, c, i):
    vowels = {'A', 'E', 'I', 'O', 'U'}
    count = 0
    j = 0
    while j < len(w):
        if w[j] in vowels:
            count += 1
            while w[j] in vowels:
                j += 1
                if j > len(w) - 1:
                    return False
            
                    
            if count == i:
                if j > len(w) - 1:
                    return False
                else:
                    return w[j] == c
        else:
            j += 1
    return False

def find_stress(p):
    l = [syllable for syllable in p.split(' ') if is_vowel(syllable)]
    for i in range(len(l)):
        if '1' in l[i]:
            return i + 1

def nb_of_vowel_char_combination(w, i):
    vowels = {'A', 'E', 'I', 'O', 'U'}
    res = []
    j = 0
    while j < len(w):
        l = []
        if w[j] in vowels:
            while w[j] in vowels:
                l.append(w[j])
                j += 1
                if j >= len(w) - 1:
                    break
            res.append(''.join(l))
        else:
            j += 1
    return i == len(res)





words, pronounciations = [], []
df = pd.DataFrame()
file = open('asset/training_data.txt')
for line in file:
    w, p = line.split(':')
    p = p.rstrip('\n')
    words.append(w)
    pronounciations.append(p)

df['w'] = words
df['p'] = pronounciations

for v in vowel_list:
    for i in range(1, 5):
        df[v+str(i)] = df['p'].apply(lambda p: has_v_as_ith_vowel_syllable(p, v, i))
        
for s in syllable_list:
    for i in range(1, 5):
        for j in range(1, 3):
            name = str(s) + str(j) + 'before' + str(i) + 'vowel syllable'
            df[name] = df['p'].apply(lambda p: n_syllable_before_ith_vowel_syllable(p, s, i, j))

for s in syllable_list:
    for i in range(1, 5):
        for j in range(1, 3):
            name = str(s) + str(j) + 'after' + str(i) + 'vowel syllable'
            df[name] = df['p'].apply(lambda p: n_syllable_after_ith_vowel_syllable(p, s, i, j))

# for c in vowel_char_combinations:
#     for i in range(1, 5):
#         df[c+str(i)] = df['w'].apply(lambda w: has_c_as_ith_vowel_char_combination(w, c, i))        

# for c in consonant_char_list:
#     for i in range(1, 5):
#         df[c+'before_'+str(i)+'_vowel_char'] = df['w'].apply(lambda w: has_c_as_consonant_char_before_ith_vowel_char(w, c, i))

# for c in consonant_char_list:
#     for i in range(1, 5):
#         df[c+'after_'+str(i)+'_vowel_char'] = df['w'].apply(lambda w: has_c_as_consonant_char_after_ith_vowel_char(w, c, i))
        

df['2 vowel syllables'] = df['p'].apply(lambda p: len([i for i in p.split(' ') if is_vowel(i)]) == 2)
df['3 vowel syllables'] = df['p'].apply(lambda p: len([i for i in p.split(' ') if is_vowel(i)]) == 3)
df['4 vowel syllables'] = df['p'].apply(lambda p: len([i for i in p.split(' ') if is_vowel(i)]) == 4)

df['stress'] = df['p'].apply(find_stress)



In [1411]:
df13 = df[df['stress'] != 4]
df4 = df[df['4 vowel syllables'] == True]

In [57]:
df.iloc[3]

w                                MUSCLING
p                    M AH1 S AH0 L IH0 NG
AA1                                 False
AA2                                 False
AA3                                 False
AA4                                 False
AE1                                 False
AE2                                 False
AE3                                 False
AE4                                 False
AH1                                  True
AH2                                  True
AH3                                 False
AH4                                 False
AO1                                 False
AO2                                 False
AO3                                 False
AO4                                 False
AW1                                 False
AW2                                 False
AW3                                 False
AW4                                 False
AY1                                 False
AY2                               

In [20]:
from sklearn.linear_model import LogisticRegression
features = list(df.columns)
features.remove('w')
features.remove('p')
features.remove('stress')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
train, test = train_test_split(df, test_size = 0.2)

X_train = train[features]
Y_train = train.stress

X_test = test[features]
Y_test = test.stress

clf = LogisticRegression()
clf.fit(X_train, Y_train)
prediction = clf.predict(X_test)

print(f1_score(Y_test, prediction, average='macro'))
print(classification_report(Y_test, prediction))

0.625483080254
             precision    recall  f1-score   support

          1       0.89      0.93      0.91      7005
          2       0.78      0.72      0.75      2396
          3       0.74      0.60      0.66       589
          4       1.00      0.10      0.18        10

avg / total       0.85      0.86      0.86     10000



In [56]:
from sklearn.linear_model import LogisticRegressionCV
features = list(df.columns)
features.remove('w')
features.remove('p')
features.remove('stress')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
train, test = train_test_split(df, test_size = 0.2)

X_train = train[features]
Y_train = train.stress

X_test = test[features]
Y_test = test.stress

clf = DecisionTreeClassifier()
clf.fit(X_train, Y_train)
prediction = clf.predict(X_test)

print(f1_score(Y_test, prediction, average='macro'))
print(classification_report(Y_test, prediction))

0.68837853504
             precision    recall  f1-score   support

          1       0.89      0.94      0.92      6903
          2       0.83      0.72      0.77      2514
          3       0.79      0.71      0.75       573
          4       0.33      0.30      0.32        10

avg / total       0.87      0.87      0.87     10000



In [1440]:
features = list(df.columns)
features.remove('w')
features.remove('p')
features.remove('stress')
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report
train, test = train_test_split(df, test_size = 0.2)

X_train = train[features]
Y_train = train.stress

X_test = test[features]
Y_test = test.stress

clf = GaussianNB()
clf.fit(X_train, Y_train)
prediction = clf.predict(X_test)

print(f1_score(Y_test, prediction, average='macro'))
print(classification_report(Y_test, prediction))

0.474188624245
             precision    recall  f1-score   support

          1       0.94      0.74      0.83      6924
          2       0.70      0.55      0.61      2481
          3       0.22      0.93      0.36       590
          4       0.05      0.80      0.10         5

avg / total       0.84      0.71      0.75     10000



In [1445]:
features = list(df.columns)
features.remove('w')
features.remove('p')
features.remove('stress')
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
train, test = train_test_split(df, test_size = 0.2)

X_train = train[features]
Y_train = train.stress

X_test = test[features]
Y_test = test.stress

clf = MultinomialNB()
clf.fit(X_train, Y_train)
prediction = clf.predict(X_test)

print(f1_score(Y_test, prediction, average='macro'))
print(classification_report(Y_test, prediction))

0.65926002234
             precision    recall  f1-score   support

          1       0.92      0.91      0.91      6949
          2       0.83      0.78      0.81      2436
          3       0.61      0.83      0.70       605
          4       0.13      0.70      0.22        10

avg / total       0.88      0.87      0.87     10000



In [1451]:
features = list(df.columns)
features.remove('w')
features.remove('p')
features.remove('stress')
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report
train, test = train_test_split(df, test_size = 0.2)

X_train = train[features]
Y_train = train.stress

X_test = test[features]
Y_test = test.stress

clf = BernoulliNB()
clf.fit(X_train, Y_train)
prediction = clf.predict(X_test)

print(f1_score(Y_test, prediction, average='macro'))
print(classification_report(Y_test, prediction))

0.674594775475
             precision    recall  f1-score   support

          1       0.91      0.90      0.91      6939
          2       0.82      0.75      0.79      2468
          3       0.59      0.85      0.70       583
          4       0.21      0.60      0.31        10

avg / total       0.87      0.86      0.86     10000



In [1422]:
df4 = df[df['4 vowel syllables'] == True]
df4_1 = df4[df4['stress'] == 1]
df4_2 = df4[df4['stress'] == 2]
df4_3 = df4[df4['stress'] == 3]
df4_4 = df4[df4['stress'] == 4]
df4 = df4_1.append(df4_2).append(df4_3).append(df4_4)

from sklearn.linear_model import LogisticRegressionCV
features = list(df.columns)
features.remove('w')
features.remove('p')
features.remove('stress')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

res = []
train, test = train_test_split(df4, test_size = 0.2)

X_train = train[features]
Y_train = train.stress

X_test = test[features]
Y_test = test.stress

clf4 = LogisticRegression()
clf4.fit(X_train, Y_train)
prediction = clf4.predict(X_test)

print(f1_score(Y_test, prediction, average='macro'))
print(classification_report(Y_test, prediction))

0.763237386295
             precision    recall  f1-score   support

          1       0.86      0.81      0.83       298
          2       0.92      0.95      0.94       422
          3       0.89      0.91      0.90       464
          4       0.57      0.29      0.38        14

avg / total       0.89      0.89      0.89      1198



In [1405]:
alpha = 10
df4 = df[df['4 vowel syllables'] == True].copy()
df41 = df4[df4['stress'] == 1]
df42 = df4[df4['stress'] == 2]
df43 = df4[df4['stress'] == 3]
df44 = df4[df4['stress'] == 4]
df4not4 = df41.append(df42).append(df43)
# df4not4['stress'] = 0
df4 = df44.append(df4not4)

from sklearn.linear_model import LogisticRegressionCV
features = list(df.columns)
features.remove('w')
features.remove('p')
features.remove('stress')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

res = []
for i in range(20):
    train, test = train_test_split(df4, test_size = 0.2)

    X_train = train[features]
    Y_train = train.stress

    X_test = test[features]
    Y_test = test.stress

    clf4 = LogisticRegression()
    clf4.fit(X_train, Y_train)
    prediction = clf4.predict(X_test)

    res.append(f1_score(Y_test, prediction, average='macro'))
#     print(classification_report(Y_test, prediction))
np.mean(res), np.std(res)

  'precision', 'predicted', average, warn_for)


(0.73154351623287106, 0.032107516214232107)

In [1404]:
alpha = 10
df4 = df[df['4 vowel syllables'] == True].copy()
df41 = df4[df4['stress'] == 1]
df42 = df4[df4['stress'] == 2]
df43 = df4[df4['stress'] == 3]
df44 = df4[df4['stress'] == 4]
df4not4 = df41.append(df42).append(df43)
df4not4['stress'] = 0
df4 = df44.append(df4not4)

from sklearn.linear_model import LogisticRegressionCV
features = list(df.columns)
features.remove('w')
features.remove('p')
features.remove('stress')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


train, test = train_test_split(df4, test_size = 0.2)

for i in range(7):
    train = train.append(train[train['stress'] == 4].copy())


X_train = train[features]
Y_train = train.stress

X_test = test[features]
Y_test = test.stress

clf4 = LogisticRegression()
clf4.fit(X_train, Y_train)
prediction = clf4.predict(X_test)

print(f1_score(Y_test, prediction, average='macro'))
print(classification_report(Y_test, prediction))

0.656415014762
             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1188
          4       0.27      0.40      0.32        10

avg / total       0.99      0.99      0.99      1198



In [1395]:
train.groupby(['stress']).agg(['count'])

Unnamed: 0_level_0,w,p,AA1,AA2,AA3,AA4,AE1,AE2,AE3,AE4,...,Yafter_2_vowel_char,Yafter_3_vowel_char,Yafter_4_vowel_char,Zafter_1_vowel_char,Zafter_2_vowel_char,Zafter_3_vowel_char,Zafter_4_vowel_char,2 vowel syllables,3 vowel syllables,4 vowel syllables
Unnamed: 0_level_1,count,count,count,count,count,count,count,count,count,count,...,count,count,count,count,count,count,count,count,count,count
stress,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,4744,4744,4744,4744,4744,4744,4744,4744,4744,4744,...,4744,4744,4744,4744,4744,4744,4744,4744,4744,4744
4,5632,5632,5632,5632,5632,5632,5632,5632,5632,5632,...,5632,5632,5632,5632,5632,5632,5632,5632,5632,5632
