# COMP9318 Project
## Exploratory Data Analysis

In [33]:
import helper as h
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import pandas2arff

def print_full(x):
    """Print ``x`` with pandas' row limit raised to ``len(x)`` so no rows are elided.

    Parameters
    ----------
    x : object supporting ``len()`` and ``print()``
        Typically a pandas Series or DataFrame.

    Notes
    -----
    The ``display.max_rows`` option is changed only for the duration of the
    print. ``option_context`` puts back whatever value was configured before
    the call (rather than forcing the pandas default) and does so even when
    printing raises.
    """
    with pd.option_context('display.max_rows', len(x)):
        print(x)

Read in word data:

In [3]:
# Relative path to the training data file
data_loc = 'asset/training_data.txt'
# Load the training data through the project helper; `words` is the frame
# that the rest of the notebook analyses
words = h.get_words(data_loc)

In [3]:
# Preview the first few rows of the loaded data
words.head()

Unnamed: 0,word,pronunciation,pn_list,destressed_pn_list,primary_stress_map,secondary_stress_map,vowel_map,consonant_map,vector_map,vowel_count,...,R,S,SH,T,TH,V,W,Y,Z,ZH
0,COED,K OW1 EH2 D,"[K, OW1, EH2, D]","[K, OW, EH, D]","[0, 1, 0, 0]","[0, 0, 1, 0]","[0, 1, 1, 0]","[1, 0, 0, 1]","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, ...",2,...,0,0,0,0,0,0,0,0,0,0
1,PURVIEW,P ER1 V Y UW2,"[P, ER1, V, Y, UW2]","[P, ER, V, Y, UW]","[0, 1, 0, 0, 0]","[0, 0, 0, 0, 1]","[0, 1, 0, 0, 1]","[1, 0, 1, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ...",2,...,0,0,0,0,0,1,0,1,0,0
2,HEHIR,HH EH1 HH IH0 R,"[HH, EH1, HH, IH0, R]","[HH, EH, HH, IH, R]","[0, 1, 0, 0, 0]","[0, 0, 0, 0, 0]","[0, 1, 0, 1, 0]","[1, 0, 1, 0, 1]","[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, ...",2,...,1,0,0,0,0,0,0,0,0,0
3,MUSCLING,M AH1 S AH0 L IH0 NG,"[M, AH1, S, AH0, L, IH0, NG]","[M, AH, S, AH, L, IH, NG]","[0, 1, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 1, 0, 1, 0]","[1, 0, 1, 0, 1, 0, 1]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",3,...,0,1,0,0,0,0,0,0,0,0
4,NONPOISONOUS,N AA0 N P OY1 Z AH0 N AH0 S,"[N, AA0, N, P, OY1, Z, AH0, N, AH0, S]","[N, AA, N, P, OY, Z, AH, N, AH, S]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 1, 0, 1, 0, 1, 0]","[1, 0, 1, 1, 0, 1, 0, 1, 0, 1]","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",4,...,0,1,0,0,0,0,0,0,1,0


In [4]:
# List the column labels available on `words`
words.columns

Index(['word', 'pronunciation', 'pn_list', 'destressed_pn_list',
       'primary_stress_map', 'secondary_stress_map', 'vowel_map',
       'consonant_map', 'vector_map', 'vowel_count', 'consonant_count',
       'type_tag', '1st_letter_idx', 'phoneme_length', 'prefix', 'suffix',
       'primary_stress_idx', 'stressed_vowel', 'AA', 'AE', 'AH', 'AO', 'AW',
       'AY', 'EH', 'ER', 'EY', 'IH', 'IY', 'OW', 'OY', 'UH', 'UW', 'P', 'B',
       'CH', 'D', 'DH', 'F', 'G', 'HH', 'JH', 'K', 'L', 'M', 'N', 'NG', 'R',
       'S', 'SH', 'T', 'TH', 'V', 'W', 'Y', 'Z', 'ZH'],
      dtype='object')

### Facts

Most common stressed vowels in descending order. Comparing each vowel's stressed count against its total count gives the probability that the vowel will be stressed.

In [4]:
# Number of words in which each vowel is the stressed vowel, most frequent first
stressed_vowel_count = (
    words.groupby('stressed_vowel')['word']
    .count()
    .sort_values(ascending=False)
)
# Column-wise sum of the per-vowel columns of `words`
total_vowel_count = words[list(h.vowels)].sum()

# Align the two series side by side, keeping only vowels present in both
vowel_counts = pd.concat(
    (stressed_vowel_count, total_vowel_count),
    axis=1,
    join='inner',
)
vowel_counts.columns = ['stressed_vowel_count', 'total_vowel_count']

# Ratio of stressed count to total count for each vowel
vowel_counts['vowel_count_prob'] = (
    vowel_counts['stressed_vowel_count'] / vowel_counts['total_vowel_count']
)
vowel_counts.sort_values(['vowel_count_prob'], ascending=False)

Unnamed: 0,stressed_vowel_count,total_vowel_count,vowel_count_prob
AE,6442,8104,0.794916
EH,7892,10090,0.782161
AA,6331,8957,0.706821
AO,2952,4260,0.692958
EY,3167,4728,0.669839
OY,297,446,0.665919
UW,2328,3522,0.660988
AW,853,1308,0.652141
UH,608,933,0.651661
AY,2502,4178,0.598851


Find patterns in training data, build list of all possible consecutive sequences

In [28]:
def sub_string(pronunciation_list,length):
    """Return every run of ``length`` consecutive items as a tuple of tuples.

    The list is sliced at offsets ``0 .. length-1`` and the slices are zipped
    together, so the k-th result holds items ``k .. k+length-1``. When the
    input is shorter than ``length`` (or ``length`` is 0) the result is ``()``.
    """
    shifted_views = [pronunciation_list[offset:] for offset in range(length)]
    return tuple(zip(*shifted_views))

def get_sequences(phoneme_series):
    """Count every consecutive sub-sequence of length >= 2 across all phoneme lists.

    Parameters
    ----------
    phoneme_series : iterable of sequences
        Usually a pandas Series whose elements are phoneme lists; any iterable
        of sized, sliceable sequences is accepted.

    Returns
    -------
    dict
        Maps each sub-sequence (a tuple) to the number of times it occurs.
        Keys are inserted shortest sequences first, in input order. An empty
        input, or one with no list of at least two items, yields ``{}``.
    """
    # Materialise once: the data is scanned once per sequence length below
    pn_lists = list(phoneme_series)
    # default=0 makes an empty input produce {} instead of raising ValueError
    max_length = max((len(pn_list) for pn_list in pn_lists), default=0)

    sequences = {}
    for seq_length in range(2, max_length + 1):
        for pn_list in pn_lists:
            # Lists shorter than the window cannot contribute a sequence
            if len(pn_list) < seq_length:
                continue
            for seq in sub_string(pn_list, seq_length):
                sequences[seq] = sequences.get(seq, 0) + 1
    return sequences

def in_list(pn_list,sequence):
    """Return 1 when ``pn_list`` is a member of ``sequence``, otherwise 0."""
    return int(pn_list in sequence)

def is_primary(sequence):
    """Return True when any phoneme in ``sequence`` contains the character '1'."""
    return any('1' in phoneme for phoneme in sequence)

# Count every destressed sequence and index the counts by the sequence itself
destressed_sequences = get_sequences(words['destressed_pn_list'])
destressed_sequence_df = pd.DataFrame(
    list(destressed_sequences.items()),
    columns=['Destressed_Sequence', 'Destressed_Sequence_Count'],
).set_index('Destressed_Sequence')

# Count every sequence with its stress markers kept
sequences = get_sequences(words['pn_list'])
sequence_df = pd.DataFrame(
    list(sequences.items()),
    columns=['Sequence', 'Sequence_Count'],
)

# Flag sequences for which is_primary() is True, and derive each sequence's
# destressed form as a tuple so it can be matched against the destressed index
sequence_df['Is_Primary'] = sequence_df['Sequence'].apply(is_primary)
sequence_df['Destressed_Sequence'] = (
    sequence_df['Sequence'].apply(h.filter_stress).apply(h.as_tuple)
)

# Keep only the flagged sequences, keyed by their destressed form
sequence_df = (
    sequence_df
    .query('Is_Primary == True')
    .set_index('Destressed_Sequence')
)

# Attach the destressed counts to each stressed sequence
sequences_df = sequence_df.join(destressed_sequence_df)

# Ratio of the stressed sequence's count to the count of its destressed form
sequences_df['Sequence_Stress_Probability'] = (
    sequences_df['Sequence_Count'] / sequences_df['Destressed_Sequence_Count']
)
sequences_df['Sequence_Length'] = sequences_df['Sequence'].str.len()

In [55]:
# Scratch exploration of the stress-probability table. Only the value of the
# final expression in this cell is rendered; the earlier bare expressions are
# evaluated and their results discarded.

# Sequences with stress probability below 1 whose destressed form occurs more than 5 times
valid_seqs = sequences_df.query('Sequence_Stress_Probability  < 1 & Destressed_Sequence_Count > 5')
# NOTE(review): the sorted result is neither assigned nor displayed
valid_seqs.sort_values(by=['Sequence_Stress_Probability','Destressed_Sequence_Count'],ascending=False)
#sequences_df.query('Sequence_Stress_Probability > 0.5  & Destressed_Sequence_Count > 5').sort_values(by='Destressed_Sequence',ascending=False)
# NOTE(review): result is neither assigned nor displayed
sequences_df.query('Sequence_Stress_Probability > 0.5  & Sequence_Length < 3')

# Doorknob
# Look up two specific destressed sequences by index label (result discarded)
sequences_df.loc[[('UW','R'),('AA','B')]]

# Rabon
#RABON:R AA0 B AO1 N
# Same kind of lookup for the (AA, B) and (AO, N) pairs (result discarded)
sequences_df.loc[[('AA','B'),('AO','N')]]

#ABILA:AA0 B IY1 L AH0
# Final expression: this lookup is the one shown as the cell output
sequences_df.loc[[('AA','B'),('IY','L')]]




Unnamed: 0_level_0,Sequence,Sequence_Count,Is_Primary,Destressed_Sequence_Count,Sequence_Stress_Probability,Sequence_Length
Destressed_Sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(AA, B)","(AA1, B)",182,True,239,0.761506,2
"(IY, L)","(IY1, L)",276,True,506,0.545455,2



Only show sequences that appear in at least 1% of the words in the training data. Compare sequences with stresses included.

In [25]:
# Flagged (Is_Primary) sequences, most frequent first
(
    sequences_df
    .query('Is_Primary == True')
    .sort_values(by='Sequence_Count', ascending=False)
)

Unnamed: 0_level_0,Sequence,Sequence_Count,Is_Primary,Destressed_Sequence_Count,Sequence_Stress_Probability
Destressed_Sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(AA, R)","(AA1, R)",1647,True,2586,0.636891
"(AO, R)","(AO1, R)",1622,True,2353,0.689333
"(AE, N)","(AE1, N)",1517,True,2070,0.732850
"(EH, N)","(EH1, N)",1322,True,1756,0.752847
"(EH, L)","(EH1, L)",1321,True,1737,0.760507
"(EH, R)","(EH1, R)",1094,True,1431,0.764500
"(AA, N)","(AA1, N)",1017,True,1609,0.632070
"(IH, N)","(IH1, N)",971,True,2963,0.327708
"(R, EH)","(R, EH1)",896,True,1128,0.794326
"(K, AA)","(K, AA1)",825,True,1324,0.623112


### Plots

In [20]:
%matplotlib inline


def scatter_plot(x,y):
    """Show a scatter plot of ``y`` against ``x`` on a new single-axes figure.

    Both arguments must expose a ``name`` attribute; the names are used for
    the axis labels and for the "<y> vs <x>" title.
    """
    x_label, y_label = x.name, y.name

    fig, ax = plt.subplots()
    ax.scatter(x, y)
    ax.set_title("{} vs {}".format(y_label, x_label))
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    plt.show()

def histogram(data):
    """Show a histogram of ``data`` on a new single-axes figure.

    Bin edges are chosen with ``bins='auto'``. ``data`` must expose a ``name``
    attribute, which is used for the title and the x-axis label.
    """
    label = data.name

    fig, ax = plt.subplots()
    ax.hist(data, bins='auto')
    ax.set_title('{} Distribution'.format(label))
    ax.set_xlabel('{}'.format(label))
    plt.show()

In [None]:
# One scatter plot per distinct type tag: first-letter index vs primary-stress index
for tag in words['type_tag'].unique():
    print(tag)
    tag_df = words.loc[words['type_tag'] == tag]
    scatter_plot(tag_df['1st_letter_idx'], tag_df['primary_stress_idx'])

In [None]:
# Distribution of the primary-stress index; each value is converted with int() before plotting
histogram(words.primary_stress_idx.apply(int))

In [None]:
# Distribution of Sequence_Count over rows with a positive count.
# Note: sequence_df was reassigned in the sequence-table cell to hold only the Is_Primary rows.
histogram(sequence_df[sequence_df.Sequence_Count > 0].Sequence_Count)

KeyboardInterrupt: 