# COMP9318 Project
## Exploratory Data Analysis

In [4]:
import helper as h
import submission as s
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import pandas2arff

Read in word data:

In [6]:
data_loc = 'asset/training_data.txt'
data = h.read_data(data_loc)
words = s.get_words(data)

In [7]:
words.columns

Index(['word', 'pronunciation', 'pn_list', 'destressed_pn_list',
       'primary_stress_map', 'secondary_stress_map', 'vowel_map',
       'consonant_map', 'vector_map', 'vowel_count', 'consonant_count',
       '1st_letter_idx', 'phoneme_length', 'stressed_vowel'],
      dtype='object')

### Facts

Most common stressed vowel in descending order. Compare against total count of particular vowel, can then take proability that vowel will be stressed

In [4]:
# Get count where particular vowel is the stress vowel and count where vowel is in existance
stressed_vowel_count = words.groupby('stressed_vowel')['word'].count().sort_values(ascending=False)
total_vowel_count = words[list(h.vowels)].sum()

# Join the two frames
vowel_counts = pd.concat((stressed_vowel_count,total_vowel_count), axis=1, join='inner')
vowel_counts.columns = ['stressed_vowel_count','total_vowel_count']
vowel_counts['vowel_count_prob'] = vowel_counts.stressed_vowel_count/vowel_counts.total_vowel_count
vowel_counts.sort_values(['vowel_count_prob'],ascending=False)

Unnamed: 0,stressed_vowel_count,total_vowel_count,vowel_count_prob
AE,6442,8104,0.794916
EH,7892,10090,0.782161
AA,6331,8957,0.706821
AO,2952,4260,0.692958
EY,3167,4728,0.669839
OY,297,446,0.665919
UW,2328,3522,0.660988
AW,853,1308,0.652141
UH,608,933,0.651661
AY,2502,4178,0.598851


Find patterns in training data, build list of all possible consecutive sequences

In [5]:
def get_ngrams(pronunciation_list,length):
    return tuple(zip(*(pronunciation_list[i:] for i in range(length))))

def get_sequences(phoneme_series):
    ngrams = {}
    max_length = max(phoneme_series.str.len())
    for i in range(2, max_length + 1):
        for pn_list in phoneme_series:
            # Next iteration if pn_list is shorter then the sequence length be built
            if len(pn_list) < i:
                continue
            word_ngrams = get_ngrams(pn_list,i)
            for ngram in word_ngrams:
                ngrams[ngram] = ngrams.get(ngram,0) + 1
    return ngrams

def in_list(pn_list,ngram):
    if pn_list in ngram:
        return 1
    return 0

def is_primary(ngram):
    for phoneme in ngram:
        if '1' in phoneme:
            return True
    return False

def has_ngram(ngram,ngram_set):
    # Do not check sequences of length 2 or the final as they will obviously be in the set
    for i in range(2,len(ngram)):
        subsequence = ngram[0:i]
        if subsequence in ngram_set :
            return True
    return False




destressed_ngrams = get_sequences(words.destressed_pn_list)
destressed_ngrams_df = pd.DataFrame(list(destressed_ngrams.items()),columns=['destressed_ngram','destressed_ngram_count'])
destressed_ngrams_df = destressed_ngrams_df.set_index('destressed_ngram',drop=False)

# Generate Dataframe with all ngram possibiities and get counts, flag if primary stress in sequence
ngrams = get_sequences(words.pn_list)
ngram_df = pd.DataFrame(list(ngrams.items()),columns=['ngram','ngram_count'])

# Return True is sequence has primary stress in it
ngram_df['Is_Primary'] = ngram_df.ngram.apply(is_primary)
ngram_df['destressed_ngram'] = ngram_df.ngram.apply(h.filter_stress)
ngram_df.destressed_ngram = ngram_df.destressed_ngram.apply(h.as_tuple)
ngram_df = ngram_df.query('Is_Primary == True').set_index('destressed_ngram')

# Join
ngram_priors = ngram_df.join(destressed_ngrams_df)

# Get probability that sequence if exists will be stressed
ngram_priors['ngram_stress_probability'] = ngram_priors.ngram_count/ngram_priors.destressed_ngram_count
ngram_priors['ngram_length'] = ngram_priors.ngram.str.len() 

In [8]:
# Return true if ngram in family
def in_family(family,ngram):
    return family == ngram[0:len(family)]

# Add series to data frame which include the smallest ngram within a larger ngram
def collapse_ngrams(ngram_df,column):
    ngram_df.sort_index(inplace=True)
    ngrams = ngram_df[column].values.tolist()
    ngram_families = []
    current_family = ngrams[0]
    for ngram in ngrams:
        if not in_family(current_family,ngram):
            current_family = ngram
        ngram_families.append(current_family)
    ngram_df['ngram_family'] = pd.Series(ngram_families).values
    return ngram_df

collapse_ngrams(ngram_df,'ngram')

Unnamed: 0_level_0,ngram,ngram_count,Is_Primary,ngram_family
destressed_ngram,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(AA, AA)","(AA0, AA1)",1,True,"(AA0, AA1)"
"(AA, AA, K)","(AA0, AA1, K)",1,True,"(AA0, AA1)"
"(AA, AA, K, IY)","(AA0, AA1, K, IY2)",1,True,"(AA0, AA1)"
"(AA, AH)","(AA1, AH0)",5,True,"(AA1, AH0)"
"(AA, AH, L)","(AA1, AH0, L)",1,True,"(AA1, AH0)"
"(AA, AH, M)","(AA1, AH0, M)",1,True,"(AA1, AH0)"
"(AA, AH, N)","(AA1, AH0, N)",2,True,"(AA1, AH0)"
"(AA, AH, N, IH)","(AA1, AH0, N, IH0)",2,True,"(AA1, AH0)"
"(AA, AH, N, IH, NG)","(AA1, AH0, N, IH0, NG)",2,True,"(AA1, AH0)"
"(AA, AH, S)","(AA1, AH0, S)",1,True,"(AA1, AH0)"


In [7]:
valid_seqs = ngram_priors.query('Sequence_Stress_Probability  < 1 & Destressed_Sequence_Count > 5')
valid_seqs.sort_values(by=['Sequence_Stress_Probability','Destressed_Sequence_Count'],ascending=False)
#sequences_df.query('Sequence_Stress_Probability > 0.5  & Destressed_Sequence_Count > 5').sort_values(by='Destressed_Sequence',ascending=False)
ngram_priors.query('Sequence_Stress_Probability > 0.5  & Sequence_Length < 3')

# Doorknob
ngram_priors.loc[[('UW','R'),('AA','B')]]

# Rabon
#RABON:R AA0 B AO1 N
ngram_priors.loc[[('AA','B'),('AO','N')]]

#ABILA:AA0 B IY1 L AH0
ngram_priors.loc[[('AA','B'),('IY','L')]]

UndefinedVariableError: name 'Sequence_Stress_Probability' is not defined


Only show sequences that appear in atleast 1% of words in training data. Compare sequence with stresses included.

In [None]:
sequences_df.sort_values(by='Sequence',ascending=True)

Remove longer super sequences that can be described by a existing sub sequence

In [None]:
def has_subsequence(sequence,sequence_set):
    # Do not check sequences of length 2 or the final as they will obviously be in the set
    for i in range(2,len(sequence)):
        subsequence = sequence[0:i]
        if subsequence in sequence_set :
            return True
    return False

sequences_df['Has_Subsequence'] = sequences_df.Sequence.apply(has_subsequence,args=(set(sequences_df.Sequence),))
sequences_df

### Plots

In [None]:
%matplotlib inline


def scatter_plot(x,y):
    fig = plt.figure()
    ax  = fig.add_subplot(1,1,1)
    ax.scatter(x,y)
    plt.title("{} vs {}".format(y.name,x.name))
    plt.xlabel(x.name)
    plt.ylabel(y.name)
    plt.show()
    return

def histogram(data):
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    ax.hist(data,bins='auto')
    plt.title('{} Distribution'.format(data.name))
    plt.xlabel('{}'.format(data.name))
    #plt.xlim(0,bins_count)
    plt.show()
    return    

In [None]:
for tag in words.type_tag.unique():
    tag_df = words[words['type_tag'] == tag]
    print(tag)
    scatter_plot(tag_df['1st_letter_idx'], tag_df.primary_stress_idx)

In [None]:
histogram(words.primary_stress_idx.apply(int))

In [None]:
histogram(sequence_df[sequence_df.Sequence_Count > 0].Sequence_Count)