# Heat maps --  vowels CV

1. Read in the vowel CV data
2. Update column headers for consistency

In [1]:
setwd("/home/ayushi/Projects_2020/Naturalness/obstruents_and_vowels/scripts/")
#install.packages("itertools")
library(dplyr)
library(ggpubr)
library(tidyverse)
library(glue)
library(rstatix)
library(itertools)

vwl_dur_cv = read.csv("../text_data/V.01.Duration.All.Systems.CV.txt", sep = '\t', header = TRUE, na.strings = "")
vwl_amp_cv = read.csv("../text_data/V.02.Amplitude.All.Systems.CV.txt", sep = '\t', header = TRUE, na.strings = "")
vwl_spec_cv = read.csv("../text_data/V.03.Spectrals.All.Systems.CV.txt", sep = '\t', header = TRUE, na.strings = "")
vwl_lowF_cv = read.csv("../text_data/V.04.Lower.Formants.All.Systems.CV.txt", sep = '\t', header = TRUE, na.strings = "")
vwl_highF_cv = read.csv("../text_data/V.05.Higher.Formants.All.Systems.CV.txt", sep = '\t', header = TRUE, na.strings = "")
#str(vwl_highF_cv)
vwl_lowF_cv = vwl_lowF_cv %>% rename(Word.Index = Word_Index, Phoneme.Index = Phoneme_Index, Phoneme = Vowel)
vwl_highF_cv = vwl_highF_cv %>% rename(Word.Index = Word_Index, Phoneme.Index = Phoneme_Index, Phoneme = Vowel)


summary(vwl_spec_cv$Sys_Name)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: ggplot2

── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘glue’


The following object is masked from ‘package:dplyr’:

    collapse



Attaching package: ‘rstatix’


ERROR: Error in file(file, "rt"): cannot open the connection


3. Create merged dataset
4. Add new features (Delta-Fn, Fn_dispersion)
5. Update columns to reflect neural voices, combine HMM-P + HMM-WP

In [114]:
identity_cols <- c("Filename", "Word.Index", "Word", "Phoneme.Index", "Phoneme", "Sys_Name")

## Create a unique field using ^^ identity_cols
SortUniqueId <- function(df) {
    df$UnID = do.call(paste, c(df[identity_cols], sep = "-"))
    df = df[order(df$UnID),]
}

Vwl.Duration_CV = SortUniqueId(vwl_dur_cv)
Vwl.Amplitude_CV = SortUniqueId(vwl_amp_cv)
Vwl.Spectrals_CV = SortUniqueId(vwl_spec_cv)
Vwl.LowF_CV = SortUniqueId(vwl_lowF_cv)
Vwl.HighF_CV = SortUniqueId(vwl_highF_cv)

#### Merge the data ##
Merged_VowelData_CV = Reduce(merge, list( Vwl.Duration_CV, Vwl.Amplitude_CV, Vwl.Spectrals_CV, Vwl.LowF_CV, Vwl.HighF_CV))



summary(Merged_VowelData_CV$Sys_Name) ## This is important to see if all the systems have the same number of data points



In [15]:
Merged_VowelData_CV = select(Merged_VowelData_CV, -F3_band_onset, -F3_band_midpoint, -F5_band_onset, -F5_band_midpoint)

summary(Merged_VowelData_CV$Sys_Name) ## This is important to see if all the systems have the same number of data points

### Display all the column names with their indices
iname = enumerate(colnames(Merged_VowelData_CV))
#cat(sapply(iname, function(n) sprintf("%d -> %s\n", n$index, n$value)), sep = "")
print(summary(Merged_VowelData_CV))

#Merged_VowelData_CV[, c("F5_onset", "F5_midpoint", "Rel_Amp_F5")] = sapply(Merged_VowelData_CV, as.character)
#Merged_VowelData_CV[, c("F5_onset", "F5_midpoint", "Rel_Amp_F5")] = sapply(Merged_VowelData_CV, as.numeric)

Combined_VowelData_CV = Merged_VowelData_CV %>% group_by(Sys_Name, Vowel.WO.Stress) %>% mutate_at(vars(F1_onset, F2_onset, F3_onset, F4_onset, F5_onset, F1_midpoint, F2_midpoint, F3_midpoint, F4_midpoint, F5_midpoint), funs(dispersion = .- mean(.))) %>% ungroup()
CV_Merged = Combined_VowelData_CV %>% mutate_at(vars(ends_with("_dispersion")), abs)
Disp_Cols  = CV_Merged %>%  select(ends_with("_dispersion"))

print(summary(Disp_Cols)) ## There should be no negative values here!

CV_Merged$Delta_F1 = (CV_Merged$F1_onset - CV_Merged$F1_midpoint)
CV_Merged$Delta_F2 = (CV_Merged$F2_onset - CV_Merged$F2_midpoint)
CV_Merged$Delta_F3 = (CV_Merged$F3_onset - CV_Merged$F3_midpoint)
CV_Merged$Delta_F4 = (CV_Merged$F4_onset - CV_Merged$F4_midpoint)
CV_Merged$Delta_F5 = (CV_Merged$F5_onset - CV_Merged$F5_midpoint)


cols_to_update = c("Sys_Family","Sys_Quality", "Sys_CrossType")
CV_Merged[cols_to_update] = sapply(CV_Merged[cols_to_update], as.character)


##############################################################################################################
#1. Family
CV_Merged$Sys_Family[CV_Merged$Sys_Family == "0"] <- "Neural"
CV_Merged$Sys_Family[CV_Merged$Sys_Family == "HMM_WP"] <- "HMM"
CV_Merged$Sys_Family[CV_Merged$Sys_Family == "HMM_P"] <- "HMM"
CV_Merged$Sys_Family = as.factor(CV_Merged$Sys_Family)

#2. Quality
CV_Merged$Sys_Quality[CV_Merged$Sys_Quality == "0"] <- "Neural"
CV_Merged$Sys_Quality = as.factor(CV_Merged$Sys_Quality)

#3. CrossType
CV_Merged$Sys_CrossType[CV_Merged$Sys_CrossType == "0"] <- "Neural"
CV_Merged$Sys_CrossType = as.factor(CV_Merged$Sys_CrossType)
##############################################################################################################

CV_Merged$Vowel_Duration = CV_Merged$Vowel_Duration * 1000 

CV_Merged$Sys.Fam = factor(CV_Merged$Sys_Family, levels = c("Natural", "Hybrid", "UnitSel", "HMM", "Neural"))

CV_Merged$Sys.Name = factor(CV_Merged$Sys_Name, levels = c("A", "M", "K", "I", "C", "L", "N", "Q", "R", "X", "Y", "Z"))

CV_Merged$Sys.Qual = factor(CV_Merged$Sys_Quality, levels = c("Natural","R1","R2","R3","R4", "Neu-R1"))

CV_Merged$Sys.Fam.CrossType = factor(CV_Merged$Sys_CrossType, levels = (c("Natural","Hybrid-R1","UnitSel-R2","UnitSel-R3","HMM-R2","HMM-R3", "HMM-R4", "Neural-MR", "Neural-WN", "Neural-GA")))

CV_Merged_mDipCons = subset(CV_Merged, SegmentType == "Vowel")
CV_Merged = droplevels(CV_Merged_mDipCons)
#colnames(CV_Merged)
iname = enumerate(colnames(CV_Merged))
cat(sapply(iname, function(n) sprintf("%d -> %s\n", n$index, n$value)), sep = "")
output = "../text_data/V.06.Merged.Vowels.CV.txt"
summary(CV_Merged)
write.table(CV_Merged, file = output, row.names=FALSE, sep="\t")
print("Written to file Vowels CV")

               Filename       Word.Index         Word       Phoneme.Index   
 booksent_2013_0027:  312   Min.   : 0.00   the    : 1548   Min.   : 1.000  
 booksent_2013_0033:  276   1st Qu.: 5.00   her    :  600   1st Qu.: 1.000  
 booksent_2013_0076:  276   Median :10.00   to     :  588   Median : 1.000  
 booksent_2013_0070:  264   Mean   :10.46   she    :  516   Mean   : 1.787  
 booksent_2013_0073:  264   3rd Qu.:16.00   he     :  360   3rd Qu.: 3.000  
 booksent_2013_0014:  252   Max.   :32.00   had    :  300   Max.   :11.000  
 (Other)           :15480                   (Other):13212                   
     Vowel        Obstruent    Vowel_Height  Vowel_Frontness    Vowel_Type   
 AH0    :3648   T      :2688   0     :1992   0      :1992    Dipthong: 1992  
 EH1    :1332   DH     :2652   High  :5316   Back   :1896    Vowel   :15132  
 IH0    :1308   HH     :2508   Low   :3708   Central:6108                    
 IY1    :1284   S      :1368   Medial:6108   Front  :7128               

“`funs()` was deprecated in dplyr 0.8.0.
Please use a list of either functions or lambdas: 

  # Simple named list: 
  list(mean = mean, median = median)

  # Auto named with `tibble::lst()`: 
  tibble::lst(mean, median)

  # Using lambdas
  list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))


 F1_onset_dispersion F2_onset_dispersion F3_onset_dispersion
 Min.   :   0.0005   Min.   :   0.0034   Min.   :   0.0664  
 1st Qu.:  22.1041   1st Qu.:  52.5534   1st Qu.:  86.5599  
 Median :  49.7577   Median : 114.2275   Median : 183.3996  
 Mean   :  84.0512   Mean   : 162.2155   Mean   : 241.1474  
 3rd Qu.:  98.9662   3rd Qu.: 209.0627   3rd Qu.: 328.6101  
 Max.   :1778.5051   Max.   :1523.6559   Max.   :1529.2382  
                                                            
 F4_onset_dispersion F5_onset_dispersion F1_midpoint_dispersion
 Min.   :   0.0037   Min.   :   0.004    Min.   :   0.0007     
 1st Qu.: 145.9733   1st Qu.: 135.403    1st Qu.:  21.7288     
 Median : 315.6295   Median : 285.218    Median :  48.9447     
 Mean   : 363.3019   Mean   : 340.815    Mean   :  68.7551     
 3rd Qu.: 548.4902   3rd Qu.: 496.219    3rd Qu.:  91.6154     
 Max.   :1570.6459   Max.   :1484.562    Max.   :1783.8765     
                     NA's   :6347                              


               Filename       Word.Index         Word       Phoneme.Index   
 booksent_2013_0027:  300   Min.   : 0.00   the    : 1548   Min.   : 1.000  
 booksent_2013_0070:  264   1st Qu.: 4.00   her    :  600   1st Qu.: 1.000  
 booksent_2013_0014:  240   Median :10.00   to     :  588   Median : 1.000  
 booksent_2013_0073:  240   Mean   :10.36   she    :  516   Mean   : 1.821  
 booksent_2013_0076:  240   3rd Qu.:16.00   he     :  360   3rd Qu.: 3.000  
 booksent_2013_0018:  228   Max.   :32.00   had    :  300   Max.   :11.000  
 (Other)           :13620                   (Other):11220                   
     Vowel        Obstruent    Vowel_Height  Vowel_Frontness Vowel_Type   
 AH0    :3648   DH     :2412   High  :5316   Back   :1896    Vowel:15132  
 EH1    :1332   HH     :2352   Low   :3708   Central:6108                 
 IH0    :1308   T      :2352   Medial:6108   Front  :7128                 
 IY1    :1284   S      :1188                                              
 AE1    :

[1] "Written to file Vowels CV"


# Heat maps --  vowels VC

In [16]:
vwl_dur_vc = read.csv("../text_data/V.01.Duration.All.Systems.VC.txt", sep = '\t', header = TRUE, na.strings = "")
vwl_amp_vc = read.csv("../text_data/V.02.Amplitude.All.Systems.VC.txt", sep = '\t', header = TRUE, na.strings = "")
vwl_spec_vc = read.csv("../text_data/V.03.Spectrals.All.Systems.VC.txt", sep = '\t', header = TRUE, na.strings = "")
vwl_lowF_vc = read.csv("../text_data/V.04.Lower.Formants.All.Systems.VC.txt", sep = '\t', header = TRUE, na.strings = "")
vwl_highF_vc = read.csv("../text_data/V.05.Higher.Formants.All.Systems.VC.txt", sep = '\t', header = TRUE, na.strings = "")

vwl_lowF_vc = vwl_lowF_vc %>% rename(Word.Index = Word_Index, Phoneme.Index = Phoneme_Index)
vwl_highF_vc = vwl_highF_vc %>% rename(Word.Index = Word_Index, Phoneme.Index = Phoneme_Index)

In [18]:
identity_cols <- c("Filename", "Word.Index", "Word", "Phoneme.Index", "Vowel", "Sys_Name")

## Create a unique field using ^^ identity_cols
SortUniqueId <- function(df) {
    df$UnID = do.call(paste, c(df[identity_cols], sep = "-"))
    df = df[order(df$UnID),]
}

Vwl.Duration_VC = SortUniqueId(vwl_dur_vc)
Vwl.Amplitude_VC = SortUniqueId(vwl_amp_vc)
Vwl.Spectrals_VC = SortUniqueId(vwl_spec_vc)
Vwl.LowF_VC = SortUniqueId(vwl_lowF_vc)
Vwl.HighF_VC = SortUniqueId(vwl_highF_vc)

#### Merge the data ##
Merged_VowelData_VC = Reduce(merge, list(Vwl.Duration_VC, Vwl.Amplitude_VC, Vwl.Spectrals_VC, Vwl.LowF_VC, Vwl.HighF_VC))
Merged_VowelData_VC = select(Merged_VowelData_VC, -F3_band_offset, -F3_band_midpoint, -F5_band_offset, -F5_band_midpoint)

summary(Merged_VowelData_VC$Sys_Name) ## This is important to see if all the systems have the same number of data points

### Display all the column names with their indices
iname = enumerate(colnames(Merged_VowelData_VC))
#cat(sapply(iname, function(n) sprintf("%d -> %s\n", n$index, n$value)), sep = "")
Combined_VowelData_VC = Merged_VowelData_VC %>% group_by(Sys_Name, Vowel.WO.Stress) %>% mutate_at(vars(F1_offset, F2_offset, F3_offset, F4_offset, F5_offset, F1_midpoint, F2_midpoint, F3_midpoint, F4_midpoint, F5_midpoint), funs(dispersion = .- mean(.))) %>% ungroup()
VC_Merged = Combined_VowelData_VC %>% mutate_at(vars(ends_with("_dispersion")), abs)
Disp_Cols  = VC_Merged %>%  select(ends_with("_dispersion"))

#print(summary(Disp_Cols)) ## There should be no negative values here!

VC_Merged$Delta_F1 = (VC_Merged$F1_offset - VC_Merged$F1_midpoint)
VC_Merged$Delta_F2 = (VC_Merged$F2_offset - VC_Merged$F2_midpoint)
VC_Merged$Delta_F3 = (VC_Merged$F3_offset - VC_Merged$F3_midpoint)
VC_Merged$Delta_F4 = (VC_Merged$F4_offset - VC_Merged$F4_midpoint)
VC_Merged$Delta_F5 = (VC_Merged$F5_offset - VC_Merged$F5_midpoint)


cols_to_update = c("Sys_Family","Sys_Quality", "Sys_CrossType")
VC_Merged[cols_to_update] = sapply(VC_Merged[cols_to_update], as.character)


##############################################################################################################
#1. Family
VC_Merged$Sys_Family[VC_Merged$Sys_Family == "0"] <- "Neural"
VC_Merged$Sys_Family[VC_Merged$Sys_Family == "HMM_WP"] <- "HMM"
VC_Merged$Sys_Family[VC_Merged$Sys_Family == "HMM_P"] <- "HMM"
VC_Merged$Sys_Family = as.factor(VC_Merged$Sys_Family)

#2. Quality
VC_Merged$Sys_Quality[VC_Merged$Sys_Quality == "0"] <- "Neural"
VC_Merged$Sys_Quality = as.factor(VC_Merged$Sys_Quality)

#3. CrossType
VC_Merged$Sys_CrossType[VC_Merged$Sys_CrossType == "0"] <- "Neural"
VC_Merged$Sys_CrossType = as.factor(VC_Merged$Sys_CrossType)
##############################################################################################################

VC_Merged$Vowel_Duration = VC_Merged$Vowel_Duration * 1000 

VC_Merged$Sys.Fam = factor(VC_Merged$Sys_Family, levels = c("Natural", "Hybrid", "UnitSel", "HMM", "Neural"))

VC_Merged$Sys.Name = factor(VC_Merged$Sys_Name, levels = c("A", "M", "K", "I", "C", "L", "N", "Q", "R", "X", "Y", "Z"))

VC_Merged$Sys.Qual = factor(VC_Merged$Sys_Quality, levels = c("Natural","R1","R2","R3","R4", "Neu-R1"))

VC_Merged$Sys.Fam.CrossType = factor(VC_Merged$Sys_CrossType, levels = (c("Natural","Hybrid-R1","UnitSel-R2","UnitSel-R3","HMM-R2","HMM-R3", "HMM-R4", "Neural-MR", "Neural-WN", "Neural-GA")))

#VC_Merged_mP = subset(VC_Merged, Sys.Name != "P")
VC_Merged_mDipCons = subset(VC_Merged, SegmentType == "Vowel")
VC_Merged = droplevels(VC_Merged_mDipCons)
#colnames(VC_Merged)
iname = enumerate(colnames(VC_Merged))
cat(sapply(iname, function(n) sprintf("%d -> %s\n", n$index, n$value)), sep = "")
output = "../text_data/V.06.Merged.Vowels.VC.txt"
summary(VC_Merged)
           
write.table(VC_Merged, file = output, row.names=FALSE, sep="\t")
print("Written to file Vowels VC")
identical(CV_Merged, VC_Merged)



1 -> Filename
2 -> Word.Index
3 -> Word
4 -> Phoneme.Index
5 -> Vowel
6 -> Obstruent
7 -> Vowel_Height
8 -> Vowel_Frontness
9 -> Vowel_Type
10 -> Manner
11 -> Voicing
12 -> Place
13 -> Sibilance
14 -> Posteriority
15 -> Sys_Name
16 -> Sys_Quality_BL
17 -> Sys_Quality
18 -> Sys_Family
19 -> Sys_CrossType
20 -> UnID
21 -> Sequence_Type
22 -> Vowel.WO.Stress
23 -> StressType
24 -> SegmentType
25 -> LeftContext
26 -> Duration
27 -> Context
28 -> Vowel_Duration
29 -> RMS_Amplitude
30 -> Spectral.Tilt
31 -> F1_midpoint
32 -> F2_midpoint
33 -> F0_offset
34 -> F1_offset
35 -> F2_offset
36 -> Unnamed..0
37 -> F3_midpoint
38 -> F4_midpoint
39 -> F5_midpoint
40 -> F3_offset
41 -> F4_offset
42 -> F5_offset
43 -> Rel_Amp_F3
44 -> Rel_Amp_F4
45 -> Rel_Amp_F5
46 -> F1_offset_dispersion
47 -> F2_offset_dispersion
48 -> F3_offset_dispersion
49 -> F4_offset_dispersion
50 -> F5_offset_dispersion
51 -> F1_midpoint_dispersion
52 -> F2_midpoint_dispersion
53 -> F3_midpoint_dispersion
54 -> F4_midpoint_dispe

               Filename      Word.Index         Word      Phoneme.Index  
 booksent_2013_0070: 204   Min.   : 0.00   of     : 588   Min.   :0.000  
 booksent_2013_0096: 204   1st Qu.: 5.00   was    : 384   1st Qu.:1.000  
 booksent_2013_0018: 192   Median :10.00   it     : 360   Median :1.000  
 booksent_2013_0027: 180   Mean   :10.56   had    : 300   Mean   :1.422  
 booksent_2013_0061: 180   3rd Qu.:16.00   his    : 288   3rd Qu.:2.000  
 booksent_2013_0067: 180   Max.   :31.00   with   : 252   Max.   :9.000  
 (Other)           :9828                   (Other):8796                  
     Vowel        Obstruent    Vowel_Height  Vowel_Frontness Vowel_Type   
 AH0    :2328   T      :2112   High  :4452   Back   :1236    Vowel:10968  
 IH0    :1896   D      :1596   Low   :2832   Central:3684                 
 AE1    :1344   Z      :1356   Medial:3684   Front  :6048                 
 IH1    : 984   S      :1224                                              
 AH1    : 912   V      :1056     

[1] "Written to file Vowels VC"


# Heat maps --  consonants CV

In [20]:
cons_dur_cv = read.csv("../text_data/C.01.Duration.All.Systems.CV.txt", sep = '\t', header = TRUE, na.strings = "")
cons_amp_cv = read.csv("../text_data/C.02.Amplitude.All.Systems.CV.txt", sep = '\t', header = TRUE, na.strings = "")
cons_spec_cv = read.csv("../text_data/C.03.Spectrals.All.Systems.CV.txt", sep = '\t', header = TRUE, na.strings = "")
colnames(cons_dur_cv)
identity_cols <- c("Filename", "Word.Index", "Word", "Phoneme.Index", "Phoneme", "Sys_Name")

## Create a unique field using ^^ identity_cols
SortUniqueId <- function(df) {
    df$UnID = do.call(paste, c(df[identity_cols], sep = "-"))
    df = df[order(df$UnID),]
}

Cons.Duration_CV = SortUniqueId(cons_dur_cv)
Cons.Amplitude_CV = SortUniqueId(cons_amp_cv)
Cons.Spectrals_CV = SortUniqueId(cons_spec_cv)

#### Merge the data ##
Merged_ConsData_CV = Reduce(merge, list(Cons.Duration_CV, Cons.Amplitude_CV, Cons.Spectrals_CV))

summary(Merged_ConsData_CV$Sys_Name) ## This is important to see if all the systems have the same number of data points

### Display all the column names with their indices
iname = enumerate(colnames(Merged_ConsData_CV))
#cat(sapply(iname, function(n) sprintf("%d -> %s\n", n$index, n$value)), sep = "")
Cons_CV_Merged = Merged_ConsData_CV 

cols_to_update = c("Sys_Family","Sys_Quality", "Sys_CrossType")
Cons_CV_Merged[cols_to_update] = sapply(Cons_CV_Merged[cols_to_update], as.character)
#sapply(Cons_CV_Merged, class)

#1. Family
Cons_CV_Merged$Sys_Family[Cons_CV_Merged$Sys_Family == "HMM_WP"] <- "HMM"
Cons_CV_Merged$Sys_Family[Cons_CV_Merged$Sys_Family == "HMM_P"] <- "HMM"
Cons_CV_Merged$Sys_Family = as.factor(Cons_CV_Merged$Sys_Family)
print(summary(Cons_CV_Merged$Sys_Family))

#2. Quality
Cons_CV_Merged$Sys_Quality[Cons_CV_Merged$Sys_Quality == "0"] <- "Neural"
Cons_CV_Merged$Sys_Quality = as.factor(Cons_CV_Merged$Sys_Quality)
print(summary(Cons_CV_Merged$Sys_Quality))

#3. CrossType
Cons_CV_Merged$Sys_CrossType[Cons_CV_Merged$Sys_CrossType == "0"] <- "Neural"
Cons_CV_Merged$Sys_CrossType = as.factor(Cons_CV_Merged$Sys_CrossType)
print(summary(Cons_CV_Merged$Sys_CrossType))
##############################################################################################################

Cons_CV_Merged$Sys.Fam = factor(Cons_CV_Merged$Sys_Family, levels = c("Natural", "Hybrid", "UnitSel", "HMM", "Neural"))

Cons_CV_Merged$Sys.Name = factor(Cons_CV_Merged$Sys_Name, levels = c("A", "M", "K", "I", "C", "L", "N", "Q", "R", "X", "Y", "Z"))

Cons_CV_Merged$Sys.Qual = factor(Cons_CV_Merged$Sys_Quality, levels = c("Natural","R1","R2","R3","R4", "Neu-R1"))

Cons_CV_Merged$Sys.Fam.CrossType = factor(Cons_CV_Merged$Sys_CrossType, levels = (c("Natural","Hybrid-R1","UnitSel-R2","UnitSel-R3","HMM-R2","HMM-R3", "HMM-R4", "Neural-MR", "Neural-WN", "Neural-GA")))

iname = enumerate(colnames(Cons_CV_Merged))
cat(sapply(iname, function(n) sprintf("%d -> %s\n", n$index, n$value)), sep = "")
output = "../text_data/C.04.Merged.Consonants.CV.txt"
summary(Cons_CV_Merged)
write.table(Cons_CV_Merged, file = output, row.names=FALSE, sep="\t")
print("Written to file Consonants CV")
identical(VC_Merged, Cons_CV_Merged)
identical(CV_Merged, Cons_CV_Merged)


    HMM  Hybrid Natural  Neural UnitSel 
   2548    2548    1274    6370    2548 
Natural  Neu-R1      R1      R2 
   1274    6370    2548    5096 
    HMM-R2  Hybrid-R1    Natural  Neural-GA  Neural-MR  Neural-WN UnitSel-R2 
      2548       2548       1274       2548       1274       2548       2548 
1 -> Filename
2 -> Word.Index
3 -> Word
4 -> Phoneme.Index
5 -> Phoneme
6 -> Manner
7 -> Voicing
8 -> Place
9 -> Sibilance
10 -> Posteriority
11 -> Sys_Name
12 -> Context
13 -> Sys_Quality_BL
14 -> Sys_Quality
15 -> Sys_Family
16 -> Sys_CrossType
17 -> UnID
18 -> Consonant_Duration
19 -> Closure_Duration
20 -> Noise_Duration
21 -> RMS_Amplitude
22 -> Burst_Amplitude
23 -> Peak_Presence
24 -> Peak.Amplitude
25 -> Peak.Frequency
26 -> Dyn.Amplitude
27 -> Spectral.Tilt
28 -> Spectral.Shape
29 -> Sys.Fam
30 -> Sys.Name
31 -> Sys.Qual
32 -> Sys.Fam.CrossType


               Filename       Word.Index         Word       Phoneme.Index   
 booksent_2013_0027:  300   Min.   : 0.00   the    : 1548   Min.   :0.0000  
 booksent_2013_0018:  252   1st Qu.: 4.00   her    :  600   1st Qu.:0.0000  
 booksent_2013_0076:  252   Median :10.00   to     :  588   Median :0.0000  
 booksent_2013_0040:  240   Mean   :10.35   she    :  516   Mean   :0.5542  
 booksent_2013_0089:  240   3rd Qu.:16.00   he     :  360   3rd Qu.:0.0000  
 booksent_2013_0041:  228   Max.   :32.00   had    :  300   Max.   :9.0000  
 (Other)           :13776                   (Other):11376                   
    Phoneme           Manner          Voicing              Place     
 DH     :2616   Affricate: 360   Voiced   :6060   Alveolar    :4176  
 HH     :2496   Fricative:8952   Voiceless:9228   Bilabial    :1908  
 T      :1872   Stop     :5976                    Dental      :2784  
 B      :1236                                     Glottal     :2496  
 S      :1224                     

[1] "Written to file Consonants CV"


# Heat maps - Consonants VC

In [21]:
cons_dur_vc = read.csv("../text_data/C.01.Duration.All.Systems.VC.txt", sep = '\t', header = TRUE, na.strings = "")
cons_amp_vc = read.csv("../text_data/C.02.Amplitude.All.Systems.VC.txt", sep = '\t', header = TRUE, na.strings = "")
cons_spec_vc = read.csv("../text_data/C.03.Spectrals.All.Systems.VC.txt", sep = '\t', header = TRUE, na.strings = "")
colnames(cons_dur_vc)
identity_cols <- c("Filename", "Word.Index", "Word", "Phoneme.Index", "Phoneme", "Sys_Name")

## Create a unique field using ^^ identity_cols
SortUniqueId <- function(df) {
    df$UnID = do.call(paste, c(df[identity_cols], sep = "-"))
    df = df[order(df$UnID),]
}

Cons.Duration_VC = SortUniqueId(cons_dur_vc)
Cons.Amplitude_VC = SortUniqueId(cons_amp_vc)
Cons.Spectrals_VC = SortUniqueId(cons_spec_vc)

#### Merge the data ##
Merged_ConsData_VC = Reduce(merge, list(Cons.Duration_VC, Cons.Amplitude_VC, Cons.Spectrals_VC))

summary(Merged_ConsData_VC$Sys_Name) ## This is important to see if all the systems have the same number of data points

### Display all the column names with their indices
iname = enumerate(colnames(Merged_ConsData_VC))
#cat(sapply(iname, function(n) sprintf("%d -> %s\n", n$index, n$value)), sep = "")
Cons_VC_Merged = Merged_ConsData_VC 

cols_to_update = c("Sys_Family","Sys_Quality", "Sys_CrossType")
Cons_VC_Merged[cols_to_update] = sapply(Cons_VC_Merged[cols_to_update], as.character)
#sapply(Cons_VC_Merged, class)

#1. Family
Cons_VC_Merged$Sys_Family[Cons_VC_Merged$Sys_Family == "HMM_WP"] <- "HMM"
Cons_VC_Merged$Sys_Family[Cons_VC_Merged$Sys_Family == "HMM_P"] <- "HMM"
Cons_VC_Merged$Sys_Family = as.factor(Cons_VC_Merged$Sys_Family)
print(summary(Cons_VC_Merged$Sys_Family))

#2. Quality
Cons_VC_Merged$Sys_Quality[Cons_VC_Merged$Sys_Quality == "0"] <- "Neural"
Cons_VC_Merged$Sys_Quality = as.factor(Cons_VC_Merged$Sys_Quality)
print(summary(Cons_VC_Merged$Sys_Quality))

#3. CrossType
Cons_VC_Merged$Sys_CrossType[Cons_VC_Merged$Sys_CrossType == "0"] <- "Neural"
Cons_VC_Merged$Sys_CrossType = as.factor(Cons_VC_Merged$Sys_CrossType)
print(summary(Cons_VC_Merged$Sys_CrossType))
##############################################################################################################

Cons_VC_Merged$Sys.Fam = factor(Cons_VC_Merged$Sys_Family, levels = c("Natural", "Hybrid", "UnitSel", "HMM", "Neural"))

Cons_VC_Merged$Sys.Name = factor(Cons_VC_Merged$Sys_Name, levels = c("A", "M", "K", "I", "C", "L", "N", "Q", "R", "X", "Y", "Z"))

Cons_VC_Merged$Sys.Qual = factor(Cons_VC_Merged$Sys_Quality, levels = c("Natural","R1","R2","R3","R4", "Neu-R1"))

Cons_VC_Merged$Sys.Fam.CrossType = factor(Cons_VC_Merged$Sys_CrossType, levels = (c("Natural","Hybrid-R1","UnitSel-R2","UnitSel-R3","HMM-R2","HMM-R3", "HMM-R4", "Neural-MR", "Neural-WN", "Neural-GA")))

iname = enumerate(colnames(Cons_VC_Merged))
cat(sapply(iname, function(n) sprintf("%d -> %s\n", n$index, n$value)), sep = "")
output = "../text_data/C.04.Merged.Consonants.VC.txt"
summary(Cons_VC_Merged)
write.table(Cons_VC_Merged, file = output, row.names=FALSE, sep="\t")
print("Written to file Consonants VC")
identical(VC_Merged, Cons_VC_Merged)
identical(CV_Merged, Cons_VC_Merged)
identical(Cons_CV_Merged, Cons_VC_Merged)


    HMM  Hybrid Natural  Neural UnitSel 
   2470    2470    1235    6175    2470 
Natural  Neu-R1      R1      R2 
   1235    6175    2470    4940 
    HMM-R2  Hybrid-R1    Natural  Neural-GA  Neural-MR  Neural-WN UnitSel-R2 
      2470       2470       1235       2470       1235       2470       2470 
1 -> Filename
2 -> Word.Index
3 -> Word
4 -> Phoneme.Index
5 -> Phoneme
6 -> Manner
7 -> Voicing
8 -> Place
9 -> Sibilance
10 -> Posteriority
11 -> Sys_Name
12 -> Context
13 -> Sys_Quality_BL
14 -> Sys_Quality
15 -> Sys_Family
16 -> Sys_CrossType
17 -> UnID
18 -> Consonant_Duration
19 -> Closure_Duration
20 -> Noise_Duration
21 -> RMS_Amplitude
22 -> Burst_Amplitude
23 -> Peak_Presence
24 -> Peak.Amplitude
25 -> Peak.Frequency
26 -> Dyn.Amplitude
27 -> Spectral.Tilt
28 -> Spectral.Shape
29 -> Sys.Fam
30 -> Sys.Name
31 -> Sys.Qual
32 -> Sys.Fam.CrossType


               Filename       Word.Index         Word       Phoneme.Index   
 booksent_2013_0070:  264   Min.   : 0.00   of     :  588   Min.   : 1.000  
 booksent_2013_0073:  252   1st Qu.: 5.00   was    :  384   1st Qu.: 2.000  
 booksent_2013_0096:  252   Median :10.00   it     :  360   Median : 2.000  
 booksent_2013_0027:  240   Mean   :10.74   had    :  300   Mean   : 2.486  
 booksent_2013_0061:  228   3rd Qu.:16.00   his    :  288   3rd Qu.: 3.000  
 booksent_2013_0018:  216   Max.   :31.00   with   :  252   Max.   :10.000  
 (Other)           :13368                   (Other):12648                   
    Phoneme           Manner          Voicing              Place     
 T      :3480   Affricate: 516   Voiced   :6288   Alveolar    :8904  
 D      :2124   Fricative:6060   Voiceless:8532   Bilabial    :1068  
 Z      :1668   Stop     :8244                    Dental      : 624  
 S      :1632                                     Glottal     :  48  
 V      :1284                     

[1] "Written to file Consonants VC"


In [33]:
cons_dur_isl = read.csv("../text_data/C.01.Duration.All.Systems.ISL.txt", sep = '\t', header = TRUE, na.strings = "")
cons_amp_isl = read.csv("../text_data/C.02.Amplitude.All.Systems.ISL.txt", sep = '\t', header = TRUE, na.strings = "")
cons_spec_isl = read.csv("../text_data/C.03.Spectrals.All.Systems.ISL.txt", sep = '\t', header = TRUE, na.strings = "")
colnames(cons_dur_isl)
identity_cols <- c("Filename", "Word.Index", "Word", "Phoneme.Index", "Phoneme", "Sys_Name")

## Create a unique field using ^^ identity_cols
SortUniqueId <- function(df) {
    df$UnID = do.call(paste, c(df[identity_cols], sep = "-"))
    df = df[order(df$UnID),]
}

Cons.Duration_ISL = SortUniqueId(cons_dur_isl)
Cons.Amplitude_ISL = SortUniqueId(cons_amp_isl)
Cons.Spectrals_ISL = SortUniqueId(cons_spec_isl)
summary(Cons.Duration_ISL)

#### Merge the data ##
Merged_ConsData_ISL = Reduce(merge, list(Cons.Duration_ISL, Cons.Amplitude_ISL, Cons.Spectrals_ISL))

summary(Merged_ConsData_ISL$Sys_Name) ## This is important to see if all the systems have the same number of data points

### Display all the column names with their indices
iname = enumerate(colnames(Merged_ConsData_ISL))
#cat(sapply(iname, function(n) sprintf("%d -> %s\n", n$index, n$value)), sep = "")
Cons_ISL_Merged = Merged_ConsData_ISL 

cols_to_update = c("Sys_Family","Sys_Quality", "Sys_CrossType")
Cons_ISL_Merged[cols_to_update] = sapply(Cons_ISL_Merged[cols_to_update], as.character)
#sapply(Cons_ISL_Merged, class)

#1. Family
Cons_ISL_Merged$Sys_Family[Cons_ISL_Merged$Sys_Family == "HMM_WP"] <- "HMM"
Cons_ISL_Merged$Sys_Family[Cons_ISL_Merged$Sys_Family == "HMM_P"] <- "HMM"
Cons_ISL_Merged$Sys_Family = as.factor(Cons_ISL_Merged$Sys_Family)
print(summary(Cons_ISL_Merged$Sys_Family))

#2. Quality
Cons_ISL_Merged$Sys_Quality[Cons_ISL_Merged$Sys_Quality == "0"] <- "Neural"
Cons_ISL_Merged$Sys_Quality = as.factor(Cons_ISL_Merged$Sys_Quality)
print(summary(Cons_ISL_Merged$Sys_Quality))

#3. CrossType
Cons_ISL_Merged$Sys_CrossType[Cons_ISL_Merged$Sys_CrossType == "0"] <- "Neural"
Cons_ISL_Merged$Sys_CrossType = as.factor(Cons_ISL_Merged$Sys_CrossType)
print(summary(Cons_ISL_Merged$Sys_CrossType))
##############################################################################################################

Cons_ISL_Merged$Sys.Fam = factor(Cons_ISL_Merged$Sys_Family, levels = c("Natural", "Hybrid", "UnitSel", "HMM", "Neural"))

Cons_ISL_Merged$Sys.Name = factor(Cons_ISL_Merged$Sys_Name, levels = c("A", "M", "K", "I", "C", "L", "N", "Q", "R", "X", "Y", "Z"))

Cons_ISL_Merged$Sys.Qual = factor(Cons_ISL_Merged$Sys_Quality, levels = c("Natural","R1","R2","R3","R4", "Neu-R1"))

Cons_ISL_Merged$Sys.Fam.CrossType = factor(Cons_ISL_Merged$Sys_CrossType, levels = (c("Natural","Hybrid-R1","UnitSel-R2","UnitSel-R3","HMM-R2","HMM-R3", "HMM-R4", "Neural-MR", "Neural-WN", "Neural-GA")))

iname = enumerate(colnames(Cons_ISL_Merged))
cat(sapply(iname, function(n) sprintf("%d -> %s\n", n$index, n$value)), sep = "")
output = "../text_data/C.04.Merged.Consonants.ISL.txt"
summary(Cons_ISL_Merged)
write.table(Cons_ISL_Merged, file = output, row.names=FALSE, sep="\t")
print("Written to file Consonants ISL")
identical(CV_Merged, Cons_ISL_Merged)
identical(VC_Merged, Cons_ISL_Merged)
identical(Cons_CV_Merged, Cons_ISL_Merged)
identical(Cons_VC_Merged, Cons_ISL_Merged)


               Filename      Word.Index         Word      Phoneme.Index   
 booksent_2013_0023: 144   Min.   : 0.00   and    :1032   Min.   : 0.000  
 booksent_2013_0026: 144   1st Qu.: 6.00   for    : 120   1st Qu.: 0.000  
 booksent_2013_0050: 144   Median :10.00   against:  72   Median : 3.000  
 booksent_2013_0008: 132   Mean   :11.08   front  :  72   Mean   : 2.431  
 booksent_2013_0017: 132   3rd Qu.:16.00   stood  :  72   3rd Qu.: 3.000  
 booksent_2013_0025: 132   Max.   :31.00   found  :  60   Max.   :11.000  
 (Other)           :6720                   (Other):6120                   
    Phoneme     Consonant_Duration Closure_Duration Noise_Duration 
 D      :2184   Min.   : 30.00     Min.   :     0   Min.   :  4.0  
 T      :1512   1st Qu.: 50.00     1st Qu.:     0   1st Qu.: 19.0  
 S      :1248   Median : 70.00     Median : 23000   Median : 38.0  
 F      : 516   Mean   : 74.15     Mean   : 23348   Mean   : 50.8  
 K      : 468   3rd Qu.: 90.00     3rd Qu.: 38000   3rd Qu.:

               Filename   Word.Index            Word   Phoneme.Index
 booksent_2013_0001:0   Min.   : NA   acceptance  :0   Min.   : NA  
 booksent_2013_0002:0   1st Qu.: NA   acquaintance:0   1st Qu.: NA  
 booksent_2013_0003:0   Median : NA   advanced    :0   Median : NA  
 booksent_2013_0004:0   Mean   :NaN   against     :0   Mean   :NaN  
 booksent_2013_0005:0   3rd Qu.: NA   almost      :0   3rd Qu.: NA  
 booksent_2013_0006:0   Max.   : NA   and         :0   Max.   : NA  
 (Other)           :0                 (Other)     :0                
    Phoneme        Manner       Voicing           Place          Sibilance
 B      :0   Affricate:0   Voiced   :0   Alveolar    :0   0           :0  
 CH     :0   Fricative:0   Voiceless:0   Bilabial    :0   Non_Sibilant:0  
 D      :0   Stop     :0                 Dental      :0   Sibilant    :0  
 F      :0                               Glottal     :0                   
 G      :0                               Labiodental :0                  