-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f5a09a9
commit f9a1f36
Showing
26 changed files
with
101,851 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Distribution / packaging | ||
.Python | ||
env/ | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
downloads/ | ||
eggs/ | ||
.eggs/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
var/ | ||
wheels/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
|
||
# PyInstaller | ||
# Usually these files are written by a python script from a template | ||
# before PyInstaller builds the exe, so as to inject date/other infos into it. | ||
*.manifest | ||
*.spec | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
htmlcov/ | ||
.tox/ | ||
.coverage | ||
.coverage.* | ||
.cache | ||
nosetests.xml | ||
coverage.xml | ||
*.cover | ||
.hypothesis/ | ||
|
||
# Translations | ||
*.mo | ||
*.pot | ||
|
||
# Django stuff: | ||
*.log | ||
local_settings.py | ||
|
||
# Flask stuff: | ||
instance/ | ||
.webassets-cache | ||
|
||
# Scrapy stuff: | ||
.scrapy | ||
|
||
# Sphinx documentation | ||
docs/_build/ | ||
|
||
# PyBuilder | ||
target/ | ||
|
||
# Jupyter Notebook | ||
.ipynb_checkpoints | ||
|
||
# pyenv | ||
.python-version | ||
|
||
# celery beat schedule file | ||
celerybeat-schedule | ||
|
||
# SageMath parsed files | ||
*.sage.py | ||
|
||
# dotenv | ||
.env | ||
|
||
# virtualenv | ||
.venv | ||
venv/ | ||
ENV/ | ||
|
||
# Spyder project settings | ||
.spyderproject | ||
.spyproject | ||
|
||
# Rope project settings | ||
.ropeproject | ||
|
||
# mkdocs documentation | ||
/site | ||
|
||
# mypy | ||
.mypy_cache/ | ||
|
||
# MacOS stuff | ||
.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Exploring individual variation in Turkish heritage speakers’ complex linguistic productions | ||
|
||
This repository contains all the code and data necessary to replicate the findings from our study. | ||
|
||
## Models | ||
|
||
The models were too large to be included in this GitHub release. You can download them from OSF: [https://www.doi.org/10.17605/OSF.IO/6ZCXU](https://www.doi.org/10.17605/OSF.IO/6ZCXU) | ||
|
||
## Short guide to replication | ||
|
||
You can either access the data from the folder `scripts/data/`, or download them as explained in the next section. In the next step, open the folder `scripts/` and proceed to run the scripts in the indicated order. | ||
|
||
## How to download the original data | ||
|
||
In a first step, please open the following link: | ||
|
||
[https://korpling.german.hu-berlin.de/annis3/#c=rueg](https://korpling.german.hu-berlin.de/annis3/#c=rueg) | ||
|
||
Then, select the RUEG-TR_1.0-SNAPSHOT corpus. | ||
|
||
In the search form, you can type the following: | ||
|
||
``` | ||
norm _o_ pos_lang _o_ cu | ||
``` | ||
|
||
And then export the data with the following additional settings: | ||
|
||
``` | ||
Annotation Keys: norm | ||
Parameters: metakeys=doc | ||
``` | ||
|
||
You should now have a CSV-file with all the data we have used in our study. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
library(tidyverse) | ||
|
||
# Read the tab-separated export, discard every column whose name contains
# "span", shorten the remaining column names, and derive per-document
# metadata from the document name (`doc`). Only spoken productions are kept.
data <- read_tsv("data/export_01.txt") %>%
  select(-contains("span")) %>%
  rename(
    tok_id = `1_id`,
    tok = `1_anno_default_ns:norm`,
    doc = `1_meta_annis:doc`,
    pos_id = `2_id`,
    pos = `2_anno_default_ns:pos_lang`,
    cu_id = `3_id`,
    cu = `3_anno_default_ns:cu`
  ) %>%
  mutate(
    # Country prefix of the document name, in either capitalisation.
    Group = case_when(
      startsWith(doc, "DE") | startsWith(doc, "De") ~ "Germany",
      startsWith(doc, "US") | startsWith(doc, "Us") ~ "USA",
      startsWith(doc, "TU") | startsWith(doc, "Tu") ~ "Turkey"
    ),
    # "_f" is tested first, so a name containing both markers is "formal".
    Register = case_when(
      grepl("_f", doc) ~ "formal",
      grepl("_i", doc) ~ "informal"
    ),
    mode = case_when(
      grepl("_fs", doc) | grepl("_is", doc) ~ "spoken",
      grepl("_fw", doc) | grepl("_iw", doc) ~ "written"
    ),
    # The first six characters of the document name serve as the speaker id.
    Speaker = substr(doc, start = 1, stop = 6)
  ) %>%
  filter(mode == "spoken")
|
||
########################### | ||
### Preprocessing ### | ||
########################### | ||
# Indicator columns (1/0): `discourse` marks tokens whose POS tag is "CO";
# `hesitation` marks the subset of those tokens whose form is "e".
data <- data %>%
  mutate(
    discourse = as.numeric(pos == "CO"),
    hesitation = as.numeric(pos == "CO" & tok == "e")
  )
|
||
# Filtering for stacked discourse/hesitation markers: a marker token is
# dropped when the token directly after it is also a marker in the same
# utterance (same cu_id), so only the last marker of each run survives.
#
# This replaces a row-by-row loop that read data[i + 1, ] on the last row.
# That lookup yields NA, so `if` failed whenever the final token was a marker.
# The loop also grew `filtered` with rbind() on every iteration and iterated
# over 1:nrow(data), which misbehaves on empty input. lead() with an explicit
# default covers the boundary in one vectorised pass.
filtered <- data %>%
  mutate(
    stacked_marker = discourse == 1 &
      dplyr::lead(discourse, default = 0) == 1 &
      cu_id == dplyr::lead(cu_id)
  ) %>%
  # `%in% TRUE` treats an undecidable (NA) comparison as "keep the row".
  filter(!(stacked_marker %in% TRUE)) %>%
  select(-stacked_marker)

# Position of each remaining token inside its utterance:
#   "Initial" - first row overall, or cu_id differs from the previous row
#   "Final"   - last row overall, or cu_id differs from the next row
#   "Medial"  - everything else
# "Initial" is tested first, so a one-token utterance is labelled "Initial".
# The table boundaries are handled explicitly; the previous loop compared
# against a non-existent row and could leave the last token's Position NA.
# NOTE(review): assumes cu_id has no missing values - confirm for new exports.
filtered <- filtered %>%
  mutate(
    Position = case_when(
      row_number() == 1 | cu_id != dplyr::lag(cu_id) ~ "Initial",
      row_number() == n() | cu_id != dplyr::lead(cu_id) ~ "Final",
      TRUE ~ "Medial"
    )
  )
|
||
# Utterance length: number of non-marker tokens per cu_id, counted on the
# unfiltered token table and reduced to one row per cu_id.
nums <- data %>%
  filter(discourse != 1) %>%
  group_by(cu_id) %>%
  mutate(utt_length = n()) %>%
  select(cu_id, utt_length) %>%
  distinct()

# Attach the length to every kept token. Utterances with no non-marker token
# have no match in `nums` and get length 0. The length is then standardised
# over all rows.
final_data <- left_join(filtered, nums, by = "cu_id") %>%
  mutate(
    utt_length = ifelse(is.na(utt_length), 0, utt_length),
    z_uttLength = (utt_length - mean(utt_length)) / sd(utt_length)
  )

write_csv(final_data, "data/data_preprocessed_dis.csv")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
library(tidyverse) | ||
library(ggdist) | ||
library(gghalves) | ||
library(viridis) | ||
library(xtable) | ||
|
||
# Preprocessed token table written by the preprocessing script.
data <- read_csv("data/data_preprocessed_dis.csv")

# Frequency of each token form among the discourse markers.
data %>%
  filter(discourse == 1) %>%
  group_by(tok) %>%
  count()

# Number of hesitations by utterance length, coloured by group.
data %>%
  filter(hesitation == 1) %>%
  count(utt_length, Group, hesitation) %>%
  ggplot(aes(x = utt_length, y = n, color = Group)) +
  geom_point() +
  scale_color_viridis(discrete = TRUE, end = 0.7)

# Distribution of utterance length per group: box plot, raw points on one
# side and a half-eye density nudged to the other, drawn horizontally.
ggplot(data, aes(x = Group, y = utt_length, color = Group, fill = Group)) +
  geom_boxplot(width = .2, fill = "white", size = 1, outlier.shape = NA) +
  geom_half_point(side = "l", range_scale = .25, alpha = .5, size = 0.1) +
  stat_halfeye(
    adjust = 1, width = .5, color = NA,
    position = position_nudge(x = .15)
  ) +
  coord_flip() +
  scale_fill_viridis(discrete = TRUE, end = 0.7) +
  scale_color_viridis(discrete = TRUE, end = 0.7)
|
||
## dis
# Token counts per Group x Register x discourse status, with one column per
# utterance position plus a row total.
# values_fill = 0: a combination that never occurs in some position is a count
# of zero. Without it the cell is NA, which also turns `total` into NA.
data_grouped_dis <- data %>%
  count(Group, Position, Register, discourse) %>%
  pivot_wider(names_from = Position, values_from = n, values_fill = 0) %>%
  mutate(
    discourse = as.character(discourse),
    total = Initial + Medial + Final
  ) %>%
  arrange(discourse) %>%
  relocate(discourse, Group, Register, Initial, Medial, Final, total)

# Column sums over the non-marker rows. The columns are selected by name
# rather than by position so a change in column order cannot silently sum
# the wrong columns.
count_dis <- data_grouped_dis %>%
  filter(discourse == "0")
rel_count <- count_dis[, c("Initial", "Medial", "Final", "total")] %>%
  colSums() %>%
  as_tibble()

# LaTeX table of the counts.
print(xtable(data_grouped_dis), include.rownames = FALSE)
|
||
## hes
# Token counts per Group x hesitation status, one column per utterance
# position, printed as a LaTeX table.
data_grouped_hes <- data %>%
  group_by(Group, Position, hesitation) %>%
  count() %>%
  pivot_wider(names_from = Position, values_from = n) %>%
  mutate(hesitation = as.character(hesitation))

hes_table <- xtable(data_grouped_hes)
print(hes_table, include.rownames = FALSE)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
library(tidyverse) | ||
library(ggdist) | ||
#library(patchwork) | ||
library(viridis) | ||
|
||
# Number of draws per prior; the seed is fixed so the figures are reproducible.
n <- 1e5
set.seed(42)

# Density of draws from the intercept prior.
# The mean and sd are declared once and every label is built from them.
# Previously the draws came from Normal(-2.3, 0.3) while the group label and
# title said Normal(-3, 1), and the title named the parameter "β" although the
# axis and group label describe the intercept (alpha).
# NOTE(review): confirm that Normal(-2.3, 0.3) is the intended prior; if it is
# Normal(-3, 1), change the two constants below and the labels will follow.
intercept_mean <- -2.3
intercept_sd <- 0.3
intercept_label <- sprintf("Normal(%s, %s)", intercept_mean, intercept_sd)

intercepts <- tibble(x = rnorm(n, intercept_mean, intercept_sd)) %>%
  mutate(group = paste0("alpha%~% ", intercept_label)) %>%
  ggplot(aes(x = x)) +
  geom_density(aes(fill = group)) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.7, end = 0.7) +
  scale_y_continuous(breaks = NULL) +
  ylab("Density of values") +
  scale_x_continuous(
    name = "Intercept values on log-scale",
    breaks = seq(from = -6, to = 1, by = 0.5)
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 14)
  ) +
  labs(title = paste0("α ~ ", intercept_label))
|
||
######################################### | ||
### influence of predictors ### | ||
######################################### | ||
# Density of draws from the prior on the predictor coefficients.
# The group label now states Normal(0, 2), matching both the draws and the
# plot title; it previously read Normal(0, 0.3). The legend is hidden, so the
# rendered figure is unchanged. Only the stored label is corrected.
predictors <- tibble(x = rnorm(n, 0, 2)) %>%
  mutate(group = "beta%~% Normal(0, 2)") %>%
  ggplot(aes(x = x)) +
  geom_density(aes(fill = group)) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.7, end = 0.7) +
  scale_y_continuous(breaks = NULL) +
  ylab("Density of values") +
  scale_x_continuous(
    name = "Predictor values on log-scale",
    breaks = seq(from = -5, to = 5, by = 0.5)
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 14)
  ) +
  labs(title = "β ~ Normal(0, 2)")
|
||
# exp(-5) = 0.0067 | ||
# exp(-2) = 0.135 | ||
# exp(0) = 1 | ||
|
||
# Density of draws from the prior on the standard deviation of the varying
# effects. The draws use rate = 10, so the title now reads Exp(10); it
# previously said Exp(8), which contradicted both the sampling call and the
# group label.
sigma <- tibble(x = rexp(n, rate = 10)) %>%
  mutate(group = "sigma%~% exp(10)") %>%
  ggplot(aes(x = x)) +
  geom_density(aes(fill = group)) +
  scale_y_continuous(
    breaks = NULL,
    name = "Density of values"
  ) +
  scale_x_continuous(
    breaks = seq(from = 0, to = 1.2, by = 0.2),
    name = "Standard deviation of varying intercepts on log-scale"
  ) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.7, end = 0.7) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 14)
  ) +
  labs(title = "σ ~ Exp(10)")
|
||
######################################### | ||
### varying slopes matrix ### | ||
######################################### | ||
# Density of draws from the marginal LKJ(12) prior on a single correlation
# of a 2 x 2 correlation matrix.
lkjcorr <- tibble(x = rlkjcorr_marginal(n, K = 2, eta = 12)) %>%
  mutate(group = "R%~% LKJcorr(12)") %>%
  ggplot(aes(x = x, fill = group)) +
  geom_density() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.7, end = 0.7) +
  scale_x_continuous(
    name = "Correlation of varying intercepts and slopes",
    breaks = c(-1, -0.5, 0, 0.5, 1)
  ) +
  scale_y_continuous(breaks = NULL) +
  ylab("Density of values") +
  labs(title = "R ~ LKJcorr(12)") +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 14)
  )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
library(tidyverse) | ||
library(bayesplot) | ||
library(brms) | ||
library(patchwork) | ||
library(tidybayes) | ||
library(viridis) | ||
|
||
# Preprocessed token table; Position is modelled as a factor.
data <- read_csv("data/data_preprocessed_dis.csv") %>%
  mutate(Position = as.factor(Position))

# Model specification, shared by the call below.
dis_formula <- discourse ~ 0 + Position + Register + z_uttLength +
  (0 + Position | Group/Speaker)

dis_priors <- c(
  prior(normal(0, 2), class = b),
  prior(exponential(10), class = sd),
  prior(lkj(12), class = cor)
)

# sample_prior = "only" draws from the priors alone, so this fit is for
# prior checks. The fitted object is stored under `file`.
model_prior <- brm(
  formula = dis_formula,
  data = data,
  family = bernoulli,
  prior = dis_priors,
  sample_prior = "only",
  chains = 4, cores = 4,
  iter = 5000, warmup = 2000,
  control = list(adapt_delta = 0.98),
  seed = 42,
  file = "models/grouped_samples.rds"
)
|
||
color_scheme_set("pink")

# Summary table of the prior-only model, one row per parameter. The row names
# are moved into a `parameter` column.
para_vals <- posterior_summary(model_prior) %>%
  data.frame() %>%
  as_tibble(rownames = "parameter")

# 89% interval bounds per parameter.
# NOTE(review): the renamed columns are the 5.5% and 94.5% bounds, so this is
# presumably an equal-tailed quantile interval rather than a highest-density
# one, despite the hpdi_ prefix - confirm before reporting it as an HPDI.
hpdi_vals <- posterior_interval(model_prior, prob = 0.89) %>%
  data.frame() %>%
  as_tibble(rownames = "parameter") %>%
  rename(hpdi_low = X5.5., hpdi_high = X94.5.)

# Join on the parameter name explicitly instead of on whatever columns the
# two tables happen to share.
para_vals <- para_vals %>% left_join(hpdi_vals, by = "parameter")
|
||
|
||
# Observed outcome vector used as the reference in the prior-predictive plot.
raw_markers <- data$discourse

# Prior-predictive draws, cached on disk.
# The existence check, the read and the write all use one path. Previously
# the check looked for "data/models/prior_pred.rds" while the file was read
# from and written to "models/prior_pred.rds", so a saved cache was not
# picked up and the draws were recomputed on every run.
prior_pred_path <- "models/prior_pred.rds"

if (file.exists(prior_pred_path)) {
  prior_markers <- readRDS(file = prior_pred_path)
} else {
  prior_markers <- posterior_predict(
    model_prior,
    ndraws = 200,
    cores = getOption("mc.cores", 4)
  )
  saveRDS(prior_markers, file = prior_pred_path)
}
|
||
# Overlay the density of the observed outcome with the densities of the
# prior-predictive draws.
prior_overlay <- ppc_dens_overlay(
  y = raw_markers,
  yrep = prior_markers,
  size = 0.7,
  alpha = 0.5,
  adjust = 1
)

# Replace the labels of the plot's first scale so the legend reads
# "data" / "prior".
prior_overlay$scales$scales[[1]]$labels <- c("data", "prior")

ggsave(
  filename = "images/prior_simData.png",
  plot = prior_overlay,
  scale = 1.1,
  width = 2000,
  height = 1400,
  units = "px"
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
library(tidyverse) | ||
library(brms) | ||
|
||
# Preprocessed token table; Position is modelled as a factor.
data <- read_csv("data/data_preprocessed_dis.csv") %>%
  mutate(Position = as.factor(Position))

# Bernoulli model of whether a token is a discourse marker. It has no global
# intercept: one coefficient per Position, plus Register and standardised
# utterance length, with Position effects varying by Group and by Speaker
# within Group. The fitted object is stored under `file`.
model_grouped <- brm(
  formula = discourse ~ 0 + Position + Register + z_uttLength +
    (0 + Position | Group/Speaker),
  data = data,
  family = bernoulli,
  prior = c(
    prior(normal(0, 2), class = b),
    prior(exponential(10), class = sd),
    prior(lkj(12), class = cor)
  ),
  chains = 8, cores = 8,
  iter = 20000, warmup = 4000,
  control = list(adapt_delta = 0.98),
  seed = 42,
  file = "models/dis_grouped.rds"
)
Oops, something went wrong.