Commit f9a1f36

first upload

FredericBlum committed Apr 17, 2023
1 parent f5a09a9 commit f9a1f36
Showing 26 changed files with 101,851 additions and 0 deletions.
104 changes: 104 additions & 0 deletions .gitignore
@@ -0,0 +1,104 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# MacOS stuff
.DS_Store
34 changes: 34 additions & 0 deletions README.md
@@ -0,0 +1,34 @@
# Exploring individual variation in Turkish heritage speakers’ complex linguistic productions

This repository contains all the code and data necessary to replicate the findings from our study.

## Models

The models are too big to be included in this GitHub release. You can download them from OSF: [https://www.doi.org/10.17605/OSF.IO/6ZCXU](https://www.doi.org/10.17605/OSF.IO/6ZCXU)

## Short guide to replication

You can either use the data in the folder `scripts/data/`, or download it as explained in the next section. Then open the folder `scripts/` and run the scripts in the indicated (numbered) order.
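
For example, a minimal sketch for running the numbered scripts in order from the repository root (assuming the current file names in `scripts/`; note that the model-fitting scripts call `brms` and can take a long time):

```
# minimal sketch: run the numbered analysis scripts in order
setwd("scripts")
for (f in sort(list.files(pattern = "^[0-9]+_.*\\.R$"))) {
  source(f, echo = TRUE)
}
```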

## How to download the original data

First, open the following link:

[https://korpling.german.hu-berlin.de/annis3/#c=rueg](https://korpling.german.hu-berlin.de/annis3/#c=rueg)

Then, select the RUEG-TR_1.0-SNAPSHOT corpus.

In the search form, enter the following query:

```
norm _o_ pos_lang _o_ cu
```

Then export the data with the following additional settings:

```
Annotation Keys: norm
Parameters: metakeys=doc
```

You should now have a CSV file containing all the data used in our study.
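
The preprocessing script (`scripts/01_preprocessing.R`) expects this export as a tab-separated file at `data/export_01.txt` (relative to `scripts/`). A quick sanity check, assuming you save the export under that path, could look like this:

```
# sketch: save the ANNIS export as scripts/data/export_01.txt, then check it loads
library(readr)
export <- read_tsv("scripts/data/export_01.txt")
dim(export)  # number of rows and columns in the export
```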
68 changes: 68 additions & 0 deletions scripts/01_preprocessing.R
@@ -0,0 +1,68 @@
library(tidyverse)

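# read the ANNIS export (tab-separated) and rename the exported columns to readable names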
data <- read_tsv('data/export_01.txt') %>%
select(-contains("span")) %>%
rename(tok_id = "1_id",
tok = "1_anno_default_ns:norm",
doc = "1_meta_annis:doc",
pos_id = "2_id",
pos = "2_anno_default_ns:pos_lang",
cu_id = "3_id",
cu = "3_anno_default_ns:cu")

data <- data %>%
mutate(
Group = case_when(
startsWith(doc, "DE") ~ "Germany",
startsWith(doc, "De") ~ "Germany",
startsWith(doc, "US") ~ "USA",
startsWith(doc, "Us") ~ "USA",
startsWith(doc, "TU") ~ "Turkey",
startsWith(doc, "Tu") ~ "Turkey"
),
Register = case_when(
grepl("_f", doc) ~ "formal",
grepl("_i", doc) ~ "informal"
),
mode = case_when(
grepl("_fs", doc) ~ "spoken",
grepl("_is", doc) ~ "spoken",
grepl("_fw", doc) ~ "written",
grepl("_iw", doc) ~ "written",
)) %>%
mutate(Speaker = substr(doc, start=1, stop=6)) %>%
filter(mode == "spoken")

###########################
### Preprocessing ###
###########################
data <- data %>%
mutate(discourse = ifelse(test = pos == "CO", yes = 1, no = 0),
hesitation = ifelse(test = pos == "CO" & tok == "e", yes = 1, no = 0))

# filtering for multiple initial discourse/hesitation markers
filtered <- tibble()
for(i in 1:nrow(data)) {
  # guard with i < nrow(data) so the last row never indexes past the end of the data
  if(i < nrow(data) &
     data[i, "discourse"] == 1 & data[i+1, "discourse"] == 1 &
     # only filter if in same utterance
     data[i, "cu_id"] == data[i+1, "cu_id"]){}
  else{filtered <- filtered %>% rbind(data[i,])}
}

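# label each remaining token as Initial, Medial, or Final within its communication unit (cu_id)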
for(i in 1:nrow(filtered)) {
filtered[i, "Position"] <- ifelse(
test = as.character(filtered[i, "cu_id"]) != as.character(filtered[i-1, "cu_id"]),
yes = "Initial", no = ifelse(
test = as.character(filtered[i, "cu_id"]) != as.character(filtered[i+1, "cu_id"]),
yes = "Final", no = "Medial"))
}

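# utterance length per communication unit, counting only non-discourse-marker tokens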
nums <- data %>% filter(discourse != 1) %>% group_by(cu_id) %>% mutate(utt_length = n())
nums <- unique(nums[c("cu_id", "utt_length")])

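# attach utterance length to each token and z-standardize it (utterances with only discourse markers get length 0)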
final_data <- filtered %>%
left_join(nums, by = "cu_id") %>%
mutate(utt_length = ifelse(is.na(utt_length), 0, utt_length)) %>%
mutate(z_uttLength = (utt_length - mean(utt_length)) / sd(utt_length))

write_csv(final_data, 'data/data_preprocessed_dis.csv')
44 changes: 44 additions & 0 deletions scripts/02_dataExploration.R
@@ -0,0 +1,44 @@
library(tidyverse)
library(ggdist)
library(gghalves)
library(viridis)
library(xtable)

data <- read_csv('data/data_preprocessed_dis.csv')

data %>% filter(discourse==1) %>% group_by(tok) %>% count()

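# hesitation counts by utterance length and Group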
data %>% group_by(utt_length, Group) %>% count(hesitation) %>% filter(hesitation == 1) %>%
ggplot(aes(x = utt_length, y = n, color = Group)) +
geom_point() +
scale_color_viridis(discrete = TRUE, end = 0.7)

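# distribution of utterance length by Group (boxplot plus half-eye/raincloud layers)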
data %>% ggplot(aes(x = Group, y = utt_length, color = Group, fill = Group)) +
geom_boxplot(width = .2, fill = "white", size = 1, outlier.shape = NA) +
geom_half_point(side = "l", range_scale = .25, alpha = .5, size = 0.1) +
stat_halfeye(adjust = 1, width = .5, color = NA, position = position_nudge(x = .15)) +
coord_flip() +
scale_fill_viridis(discrete = TRUE, end = 0.7) +
scale_color_viridis(discrete = TRUE, end = 0.7)

## discourse markers: counts by Group, Position, and Register
data_grouped_dis <- data %>% group_by(Group, Position, Register, discourse) %>% count() %>%
pivot_wider(names_from = Position, values_from = n) %>%
mutate(discourse = as.character(discourse),
total = Initial + Medial + Final) %>%
arrange(discourse) %>%
relocate(discourse, Group, Register, Initial, Medial, Final, total)

count_dis <- data_grouped_dis %>%
filter(discourse == 0)
rel_count <- colSums(count_dis[, c(4:7)]) %>% as_tibble()


print(xtable(data_grouped_dis), include.rownames=FALSE)

## hesitation markers: counts by Group and Position
data_grouped_hes <- data %>% group_by(Group, Position, hesitation) %>% count() %>%
mutate(hesitation = as.character(hesitation)) %>%
pivot_wider(names_from = Position, values_from = n)

print(xtable(data_grouped_hes), include.rownames=FALSE)
74 changes: 74 additions & 0 deletions scripts/03_priors.R
@@ -0,0 +1,74 @@
library(tidyverse)
library(ggdist)
#library(patchwork)
library(viridis)

n = 1e5
set.seed(42)


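# density plots of draws from the candidate priors (values on the log-odds scale)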
intercepts <- tibble(x = c(rnorm(n, -2.3, 0.3))) %>%
mutate(group = 'alpha%~% Normal(-3, 1)') %>%
ggplot(aes(x = x)) +
geom_density(aes(fill = group)) +
scale_fill_viridis(discrete = T, alpha = 0.7, end = 0.7) +
scale_y_continuous(breaks = NULL) +
ylab("Density of values") +
scale_x_continuous(name = "Intercept values on log-scale",
#limits = c(-1.25, 1.25),
breaks = seq(from = -6, to = 1, by = 0.5)) +
theme(legend.position = "none",
plot.title = element_text(size = 14)) +
labs(title = "α ~ Normal(-3, 1)")

#########################################
### influence of predictors ###
#########################################
predictors <- tibble(x = c(rnorm(n, 0, 2))) %>%
mutate(group = 'beta%~% Normal(0, 2)') %>%
ggplot(aes(x = x)) +
geom_density(aes(fill = group)) +
scale_fill_viridis(discrete = T, alpha = 0.7, end = 0.7) +
scale_y_continuous(breaks = NULL) +
ylab("Density of values") +
scale_x_continuous(name = "Predictor values on log-scale",
#limits = c(-5, 5),
breaks = seq(from = -5, to = 5, by = 0.5)) +
theme(legend.position = "none",
plot.title = element_text(size = 14)) +
labs(title = "β ~ Normal(0, 2)")

# exp(-5) ≈ 0.007
# exp(-2) ≈ 0.135
# exp(0)  = 1

sigma <- rexp(n, rate = 10) %>%
tibble() %>%
mutate(group = 'sigma%~% exp(10)') %>%
ggplot(aes(x=.)) +
geom_density(aes(fill = group)) +
scale_y_continuous(breaks = NULL,
name = "Density of values") +
scale_x_continuous(breaks = seq(from = 0, to = 1.2, by = 0.2),
#limits = c(0, 2),
name = "Standard deviation of varying intercepts on log-scale") +
scale_fill_viridis(discrete = T, alpha = 0.7, end = 0.7) +
theme(legend.position = "none",
plot.title = element_text(size = 14)) +
labs(title = "σ ~ Exp(10)")

#########################################
### varying slopes matrix ###
#########################################
lkjcorr <- rlkjcorr_marginal(n, K = 2, eta = 12) %>% tibble(x = .) %>%
mutate(group = 'R%~% LKJcorr(12)') %>%
ggplot(aes(x = x, fill = group)) +
geom_density() +
scale_y_continuous(breaks = NULL) +
scale_x_continuous(name = "Correlation of varying intercepts and slopes",
breaks = c(-1, -0.5, 0, 0.5, 1)) +
scale_fill_viridis(discrete = T, alpha = 0.7, end = 0.7) +
theme(legend.position = "none",
plot.title = element_text(size = 14)) +
ylab("Density of values") +
labs(title = "R ~ LKJcorr(12)")
50 changes: 50 additions & 0 deletions scripts/04_priorPredictions.R
@@ -0,0 +1,50 @@
library(tidyverse)
library(bayesplot)
library(brms)
library(patchwork)
library(tidybayes)
library(viridis)

data <- read_csv('data/data_preprocessed_dis.csv') %>% mutate(Position = as.factor(Position))

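# prior-only model: sample_prior = "only" makes brms sample from the priors without conditioning on the data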
model_prior <-
brm(data = data, family = bernoulli,
formula = discourse ~ 0 + Position + Register + z_uttLength +
(0 + Position | Group/Speaker),
prior = c(prior(normal(0, 2), class = b),
prior(exponential(10), class = sd),
prior(lkj(12), class = cor)),
iter = 5000, warmup = 2000, chains = 4, cores = 4,
control = list(adapt_delta = 0.98),
sample_prior = "only",
file = "models/grouped_samples.rds",
seed = 42)

color_scheme_set("pink")

para_vals <- posterior_summary(model_prior) %>%
data.frame() %>% as_tibble(rownames = "parameter")

hpdi_vals <- posterior_interval(model_prior, prob=0.89) %>%
data.frame() %>% as_tibble(rownames = "parameter") %>%
rename(hpdi_low = X5.5., hpdi_high = X94.5.)

para_vals <- para_vals %>% left_join(hpdi_vals)


raw_markers <- data %>% .$discourse

if (file.exists("models/prior_pred.rds")) {
prior_markers <- readRDS(file = "models/prior_pred.rds")
} else{
prior_markers <- posterior_predict(model_prior, ndraws = 200,
cores = getOption("mc.cores", 4))
saveRDS(prior_markers, file = "models/prior_pred.rds")
}

prior_overlay <- ppc_dens_overlay(raw_markers, prior_markers,
alpha = 0.5, size = 0.7, adjust = 1)
prior_overlay$scales$scales[[1]]$labels <- c("data", "prior")

ggsave("images/prior_simData.png", prior_overlay, scale = 1.1,
width = 2000, height = 1400, units = "px")
18 changes: 18 additions & 0 deletions scripts/05_model.R
@@ -0,0 +1,18 @@
library(tidyverse)
library(brms)

data <- read_csv('data/data_preprocessed_dis.csv') %>%
mutate(Position=as.factor(Position))

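# Bernoulli model of discourse-marker use: Position effects vary by Group and by Speaker within Group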
model_grouped <- brm(
data=data, family=bernoulli,
formula=discourse ~ 0 + Position + Register + z_uttLength +
(0 + Position | Group/Speaker),
prior=c(prior(normal(0, 2), class=b),
prior(exponential(10), class=sd),
prior(lkj(12), class=cor)),
iter=20000, warmup=4000, chains=8, cores=8,
control=list(adapt_delta=0.98),
file="models/dis_grouped.rds",
seed=42
)
