Commit f9a1f36

first upload

FredericBlum committed Apr 17, 2023
1 parent f5a09a9 commit f9a1f36
Showing 26 changed files with 101,851 additions and 0 deletions.
104 changes: 104 additions & 0 deletions .gitignore
@@ -0,0 +1,104 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# MacOS stuff
.DS_Store
34 changes: 34 additions & 0 deletions README.md
@@ -0,0 +1,34 @@
# Exploring individual variation in Turkish heritage speakers’ complex linguistic productions

This repository contains all the code and data necessary to replicate the findings from our study.

## Models

The models are too big to be included in this GitHub release. You can download them from OSF: [https://www.doi.org/10.17605/OSF.IO/6ZCXU](https://www.doi.org/10.17605/OSF.IO/6ZCXU)

## Short guide to replication

You can either use the data in the folder `scripts/data/`, or download it as explained in the next section. Then open the folder `scripts/` and run the scripts in the indicated (numbered) order.
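
For example, a minimal sketch for running the numbered scripts in order from the repository root (assuming the current file names in `scripts/`; note that the model-fitting scripts call `brms` and can take a long time):

```
# minimal sketch: run the numbered analysis scripts in order
setwd("scripts")
for (f in sort(list.files(pattern = "^[0-9]+_.*\\.R$"))) {
  source(f, echo = TRUE)
}
```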

## How to download the original data

First, open the following link:

[https://korpling.german.hu-berlin.de/annis3/#c=rueg](https://korpling.german.hu-berlin.de/annis3/#c=rueg)

Then, select the RUEG-TR_1.0-SNAPSHOT corpus.

In the search form, enter the following query:

```
norm _o_ pos_lang _o_ cu
```

Then export the data with the following additional settings:

```
Annotation Keys: norm
Parameters: metakeys=doc
```

You should now have a CSV file containing all the data used in our study.
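
The preprocessing script (`scripts/01_preprocessing.R`) expects this export as a tab-separated file at `data/export_01.txt` (relative to `scripts/`). A quick sanity check, assuming you save the export under that path, could look like this:

```
# sketch: save the ANNIS export as scripts/data/export_01.txt, then check it loads
library(readr)
export <- read_tsv("scripts/data/export_01.txt")
dim(export)  # number of rows and columns in the export
```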
68 changes: 68 additions & 0 deletions scripts/01_preprocessing.R
@@ -0,0 +1,68 @@
library(tidyverse)

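# read the ANNIS export (tab-separated) and rename the exported columns to readable names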
data <- read_tsv('data/export_01.txt') %>%
select(-contains("span")) %>%
rename(tok_id = "1_id",
tok = "1_anno_default_ns:norm",
doc = "1_meta_annis:doc",
pos_id = "2_id",
pos = "2_anno_default_ns:pos_lang",
cu_id = "3_id",
cu = "3_anno_default_ns:cu")

data <- data %>%
mutate(
Group = case_when(
startsWith(doc, "DE") ~ "Germany",
startsWith(doc, "De") ~ "Germany",
startsWith(doc, "US") ~ "USA",
startsWith(doc, "Us") ~ "USA",
startsWith(doc, "TU") ~ "Turkey",
startsWith(doc, "Tu") ~ "Turkey"
),
Register = case_when(
grepl("_f", doc) ~ "formal",
grepl("_i", doc) ~ "informal"
),
mode = case_when(
grepl("_fs", doc) ~ "spoken",
grepl("_is", doc) ~ "spoken",
grepl("_fw", doc) ~ "written",
grepl("_iw", doc) ~ "written",
)) %>%
mutate(Speaker = substr(doc, start=1, stop=6)) %>%
filter(mode == "spoken")

###########################
### Preprocessing ###
###########################
data <- data %>%
mutate(discourse = ifelse(test = pos == "CO", yes = 1, no = 0),
hesitation = ifelse(test = pos == "CO" & tok == "e", yes = 1, no = 0))

# filtering for multiple initial discourse/hesitation markers
filtered <- tibble()
for(i in 1:nrow(data)) {
  # guard with i < nrow(data) so the last row never indexes past the end of the data
  if(i < nrow(data) &
     data[i, "discourse"] == 1 & data[i+1, "discourse"] == 1 &
     # only filter if in same utterance
     data[i, "cu_id"] == data[i+1, "cu_id"]){}
  else{filtered <- filtered %>% rbind(data[i,])}
}

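# label each remaining token as Initial, Medial, or Final within its communication unit (cu_id)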
for(i in 1:nrow(filtered)) {
filtered[i, "Position"] <- ifelse(
test = as.character(filtered[i, "cu_id"]) != as.character(filtered[i-1, "cu_id"]),
yes = "Initial", no = ifelse(
test = as.character(filtered[i, "cu_id"]) != as.character(filtered[i+1, "cu_id"]),
yes = "Final", no = "Medial"))
}

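# utterance length per communication unit, counting only non-discourse-marker tokens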
nums <- data %>% filter(discourse != 1) %>% group_by(cu_id) %>% mutate(utt_length = n())
nums <- unique(nums[c("cu_id", "utt_length")])

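# attach utterance length to each token and z-standardize it (utterances with only discourse markers get length 0)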
final_data <- filtered %>%
left_join(nums, by = "cu_id") %>%
mutate(utt_length = ifelse(is.na(utt_length), 0, utt_length)) %>%
mutate(z_uttLength = (utt_length - mean(utt_length)) / sd(utt_length))

write_csv(final_data, 'data/data_preprocessed_dis.csv')
44 changes: 44 additions & 0 deletions scripts/02_dataExploration.R
@@ -0,0 +1,44 @@
library(tidyverse)
library(ggdist)
library(gghalves)
library(viridis)
library(xtable)

data <- read_csv('data/data_preprocessed_dis.csv')

data %>% filter(discourse==1) %>% group_by(tok) %>% count()

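# hesitation counts by utterance length and Group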
data %>% group_by(utt_length, Group) %>% count(hesitation) %>% filter(hesitation == 1) %>%
ggplot(aes(x = utt_length, y = n, color = Group)) +
geom_point() +
scale_color_viridis(discrete = TRUE, end = 0.7)

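# distribution of utterance length by Group (boxplot plus half-eye/raincloud layers)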
data %>% ggplot(aes(x = Group, y = utt_length, color = Group, fill = Group)) +
geom_boxplot(width = .2, fill = "white", size = 1, outlier.shape = NA) +
geom_half_point(side = "l", range_scale = .25, alpha = .5, size = 0.1) +
stat_halfeye(adjust = 1, width = .5, color = NA, position = position_nudge(x = .15)) +
coord_flip() +
scale_fill_viridis(discrete = TRUE, end = 0.7) +
scale_color_viridis(discrete = TRUE, end = 0.7)

## discourse markers: counts by Group, Position, and Register
data_grouped_dis <- data %>% group_by(Group, Position, Register, discourse) %>% count() %>%
pivot_wider(names_from = Position, values_from = n) %>%
mutate(discourse = as.character(discourse),
total = Initial + Medial + Final) %>%
arrange(discourse) %>%
relocate(discourse, Group, Register, Initial, Medial, Final, total)

count_dis <- data_grouped_dis %>%
filter(discourse == 0)
rel_count <- colSums(count_dis[, c(4:7)]) %>% as_tibble()


print(xtable(data_grouped_dis), include.rownames=FALSE)

## hesitation markers: counts by Group and Position
data_grouped_hes <- data %>% group_by(Group, Position, hesitation) %>% count() %>%
mutate(hesitation = as.character(hesitation)) %>%
pivot_wider(names_from = Position, values_from = n)

print(xtable(data_grouped_hes), include.rownames=FALSE)
74 changes: 74 additions & 0 deletions scripts/03_priors.R
@@ -0,0 +1,74 @@
library(tidyverse)
library(ggdist)
#library(patchwork)
library(viridis)

n = 1e5
set.seed(42)


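# density plots of draws from the candidate priors (values on the log-odds scale)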
intercepts <- tibble(x = c(rnorm(n, -2.3, 0.3))) %>%
mutate(group = 'alpha%~% Normal(-3, 1)') %>%
ggplot(aes(x = x)) +
geom_density(aes(fill = group)) +
scale_fill_viridis(discrete = T, alpha = 0.7, end = 0.7) +
scale_y_continuous(breaks = NULL) +
ylab("Density of values") +
scale_x_continuous(name = "Intercept values on log-scale",
#limits = c(-1.25, 1.25),
breaks = seq(from = -6, to = 1, by = 0.5)) +
theme(legend.position = "none",
plot.title = element_text(size = 14)) +
labs(title = "α ~ Normal(-3, 1)")

#########################################
### influence of predictors ###
#########################################
predictors <- tibble(x = c(rnorm(n, 0, 2))) %>%
mutate(group = 'beta%~% Normal(0, 2)') %>%
ggplot(aes(x = x)) +
geom_density(aes(fill = group)) +
scale_fill_viridis(discrete = T, alpha = 0.7, end = 0.7) +
scale_y_continuous(breaks = NULL) +
ylab("Density of values") +
scale_x_continuous(name = "Predictor values on log-scale",
#limits = c(-5, 5),
breaks = seq(from = -5, to = 5, by = 0.5)) +
theme(legend.position = "none",
plot.title = element_text(size = 14)) +
labs(title = "β ~ Normal(0, 2)")

# exp(-5) ≈ 0.007
# exp(-2) ≈ 0.135
# exp(0)  = 1

sigma <- rexp(n, rate = 10) %>%
tibble() %>%
mutate(group = 'sigma%~% exp(10)') %>%
ggplot(aes(x=.)) +
geom_density(aes(fill = group)) +
scale_y_continuous(breaks = NULL,
name = "Density of values") +
scale_x_continuous(breaks = seq(from = 0, to = 1.2, by = 0.2),
#limits = c(0, 2),
name = "Standard deviation of varying intercepts on log-scale") +
scale_fill_viridis(discrete = T, alpha = 0.7, end = 0.7) +
theme(legend.position = "none",
plot.title = element_text(size = 14)) +
labs(title = "σ ~ Exp(10)")

#########################################
### varying slopes matrix ###
#########################################
lkjcorr <- rlkjcorr_marginal(n, K = 2, eta = 12) %>% tibble(x = .) %>%
mutate(group = 'R%~% LKJcorr(12)') %>%
ggplot(aes(x = x, fill = group)) +
geom_density() +
scale_y_continuous(breaks = NULL) +
scale_x_continuous(name = "Correlation of varying intercepts and slopes",
breaks = c(-1, -0.5, 0, 0.5, 1)) +
scale_fill_viridis(discrete = T, alpha = 0.7, end = 0.7) +
theme(legend.position = "none",
plot.title = element_text(size = 14)) +
ylab("Density of values") +
labs(title = "R ~ LKJcorr(12)")
50 changes: 50 additions & 0 deletions scripts/04_priorPredictions.R
@@ -0,0 +1,50 @@
library(tidyverse)
library(bayesplot)
library(brms)
library(patchwork)
library(tidybayes)
library(viridis)

data <- read_csv('data/data_preprocessed_dis.csv') %>% mutate(Position = as.factor(Position))

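# prior-only model: sample_prior = "only" makes brms sample from the priors without conditioning on the data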
model_prior <-
brm(data = data, family = bernoulli,
formula = discourse ~ 0 + Position + Register + z_uttLength +
(0 + Position | Group/Speaker),
prior = c(prior(normal(0, 2), class = b),
prior(exponential(10), class = sd),
prior(lkj(12), class = cor)),
iter = 5000, warmup = 2000, chains = 4, cores = 4,
control = list(adapt_delta = 0.98),
sample_prior = "only",
file = "models/grouped_samples.rds",
seed = 42)

color_scheme_set("pink")

para_vals <- posterior_summary(model_prior) %>%
data.frame() %>% as_tibble(rownames = "parameter")

hpdi_vals <- posterior_interval(model_prior, prob=0.89) %>%
data.frame() %>% as_tibble(rownames = "parameter") %>%
rename(hpdi_low = X5.5., hpdi_high = X94.5.)

para_vals <- para_vals %>% left_join(hpdi_vals)


raw_markers <- data %>% .$discourse

if (file.exists("models/prior_pred.rds")) {
prior_markers <- readRDS(file = "models/prior_pred.rds")
} else{
prior_markers <- posterior_predict(model_prior, ndraws = 200,
cores = getOption("mc.cores", 4))
saveRDS(prior_markers, file = "models/prior_pred.rds")
}

prior_overlay <- ppc_dens_overlay(raw_markers, prior_markers,
alpha = 0.5, size = 0.7, adjust = 1)
prior_overlay$scales$scales[[1]]$labels <- c("data", "prior")

ggsave("images/prior_simData.png", prior_overlay, scale = 1.1,
width = 2000, height = 1400, units = "px")
18 changes: 18 additions & 0 deletions scripts/05_model.R
@@ -0,0 +1,18 @@
library(tidyverse)
library(brms)

data <- read_csv('data/data_preprocessed_dis.csv') %>%
mutate(Position=as.factor(Position))

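# Bernoulli model of discourse-marker use: Position effects vary by Group and by Speaker within Group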
model_grouped <- brm(
data=data, family=bernoulli,
formula=discourse ~ 0 + Position + Register + z_uttLength +
(0 + Position | Group/Speaker),
prior=c(prior(normal(0, 2), class=b),
prior(exponential(10), class=sd),
prior(lkj(12), class=cor)),
iter=20000, warmup=4000, chains=8, cores=8,
control=list(adapt_delta=0.98),
file="models/dis_grouped.rds",
seed=42
)
