/
Main.R
128 lines (89 loc) · 4.69 KB
/
Main.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
source('./functions.R')
options(scipen=999)
# GLOBAL atguments
samples_2x_File="test_dataset/inputData/2xSamples"
samples_4x_File="test_dataset/inputData/4xSamples"
Parent_A="amara"
between_homeolog_distance = 4
use_between_parents_distance=FALSE
between_parents_distance = 2
# read and parse parents, samples, and patterns
pp = read.delim(samples_2x_File, header = F, sep = ":", row.names = 1, stringsAsFactors = FALSE)
samples2x=list()
for (p in 1:length(row.names(pp))) {
if (length(strsplit(pp$V2[p], split = ",")[[1]]) == 1) {
eval( parse (text= paste("samples2x$", row.names(pp)[p], " = c(\"", pp$V2[p], "\")",sep = "" )))
next
}
eval( parse (text= paste("samples2x$", row.names(pp)[p], " = ", paste(strsplit(pp$V2[p], split = ","), sep = "\", \""), sep = "") ))
}
rm (pp)
samples4x = read.delim(samples_4x_File, sep = ",", header = F, stringsAsFactors = FALSE)
# eval(parse( text = paste("parent_A = samples2x$",Parent_A, sep = "")))
library(ape)
library(seqinr)
library(filelock)
library(parallel)
#
# sortAllels
#
# set local atguments
seqDir="test_dataset/inputData/seq"
treeDir="test_dataset/inputData/trees"
patterns_exonsFile="test_dataset/inputData/patterns_exons"
if_below_treshold = "mask" # "mask" or "remove"
ResDir="test_dataset/1_AB_Homeologs_exons_seq"
logfile = "test_dataset/1_AB_Homeologs_exons_seq/1_AB_Homeologs_exons_seq.log"
dir.create(ResDir)
patterns_exons = read.delim(patterns_exonsFile, header = FALSE, stringsAsFactors = FALSE)
cat("", samples4x$V1, sep = "\t", file = logfile)
# using multiple threads
mclapply(patterns_exons$V1, sortAllels, treeDir=treeDir, seqDir = seqDir, resdir=ResDir, samples2x=samples2x, samples4x=samples4x, Parent_A=Parent_A, if_below_treshold=if_below_treshold, between_homeolog_distance=between_homeolog_distance, logfile=logfile, mc.cores = 4)
# same in single thread
for (pattern in patterns_exons$V1) {
cat(pattern, "\n")
sortAllels(pattern, treeDir=treeDir, seqDir = seqDir, resdir=ResDir, samples2x=samples2x, samples4x=samples4x, Parent_A=Parent_A, if_below_treshold=if_below_treshold, between_homeolog_distance=between_homeolog_distance, logfile=logfile )
}
#
# Concatenation using AMAS (https://github.com/marekborowiec/AMAS)
# mkdir test_dataset/2_AB_Homeologs_genes_seq
# while read r; do AMAS.py concat -i test_dataset/1_AB_Homeologs_exons_seq/$r* -f fasta -d dna -t test_dataset/2_AB_Homeologs_genes_seq/$r.fasta; done < test_dataset/inputData/patterns_genes
#
#
# confirmSorting (!! you will need trees created form CONCATENATED genes)
#
# set local atguments
patterns_genesFile="test_dataset/inputData/patterns_genes"
seqDir="test_dataset/2_AB_Homeologs_genes_seq"
treeDir="test_dataset/3_AB_Homeologs_genes_trees"
if_below_treshold = "remove"
ResDir="test_dataset/4_AB_Homeologs_genes_seq_confirmed"
logfile = "test_dataset/4_AB_Homeologs_genes_seq_confirmed/4_AB_Homeologs_genes_seq_confirmed.log"
dir.create(ResDir)
patterns_genes = read.delim(patterns_genesFile, header = FALSE, stringsAsFactors = FALSE)
cat("", samples4x$V1, sep = "\t", file = logfile)
# using multiple threads
mclapply(patterns_genes$V1, confirmSorting, treeDir=treeDir, seqDir = seqDir, resdir=ResDir, samples2x=samples2x, samples4x=samples4x, Parent_A=Parent_A, if_below_treshold=if_below_treshold, between_homeolog_distance=between_homeolog_distance, logfile=logfile, mc.cores = 4)
# same in single thread
for (pattern in patterns_genes$V1) {
cat(pattern, "\n")
confirmSorting(pattern, treeDir=treeDir, seqDir = seqDir, resdir=ResDir, samples2x=samples2x, samples4x=samples4x, Parent_A=Parent_A, if_below_treshold=if_below_treshold, between_homeolog_distance=between_homeolog_distance, logfile=logfile )
}
#
# distanceTable (!! you will need trees created form CONFIRMED genes)
#
# set local atguments
patterns_genesFile="test_dataset/inputData/patterns_genes"
treeDir="test_dataset/5_AB_Homeologs_genes_seq_confirmed_trees"
tableFile = "test_dataset/table.txt"
patterns_genes = read.delim(patterns_genesFile, header = FALSE, stringsAsFactors = FALSE)
cat("\t", paste(rep(samples4x$V1, each = length(names(samples2x))*2), collapse = "\t"), "\n", file = tableFile)
cat("sequence\t", rep(paste(paste(names(samples2x), collapse = "_to_A1A2\t"), "_to_A1A2\t", paste(names(samples2x), collapse = "_to_B1B2\t"), "_to_B1B2\t", sep = ""), nrow(samples4x) ), file = tableFile, append = TRUE)
# using multiple threads
mclapply(patterns_genes$V1, distanceTable, treeDir=treeDir, samples2x=samples2x, samples4x=samples4x, logfile=tableFile, mc.cores = 4)
# same in single thread
for (pattern in patterns_genes$V1) {
cat(pattern, "\n")
distanceTable(pattern, treeDir=treeDir, samples2x=samples2x, samples4x=samples4x, logfile=tableFile )
}
# END -------------------------------