Skip to content

Commit

Permalink
fix documentation scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
nakib103 committed May 16, 2023
1 parent 3338125 commit 5ce9609
Show file tree
Hide file tree
Showing 13 changed files with 712 additions and 546 deletions.
66 changes: 66 additions & 0 deletions scripts/docs/data_files/vcf_species_list_109.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
{
"sander_lucioperca": {
"count": "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 975,000 variants\">975 K</span>",
"name": "Pike-perch",
"label": "Sander lucioperca"
},
"ficedula_albicollis": {
"count": "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 37,000 variants\">37 K</span>",
"name": "Collared flycatcher",
"label": "Ficedula albicollis"
},
"macaca_mulatta": {
"count": "<span class=\"vdoc_var_count vdoc_million_1\" title=\"Over 50 million variants\">50 M</span>",
"name": "Macaque",
"label": "Macaca mulatta"
},
"neovison_vison": {
"count": "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 34,000 variants\">34 K</span>",
"name": "American mink",
"label": "Neovison vison"
},
"salmo_salar": {
"count": "<span class=\"vdoc_var_count vdoc_million_2\" title=\"Over 8 million variants\">8 M</span>",
"name": "Atlantic salmon",
"label": "Salmo salar",
"genotype": 1
},
"microtus_ochrogaster": {
"count": "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 8,000 variants\">8 K</span>",
"name": "Prairie vole",
"label": "Microtus ochrogaster"
},
"coturnix_japonica": {
"count": "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 8,000 variants\">8 K</span>",
"name": "Japanese quail",
"label": "Coturnix japonica"
},
"chlorocebus_sabaeus": {
"count": "<span class=\"vdoc_var_count vdoc_million_1\" title=\"Over 62 million variants\">62 M</span>",
"name": "Vervet-AGM",
"label": "Chlorocebus sabaeus",
"genotype": 1
},
"macaca_fascicularis": {
"count": "<span class=\"vdoc_var_count vdoc_million_2\" title=\"Over 1.6 million variants\">1.6 M</span>",
"name": "Crab-eating macaque",
"label": "Macaca fascicularis"
},
"oreochromis_niloticus": {
"count": "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 63,000 variants\">63 K</span>",
"name": "Nile tilapia",
"label": "Oreochromis niloticus"
},
"oryctolagus_cuniculus": {
"count": "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 82,000 variants\">82 K</span>",
"name": "Rabbit",
"label": "Oryctolagus cuniculus",
"genotype": 1
},
"parus_major": {
"count": "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 497,000 variants\">497 K</span>",
"name": "Great Tit",
"label": "Parus major",
"genotype": 1
}
}
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ =head1 CONTACT
}
$html_content .= qq{ <tr$bg>$icon_col<td>$cs_term</td>$examples</tr>\n};
$bg = set_bg();
print STDERR qq{Term "$cs_term" done ($count/$cs_term_count)\n};
# Progress message on STDERR; the quotes around the term are escaped so they
# do not terminate the double-quoted string.
print STDERR "Term \"$cs_term\" done ($count/$cs_term_count)\n";
}

# Four-star rating
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
###############################################################
########## CONFIGURE #######
###############################################################
my ($e_version,$html_file,$hlist,$user,$help);
my ($e_version,$html_file,$hlist,$user,$help,$d_dir);
## EG options
my ($site, $etype);

Expand All @@ -37,31 +37,39 @@
'o=s' => \$html_file,
'help!' => \$help,
'hlist=s' => \$hlist,
'user=s' => \$user
'user=s' => \$user,
'd_dir=s' => \$d_dir
);

## Missing arguments ##
if (!$e_version) {
print "> Error! Please give an Ensembl version, using the option '-v' \n";
print STDERR "> Error! Please give an Ensembl version, using the option '-v' \n";
usage();
}
if (!$html_file) {
print "> Error! Please give an output file using the option '-o'\n";
print STDERR "> Error! Please give an output file using the option '-o'\n";
usage();
}
if (!$hlist) {
print "> Error! Please give the list of host names where the new databases are stored using the option '-hlist'\n";
print STDERR "> Error! Please give the list of host names where the new databases are stored using the option '-hlist'\n";
usage();
}
if (!$user) {
print "> Error! Please give user name using the option '-user'\n";
print STDERR "> Error! Please give user name using the option '-user'\n";
usage();
}
usage() if ($help);

# Get the dir this script is residing in
my $dirname = dirname(__FILE__);

# Get the local dir where the vcf files are located
my $data_dir = "/nfs/production/flicek/ensembl/production/ensemblftp/data_files/vertebrates";

if ($d_dir){
$data_dir = $d_dir;
}

my $vcf_config_file = $dirname . '/../../modules/Bio/EnsEMBL/Variation/DBSQL/vcf_config.json';

# read config from JSON config file
Expand Down Expand Up @@ -140,9 +148,9 @@
# loop over databases
while (my ($dbname) = $sth->fetchrow_array) {
next if ($dbname !~ /^[a-z][a-z_]*_[a-z]+_$db_type\_$e_version\_\d+$/i);
next if ($dbname =~ /^(master_schema|drosophila|saccharomyces)/ || $dbname =~ /^homo_sapiens_$db_type\_\d+_37$/ || $dbname =~ /private/);
next if ($dbname =~ /^(master_schema|drosophila|saccharomyces|ciona)/ || $dbname =~ /^homo_sapiens_$db_type\_\d+_37$/ || $dbname =~ /private/);

print "$dbname\n";
print STDERR "$dbname\n";
$dbname =~ /^(.+)_$db_type/;
my $s_name = $1;

Expand Down Expand Up @@ -309,6 +317,70 @@
########## FUNCTIONS ##########
###############################################################


# Pick a representative file name for a VCF collection.
#
# Despite the name, the choice is not random: when the collection's
# filename_template contains the ###CHR### placeholder, the first entry of the
# collection's chromosomes list is substituted for every occurrence of it.
#
# Arguments : $project - hashref for one collection; the keys read here are
#             'filename_template' and 'chromosomes'
# Returns   : the file name as a string, or undef (an empty list in list
#             context) when there is no template, or when the template needs
#             a chromosome and none is listed
sub get_random_file {
  my ($project) = @_;

  my $filename_template = $project->{filename_template};
  return unless defined $filename_template;

  # A template without the placeholder already names a single file
  return $filename_template unless $filename_template =~ /###CHR###/;

  # An array reference is always true, even when empty, so check its content
  my $chromosomes = $project->{chromosomes};
  return unless ref($chromosomes) eq 'ARRAY' && @{ $chromosomes };

  my $chr = $chromosomes->[0];

  return $filename_template =~ s/###CHR###/$chr/gr;
}

# Run tabix with the given arguments and return its output lines.
#
# The command is started with a list-form pipe open, so no shell is involved
# and file names or URLs are passed to tabix verbatim.
#
# Arguments : $limit - maximum number of lines to read (undef for all lines)
#             @args  - arguments handed to tabix
# Returns   : list of output lines (newlines kept); empty if tabix could not
#             be started or printed nothing
sub _read_tabix_output {
  my ($limit, @args) = @_;
  my @lines;

  open(my $fh, '-|', 'tabix', @args) or return @lines;
  while (my $line = <$fh>) {
    push @lines, $line;
    last if defined $limit && @lines >= $limit;
  }
  # The exit status is deliberately not checked: tabix is expected to be
  # interrupted when we stop reading before it has finished writing.
  close $fh;

  return @lines;
}

# Determine which types of data a VCF collection provides.
#
# Arguments : $project - hashref for one collection; the keys read here are
#             'annotation_type', 'use_as_source' and 'type' (plus whatever
#             get_random_file() reads)
# Returns   : list of type strings, in this order when applicable:
#             the value of annotation_type, "source", "genotype"
sub get_vcf_content_types {
  my ($project) = @_;
  my @types;

  # this ignores the false positive sigpipe error from tabix command
  # (note: this assigns to the global %SIG, so it stays in effect after return)
  $SIG{PIPE} = 'DEFAULT';

  # add if the vcf collection mentions annotation type
  push @types, $project->{annotation_type} if $project->{annotation_type};

  # if use_as_source is set then it is the main source for tracks
  push @types, "source" if $project->{use_as_source};

  # without a file name there is nothing to inspect for genotypes
  my $file = get_random_file($project);
  return @types unless defined $file && length $file;

  # files of "local" collections live below the data directory
  my $file_full_path = $file;
  if (($project->{type} // '') eq "local") {
    $file_full_path = $data_dir . $file_full_path;
  }

  # check the FORMAT lines of the vcf header to see if it declares genotypes
  my $has_genotype = grep { /##FORMAT/ && /ID=GT/ }
    _read_tabix_output(undef, '-D', '-H', $file_full_path);

  # check an actual line for a FORMAT field if the header does not declare it
  unless ($has_genotype) {
    my ($chr) = _read_tabix_output(1, '-D', '-l', $file_full_path);
    chomp $chr if defined $chr;

    # only pass a region when tabix listed at least one sequence name
    my @region = (defined $chr && length $chr) ? ($chr) : ();
    my ($line) = _read_tabix_output(1, '-D', $file_full_path, @region);

    # column 9 of a VCF data line is the FORMAT field
    my $format_field = defined $line ? (split /\t/, $line)[8] : undef;
    $has_genotype = 1 if $format_field;
  }

  push @types, "genotype" if $has_genotype;

  return @types;
}

# Build the project populations structure if it exists
sub get_population_structure {
my $pops = shift;
Expand Down Expand Up @@ -424,6 +496,11 @@ sub get_sub_populations {
sub get_project_populations {

foreach my $project (@{$vcf_config->{'collections'}}) {

# Check that the file has genotype data and is being shown
my @types = get_vcf_content_types($project);
next unless grep /^genotype$/, @types;

my $project_id = $project->{'id'};
next if ($project->{'assembly'} =~ /GRCh37/i || $project->{'annotation_type'} eq 'cadd' || $project->{'annotation_type'} eq 'gerp');

Expand Down Expand Up @@ -607,6 +684,8 @@ sub usage {
-hlist The list of host names (with port) where the new databases are stored, separated by a coma,
e.g. ensembldb.ensembl.org1:1234, ensembldb.ensembl.org2:1234 (Required)
-user MySQL user name (Required)
-d_dir The directory location of where the local vcf files are stored (optional). By default it looks in -
/nfs/production/flicek/ensembl/production/ensemblftp/data_files/vertebrates
} . "\n";
exit(0);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ =head1 CONTACT
# loop over databases
while (my ($dbname) = $sth->fetchrow_array) {
next if ($dbname !~ /^[a-z][a-z_]*_[a-z]+_$db_type\_$db_version\_\d+$/i);
next if ($dbname =~ /^(master_schema|drosophila|saccharomyces)/ || $dbname =~ /^homo_sapiens_$db_type\_\d+_37$/ || $dbname =~ /private/);
next if ($dbname =~ /^(master_schema|drosophila|saccharomyces|ciona)/ || $dbname =~ /^homo_sapiens_$db_type\_\d+_37$/ || $dbname =~ /private/);

# Progress output: written to STDERR so it shares a stream with the
# " ... done" message that completes this line.
print STDERR $dbname;
$dbname =~ /^(.+)_$db_type/;
Expand Down Expand Up @@ -170,7 +170,7 @@ =head1 CONTACT
}

}
print " ... done\n";
print STDERR " ... done\n";
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ =head1 CONTACT
next if ($dbname !~ /^[a-z][a-z_]*_[a-z]+_variation_\d+_\d+$/i);
next if ($dbname =~ /^master_schema/ || $dbname =~ /^homo_sapiens_variation_\d+_37$/ || $dbname =~ /private/);
$db_found ++;
print STDERR $dbname;
print $dbname;
$dbname =~ /^(.+)_variation/;
my $s_name = $1;

Expand Down
9 changes: 6 additions & 3 deletions scripts/misc/sources2html.pl → scripts/docs/sources2html.pl
Original file line number Diff line number Diff line change
Expand Up @@ -264,10 +264,10 @@ =head1 CONTACT
# loop over databases
while (my ($dbname) = $sth->fetchrow_array) {
next if ($dbname !~ /^[a-z][a-z_]*_[a-z0-9]+_variation_\d+_\d+$/i);
next if ($dbname =~ /^(master_schema|drosophila|saccharomyces)/ || $dbname =~ /^homo_sapiens_variation_\d+_37$/ || $dbname =~ /private/);
next if ($dbname =~ /^(master_schema|drosophila|saccharomyces|ciona)/ || $dbname =~ /^homo_sapiens_variation_\d+_37$/ || $dbname =~ /private/);

$db_found ++;
print STDERR $dbname;
print STDERR "${dbname}\n";
$dbname =~ /^(.+)_variation/;
my $s_name = $1;

Expand Down Expand Up @@ -472,7 +472,7 @@ sub source_table {
foreach my $project (@{ $vcf_config->{'collections'} }) {
next if $project->{annotation_type} eq 'cadd' || $project->{annotation_type} eq 'gerp';

if ($project->{species} eq $name) {
if (lc( $project->{species} ) eq $name) {
my ($source, $version, $description, $info, $count, $example_url);

# determine type of data the file has
Expand Down Expand Up @@ -1297,6 +1297,9 @@ sub get_example {
sub get_vcf_content_types {
my ($project) = @_;
my @types;

# this ignores the false positive sigpipe error from tabix command
$SIG{PIPE} = 'DEFAULT';

# add if the vcf collection mentions annotation type
push @types, $project->{annotation_type} if $project->{annotation_type};
Expand Down
20 changes: 10 additions & 10 deletions scripts/misc/species_list.pl → scripts/docs/species_list.pl
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,19 @@ =head1 CONTACT
);

if (!$e_version) {
print "> Error! Please give an Ensembl version, using the option '-v' \n";
print STDERR "> Error! Please give an Ensembl version, using the option '-v' \n";
usage();
}
if (!$html_file) {
print "> Error! Please give an output file using the option '-o'\n";
print STDERR "> Error! Please give an output file using the option '-o'\n";
usage();
}
if (!$hlist) {
print "> Error! Please give the list of host names where the new databases are stored using the option '-hlist'\n";
print STDERR "> Error! Please give the list of host names where the new databases are stored using the option '-hlist'\n";
usage();
}
if (!$user) {
print "> Error! Please give user name using the option '-user'\n";
print STDERR "> Error! Please give user name using the option '-user'\n";
usage();
}

Expand Down Expand Up @@ -122,21 +122,21 @@ =head1 CONTACT
# loop over databases
while (my ($dbname) = $sth->fetchrow_array) {
next if ($dbname !~ /^[a-z][a-z_]*_[a-z0-9]+_$db_type\_$e_version\_\d+$/i);
next if ($dbname =~ /^(master_schema|drosophila|saccharomyces)/ || $dbname =~ /^homo_sapiens_$db_type\_\d+_37$/ || $dbname =~ /private/);
next if ($dbname =~ /^(master_schema|drosophila|saccharomyces|ciona)/ || $dbname =~ /^homo_sapiens_$db_type\_\d+_37$/ || $dbname =~ /private/);

print $dbname;
print STDERR "${dbname}\n";
$dbname =~ /^(.+)_$db_type/;
my $s_name = $1;

if ($etype) { # EG site - need to filter out species
my $img_thumb = sprintf qq{eg-plugins/%s/htdocs/img/species/thumb_%s.png}, $etype, ucfirst($s_name);
# print "- checking for $img_thumb ... ";
# print STDERR "- checking for $img_thumb ... ";
if (! -e $img_thumb) {
print "\t... skipping \n";
print STDERR "\t... skipping \n";
next;
}
}
}
print "\n";
print STDERR "\n";

# Count the number of variations
my $sth2 = get_connection_and_query($dbname, $hostname, $sql2);
Expand Down

0 comments on commit 5ce9609

Please sign in to comment.