Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update billion count icon in docs and other updates #1047

Merged
merged 9 commits into from
Nov 3, 2023
24 changes: 24 additions & 0 deletions modules/Bio/EnsEMBL/Variation/DBSQL/vcf_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -937,6 +937,30 @@
"source_name": "EVA",
"source_version": 4
},
{
"id": "eva_nomascus_leucogenys_gca000146795v3",
"description": "Variants from EVA",
"species": "nomascus_leucogenys",
nuno-agostinho marked this conversation as resolved.
Show resolved Hide resolved
"assembly": "Nleu_3.0",
"type": "remote",
"filename_template": "http://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_4/by_species/nomascus_leucogenys/GCA_000146795.3/GCA_000146795.3_current_ids.vcf.gz",
"use_as_source": 1,
"use_seq_region_synonyms": 0,
"source_name": "EVA",
"source_version": 4
},
{
"id": "eva_pan_troglodytes_gca000001515v5",
"description": "Variants from EVA",
"species": "pan_troglodytes",
"assembly": "Pan_tro_3.0",
"type": "remote",
"filename_template": "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_4/by_species/pan_troglodytes/GCA_000001515.5/GCA_000001515.5_current_ids.vcf.gz",
"use_as_source": 1,
"use_seq_region_synonyms": 1,
"source_name": "EVA",
"source_version": 4
},
{
"id": "eva_ficedula_albicollis_gca000247815v2",
"description": "Variants from EVA",
Expand Down
116 changes: 116 additions & 0 deletions scripts/docs/data_files/vcf_species_list_111.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
{
"callithrix_jacchus" : {
"count" : "<span class=\"vdoc_var_count vdoc_million_1\" title=\"Over 14 million variants\">14 M</span>",
"name" : "White-tufted-ear marmoset",
"label" : "Callithrix jacchus"
},
"sander_lucioperca" : {
"count" : "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 975,000 variants\">975 K</span>",
"name" : "Pike-perch",
"label" : "Sander lucioperca"
},
"ficedula_albicollis" : {
"count" : "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 37,000 variants\">37 K</span>",
"name" : "Collared flycatcher",
"label" : "Ficedula albicollis"
},
"macaca_mulatta" : {
"count" : "<span class=\"vdoc_var_count vdoc_million_1\" title=\"Over 50 million variants\">50 M</span>",
"name" : "Macaque",
"label" : "Macaca mulatta"
},
"pan_troglodytes" : {
"count" : "<span class=\"vdoc_var_count vdoc_million_1\" title=\"Over 22 million variants\">22 M</span>",
"name" : "Chimpanzee",
"label" : "Pan troglodytes"
},
"coturnix_japonica" : {
"count" : "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 8,000 variants\">8 K</span>",
"name" : "Japanese quail",
"label" : "Coturnix japonica"
},
"bos_grunniens" : {
"count" : "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 64,000 variants\">64 K</span>",
"name" : "Domestic yak",
"label" : "Bos grunniens"
},
"anas_platyrhynchos" : {
"count" : "<span class=\"vdoc_var_count vdoc_million_1\" title=\"Over 30 million variants\">30 M</span>",
"name" : "Mallard",
"label" : "Anas platyrhynchos"
},
"macaca_fascicularis" : {
"count" : "<span class=\"vdoc_var_count vdoc_million_2\" title=\"Over 1.6 million variants\">1.6 M</span>",
"name" : "Crab-eating macaque",
"label" : "Macaca fascicularis"
},
"oreochromis_niloticus" : {
"count" : "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 63,000 variants\">63 K</span>",
"name" : "Nile tilapia",
"label" : "Oreochromis niloticus"
},
"oncorhynchus_mykiss" : {
"count" : "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 33,000 variants\">33 K</span>",
"name" : "Rainbow trout",
"label" : "Oncorhynchus mykiss"
},
"taeniopygia_guttata" : {
"count" : "<span class=\"vdoc_var_count vdoc_million_2\" title=\"Over 1.5 million variants\">1.5 M</span>",
"name" : "Zebra finch",
"label" : "Taeniopygia guttata"
},
"meleagris_gallopavo" : {
"count" : "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 5,000 variants\">5 K</span>",
"name" : "Turkey",
"label" : "Meleagris gallopavo"
},
"seriola_dumerili" : {
"count" : "<span class=\"vdoc_var_count vdoc_million_2\" title=\"Over 9 million variants\">9 M</span>",
"name" : "Greater amberjack",
"label" : "Seriola dumerili"
},
"salmo_salar" : {
"count" : "<span class=\"vdoc_var_count vdoc_million_2\" title=\"Over 8 million variants\">8 M</span>",
"name" : "Atlantic salmon",
"label" : "Salmo salar",
"genotype" : 1
},
"neovison_vison" : {
"count" : "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 34,000 variants\">34 K</span>",
"name" : "American mink",
"label" : "Neovison vison"
},
"microtus_ochrogaster" : {
"count" : "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 8,000 variants\">8 K</span>",
"name" : "Prairie vole",
"label" : "Microtus ochrogaster"
},
"pongo_abelii" : {
"count" : "<span class=\"vdoc_var_count vdoc_million_2\" title=\"Over 9.6 million variants\">9.6 M</span>",
"name" : "Sumatran orangutan",
"label" : "Pongo abelii"
},
"nomascus_leucogenys" : {
"count" : "<span class=\"vdoc_var_count vdoc_million_2\" title=\"Over 1.1 million variants\">1.1 M</span>",
"name" : "Gibbon",
"label" : "Nomascus leucogenys"
},
"chlorocebus_sabaeus" : {
"count" : "<span class=\"vdoc_var_count vdoc_million_1\" title=\"Over 67 million variants\">67 M</span>",
"name" : "Vervet-AGM",
"label" : "Chlorocebus sabaeus",
"genotype" : 1
},
"parus_major" : {
"count" : "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 497,000 variants\">497 K</span>",
"name" : "Great Tit",
"genotype" : 1,
"label" : "Parus major"
},
"oryctolagus_cuniculus" : {
"count" : "<span class=\"vdoc_var_count vdoc_thousand\" title=\"Over 83,000 variants\">83 K</span>",
"name" : "Rabbit",
"label" : "Oryctolagus cuniculus",
"genotype" : 1
}
}
2 changes: 1 addition & 1 deletion scripts/docs/generate_clin_significance_tables.pl
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ sub sort_clin_sign_terms {
}
$html_content .= qq{ <tr$bg>$icon_col<td>$cs_term</td>$examples</tr>\n};
$bg = set_bg();
print STDERR "Term "$cs_term" done ($count/$cs_term_count)\n";
print STDERR "Term '$cs_term' done ($count/$cs_term_count)\n";
}

# Four-star rating
Expand Down
14 changes: 10 additions & 4 deletions scripts/docs/generate_population_table.pl
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,6 @@

# Loop over the species (placing human first)
foreach my $species (sort { ($a !~ /Homo/ cmp $b !~ /Homo/) || $a cmp $b } keys(%pops_list)) {

next unless %{ $pops_list{$species} };

my $id_species = $species;
Expand Down Expand Up @@ -374,8 +373,16 @@ sub get_vcf_content_types {
my $line = `tabix -D $file_full_path $chr | head -n 1`;

my $format_field = (split /\t/, $line)[8];

push @types, "genotype" if $format_field;

my $info_field = (split /\t/, $line)[7];
if ( ($info_field =~ /AF=/) || ($info_field =~ /AC=/ && $info_field =~ /AN=/) ) {
push @types, "frequency";
}
# Hard-coded check for NCBI-ALPHA and TOPMED, which store frequencies in their own special INFO fields
if ( ($info_field =~ /AN_SAMN/) || ($info_field =~ /TOPMED=/) ) {
push @types, "frequency";
}
}

return @types;
Expand Down Expand Up @@ -496,10 +503,9 @@ sub get_sub_populations {
sub get_project_populations {

foreach my $project (@{$vcf_config->{'collections'}}) {

# Check whether the file has genotype or frequency data and is being shown
my @types = get_vcf_content_types($project);
next unless grep /^genotype$/, @types;
next unless ( grep(/^genotype$/, @types) || grep(/^frequency$/, @types) );

my $project_id = $project->{'id'};
next if ($project->{'assembly'} =~ /GRCh37/i || $project->{'annotation_type'} eq 'cadd' || $project->{'annotation_type'} eq 'gerp');
Expand Down
2 changes: 1 addition & 1 deletion scripts/docs/generate_variation_set_table.pl
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ =head1 CONTACT
next if ($dbname !~ /^[a-z][a-z_]*_[a-z]+_$db_type\_$db_version\_\d+$/i);
next if ($dbname =~ /^(master_schema|drosophila|saccharomyces|ciona)/ || $dbname =~ /^homo_sapiens_$db_type\_\d+_37$/ || $dbname =~ /private/);

print $dbname;
print STDERR $dbname;
$dbname =~ /^(.+)_$db_type/;
my $s_name = $1;

Expand Down
71 changes: 39 additions & 32 deletions scripts/docs/sources2html.pl
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ =head1 CONTACT
);
my %colour_class = ( 'version' => 'vdoc_new_version',
'source' => 'vdoc_new_source',
'few_billion' => 'vdoc_billion_2',
'hundred_million' => 'vdoc_million_0',
'lot_million' => 'vdoc_million_1',
'few_million' => 'vdoc_million_2',
Expand Down Expand Up @@ -265,7 +266,7 @@ =head1 CONTACT
while (my ($dbname) = $sth->fetchrow_array) {
next if ($dbname !~ /^[a-z][a-z_]*_[a-z0-9]+_variation_\d+_\d+$/i);
next if ($dbname =~ /^(master_schema|drosophila|saccharomyces|ciona)/ || $dbname =~ /^homo_sapiens_variation_\d+_37$/ || $dbname =~ /private/);

$db_found ++;
print STDERR "${dbname}\n";
$dbname =~ /^(.+)_variation/;
Expand Down Expand Up @@ -473,16 +474,17 @@ sub source_table {
next if $project->{annotation_type} eq 'cadd' || $project->{annotation_type} eq 'gerp';

if (lc( $project->{species} ) eq $name) {
my ($source, $version, $description, $info, $count, $example_url);
my ($source, $source_url, $version, $description, $info, $count, $example_url);

# determine type of data the file has
my @types = get_vcf_content_types($project);

my $source_name = $project->{source} ? $project->{source} : 'EVA';
my $source_url = $eva_url;
my $source_name = $project->{source_name} ? $project->{source_name} : 'EVA';

# Assuming only one config will have use_as_source set per species
if ( grep /^source$/, @types){
if ( grep(/^source$/, @types) && $source_name eq "EVA" ){
$source_url = $eva_url;

# Get the version from filename template
my $filename_template = $project->{filename_template};
my @eva_release = grep {/release_/} (split /\//, $filename_template);
Expand All @@ -495,30 +497,17 @@ sub source_table {
$version = "-";
}

# Set description
$description = "Variants imported from EVA";
}

# Assuming only one config will have use_as_source set per species
if ( grep /^genotype$/, @types){
# Update source name and url to study id if possible
if ($source_name =~ /^(?!PRJ)/){
# Try getting the study id from database if not in vcf collection
my $sth3 = get_connection_and_query($db_name, $hostname, $sql_display_group);
nuno-agostinho marked this conversation as resolved.
Show resolved Hide resolved
my $source_name_from_db = $sth3->fetchrow_array;

if ($source_name_from_db) {
$source_name = $source_name_from_db;

$source_url = $eva_study_url;
$source_url =~ s/###ID###/$source_name/g;
}
}
# Assuming the EVA release VCF file will not have genotypes
if ( (grep(/^genotype$/, @types) || grep(/^frequency$/, @types)) && $source_name =~ /^(PRJ)/ ){
# Set source url using study id
$source_url = $eva_study_url;
$source_url =~ s/###ID###/$source_name/g;

# No version is derived for study VCFs; use a placeholder
$version = "-";

# Set description
$description = "Variants with genotypes imported from EVA";
}

Expand Down Expand Up @@ -885,6 +874,7 @@ sub create_menu {
}
my $v_colour = $colour_class{'version'};
my $s_colour = $colour_class{'source'};
my $fb_colour = $colour_class{'few_billion'};
my $hm_colour = $colour_class{'hundred_million'};
my $lm_colour = $colour_class{'lot_million'};
my $fm_colour = $colour_class{'few_million'};
Expand Down Expand Up @@ -954,11 +944,17 @@ sub create_menu {
<div style="border-top:1px dotted #336;margin-top:2px;padding:4px 0px 0px">
<span style="padding-left:4px;font-weight:bold">Data types - entries count:</span>
<table>
<tr>
<td style="padding-top:4px;text-align:center">
<span class="vdoc_count_legend $fb_colour"></span>
</td>
<td style="padding-top:4px">greater than 1 billion</td>
</tr>
<tr>
<td style="padding-top:4px;text-align:center">
<span class="vdoc_count_legend $hm_colour"></span>
</td>
<td style="padding-top:4px">greater than 100 million</td>
<td style="padding-top:4px">from 100 million to 999.9 million</td>
</tr>
<tr>
<td style="padding-top:4px;text-align:center">
Expand Down Expand Up @@ -1232,14 +1228,22 @@ sub get_count {
$count_display = $count;
$bg_class = $colour_class{'lot_million'};
}
# From 100 million
elsif ($count =~ /^(\d{3}\d*)\d{6}$/) {
# From 100 million to 999.9 million
elsif ($count =~ /^(\d{3})\d{6}$/) {
my $number = $1;
$count = "$number M";
$count_label = "Over $number million $end_label";
$count_display = $count;
$bg_class = $colour_class{'hundred_million'};
}
# From 1 billion upwards (truncated to one decimal place)
elsif ($count =~ /^(\d+)(\d)\d{8}$/) {
my $number = ($2!=0) ? "$1.$2" : $1;
$count = "$number B";
$count_label = "Over $number billion $end_label";
$count_display = $count;
$bg_class = $colour_class{'few_billion'};
}
# From 1,000 to 999,999
elsif ($count =~ /^(\d+)\d{3}$/) {
my $number = $1;
Expand Down Expand Up @@ -1318,17 +1322,20 @@ sub get_vcf_content_types {
my $genotypes = `tabix $file_full_path -H | grep '##FORMAT' | grep 'ID=GT'`;
push @types, "genotype" if $genotypes;

my $chr = `tabix $file_full_path -l | head -n 1`;
chomp $chr;
my $line = `tabix $file_full_path $chr | head -n 1`;

# Check an actual data line for a FORMAT field if the header does not declare GT
unless ($genotypes){
my $chr = `tabix $file_full_path -l | head -n 1`;
chop $chr;

my $line = `tabix $file_full_path $chr | head -n 1`;

my $format_field = (split /\t/, $line)[8];

push @types, "genotype" if $format_field;
}

my $info_field = (split /\t/, $line)[7];
if ( ($info_field =~ /AF=/) || ($info_field =~ /AC=/ && $info_field =~ /AN=/) ) {
push @types, "frequency";
}

return @types;
}
Expand Down
15 changes: 12 additions & 3 deletions scripts/docs/species_list.pl
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ =head1 CONTACT

my $html;

my %colours = ( 'hundred_million' => { 'order' => 5, 'colour' => 'vdoc_million_0', 'legend' => 'From 100 million'},
my %colours = ( 'few_billion' => { 'order' => 6, 'colour' => 'vdoc_billion_2', 'legend' => 'Over 1 billion'},
'hundred_million' => { 'order' => 5, 'colour' => 'vdoc_million_0', 'legend' => 'From 100 million to 999,9 million'},
'lot_million' => { 'order' => 4, 'colour' => 'vdoc_million_1', 'legend' => 'From 10 million to 99,9 million'},
'few_million' => { 'order' => 3, 'colour' => 'vdoc_million_2', 'legend' => 'From 1 million to 9,9 million'},
'thousand' => { 'order' => 2, 'colour' => 'vdoc_thousand', 'legend' => 'From 1,000 to 999,999'},
Expand Down Expand Up @@ -360,14 +361,22 @@ sub round_count {
$count_display = $count;
$bg_class = $colours{'lot_million'}{'colour'};
}
# From 100 million
elsif ($count =~ /^(\d{3}\d*)\d{6}$/) {
# From 100 million to 999.9 million
elsif ($count =~ /^(\d{3})\d{6}$/) {
my $number = $1;
$count = "$number M";
$count_label = "Over $number million $type";
$count_display = $count;
$bg_class = $colours{'hundred_million'}{'colour'};
}
# From 1 billion upwards (truncated to one decimal place)
elsif ($count =~ /^(\d+)(\d)\d{8}$/) {
my $number = ($2!=0) ? "$1.$2" : $1;
$count = "$number B";
$count_label = "Over $number billion $type";
$count_display = $count;
$bg_class = $colours{'few_billion'}{'colour'};
}
# From 1,000 to 999,999
elsif ($count =~ /^(\d+)\d{3}$/) {
my $number = $1;
Expand Down