In [6]:
import json
import pandas as pd 


# Load the failing targets and keep only the rows whose data sources
# mention the genetics portal.
failed_targets = (
    pd.read_csv('failed_targets.tsv', sep='\t')
    .drop(columns='unnamed')
    # Membership test on each cell value; presumably the column holds the
    # textual form of a list, which makes this a substring check - TODO confirm.
    .loc[lambda df: df['datasourceIds'].map(lambda sources: 'ot_genetics_portal' in sources)]
)
failed_targets.head()

Unnamed: 0,targetFromSourceId,datasourceIds,evidenceCount
2,ENSG00000269881,"[ot_genetics_portal, expression_atlas]",48
3,ENSG00000130489,"[ot_genetics_portal, expression_atlas]",45
4,ENSG00000254462,[ot_genetics_portal],43
5,ENSG00000283932,[ot_genetics_portal],42
6,ENSG00000278272,"[ot_genetics_portal, expression_atlas]",36


In [11]:
# Summary of the failing targets: row count, total evidence count, and the ids
# themselves (one per line, ready to paste into an external tool).
print(f'Number of targets: {failed_targets.shape[0]}')
print(f'Number of failing evidence: {failed_targets["evidenceCount"].sum()}')
print('List of gene ids: \n' + '\n'.join(failed_targets['targetFromSourceId'].to_list()))

Number of targets: 39
Number of failing evidence: 551
List of gene ids: 
ENSG00000269881
ENSG00000130489
ENSG00000254462
ENSG00000283932
ENSG00000278272
ENSG00000270898
ENSG00000278500
ENSG00000261833
ENSG00000274267
ENSG00000204683
ENSG00000285258
ENSG00000267545
ENSG00000286169
ENSG00000263264
ENSG00000181013
ENSG00000243444
ENSG00000241978
ENSG00000268861
ENSG00000183729
ENSG00000260300
ENSG00000130723
ENSG00000286094
ENSG00000183791
ENSG00000284041
ENSG00000267697
ENSG00000213865
ENSG00000274897
ENSG00000285441
ENSG00000260869
ENSG00000277669
ENSG00000213029
ENSG00000281028
ENSG00000274744
ENSG00000262621
ENSG00000278674
ENSG00000272949
ENSG00000286261
ENSG00000277726
ENSG00000255863


The above list was submitted to the Ensembl ID mapping tool to get some information on the ID history: https://www.ensembl.org/Homo_sapiens/Tools/IDMapper

In [14]:
%%bash

# Build retired_ids.tsv: a header line followed by the first three
# whitespace-separated fields of every line in mapped_ids.txt that contains
# "retired", joined with tabs.
#
# Abort on any error so that a missing or unreadable mapped_ids.txt fails the
# cell instead of silently leaving a header-only file behind.
set -euo pipefail

{
    printf 'Ensembl_gene_id\tstatus\tversion\n'
    awk 'BEGIN { OFS = "\t" } /retired/ { print $1, $2, $3 }' mapped_ids.txt
} > retired_ids.tsv

In [42]:
# Parse the table of retired ids: derive a clean gene id, strip the markup
# characters from the status, and turn the release number into an integer.
retired_df = (
    pd.read_csv('retired_ids.tsv', sep='\t')
    .assign(
        # Keep only the part before the first '.' (presumably a version suffix).
        gene_id=lambda df: df['Ensembl_gene_id'].str.split('.').str[0],
        # The pattern is a character class, so it must be applied as a regex.
        # pandas >= 2.0 treats patterns literally unless regex=True is given,
        # which would leave the '<', '>' and ',' characters in place.
        status=lambda df: df['status'].str.replace(r'[<>,]', '', regex=True),
        # Plain character removal; the literal match is stated explicitly.
        version=lambda df: df['version'].str.replace(',', '', regex=False).astype(int),
    )
    .drop(columns='Ensembl_gene_id')
)
print(retired_df.head())
print(f'Number of unique gene ids: {retired_df["gene_id"].nunique()}')
print(f'Earliest release: {retired_df["version"].min()}')
print('Releases when ids got retired:')
print(retired_df["version"].value_counts().sort_index())

    status  version          gene_id
0  retired       97  ENSG00000269881
1  retired       97  ENSG00000130489
2  retired       99  ENSG00000254462
3  retired       97  ENSG00000283932
4  retired       97  ENSG00000278272
Number of unique gene ids: 39
Earliest release: 97
Releases when ids got retired:
97     16
98      5
99      1
100     3
101     8
102     2
103     4
Name: version, dtype: int64


In [35]:
# Show the first rows of the failing targets again for reference.
failed_targets.head(n=5)

Unnamed: 0,targetFromSourceId,datasourceIds,evidenceCount
2,ENSG00000269881,"[ot_genetics_portal, expression_atlas]",48
3,ENSG00000130489,"[ot_genetics_portal, expression_atlas]",45
4,ENSG00000254462,[ot_genetics_portal],43
5,ENSG00000283932,[ot_genetics_portal],42
6,ENSG00000278272,"[ot_genetics_portal, expression_atlas]",36


In [48]:
# Attach the retirement status and release to each failing target.
# NOTE(review): the outer join also keeps ids that appear only in retired_df
# (as rows without a targetFromSourceId) - confirm this is intended.
failed_w_status = (
    failed_targets
    .merge(retired_df, left_on='targetFromSourceId', right_on='gene_id', how='outer')
    # No merge indicator is requested: it was never inspected, only dropped.
    .drop(columns=['datasourceIds', 'gene_id'])
)

print(failed_w_status.to_markdown(index=False))

| targetFromSourceId   |   evidenceCount | status   |   version |
|:---------------------|----------------:|:---------|----------:|
| ENSG00000269881      |              48 | retired  |        97 |
| ENSG00000130489      |              45 | retired  |        97 |
| ENSG00000254462      |              43 | retired  |        99 |
| ENSG00000283932      |              42 | retired  |        97 |
| ENSG00000278272      |              36 | retired  |        97 |
| ENSG00000270898      |              34 | retired  |        97 |
| ENSG00000278500      |              32 | retired  |        97 |
| ENSG00000261833      |              23 | retired  |        97 |
| ENSG00000274267      |              22 | retired  |        97 |
| ENSG00000204683      |              20 | retired  |       101 |
| ENSG00000285258      |              20 | retired  |       100 |
| ENSG00000267545      |              20 | retired  |        97 |
| ENSG00000286169      |              18 | retired  |       102 |
| ENSG0000

So the question is: where does this data get built in? Based on a quick check, I identified a Snakefile config in the [V2G repo](https://github.com/opentargets/genetics-v2g-data/blob/539b18597bf08eab017d523ce7378cca517e967e/configs/config.yaml#L9) in which the input gene set is hard-coded.

Checking this out:

In [None]:
%%bash

# Copy the 19.06 gene symbol/synonym map from the bucket into the current directory.
LUT_URI="gs://genetics-portal-input/luts/19.06_gene_symbol_synonym_map.json"
gsutil cp -r "${LUT_URI}" .

In [52]:
# Load the 19.06 gene lookup (line-delimited JSON records).
gene_data = pd.read_json(
    '19.06_gene_symbol_synonym_map.json',
    orient='records',
    lines=True,
)

# Annotate the failing targets with the gene details found in that lookup
# and print the result as a markdown table.
print(
    failed_w_status
    .merge(gene_data, how='left', left_on='targetFromSourceId', right_on='gene_id')
    .drop(columns=['gene_id', 'gene_synonyms'])
    .to_markdown(index=False)
)

| targetFromSourceId   |   evidenceCount | status   |   version | gene_name   |   gene_chrom |   gene_start |   gene_end |
|:---------------------|----------------:|:---------|----------:|:------------|-------------:|-------------:|-----------:|
| ENSG00000269881      |              48 | retired  |        97 | AC004754.1  |           16 |       249547 |     269943 |
| ENSG00000130489      |              45 | retired  |        97 | SCO2        |           22 |     50523568 |   50525606 |
| ENSG00000254462      |              43 | retired  |        99 | TMX2-CTNND1 |           11 |     57712605 |   57791586 |
| ENSG00000283932      |              42 | retired  |        97 | AL121722.1  |           20 |     22560553 |   22584261 |
| ENSG00000278272      |              36 | retired  |        97 | HIST1H3C    |            6 |     26045411 |   26045821 |
| ENSG00000270898      |              34 | retired  |        97 | GPR75-ASB3  |            2 |     53670293 |   53860160 |
| ENSG0000027850

Update from Jeremy:

```
It's worth noting that there may be other locations where a specific version of the gene mapping has been used. E.g. this is in Miguel's config file:

ensembl {
lut = ${input}"/lut/homo_sapiens_core_96_38_genes.json"
}

It's located here:
gs://genetics-portal-data/lut/homo_sapiens_core_96_38_genes.json

This file is also used in the L2G pipeline:
https://github.com/opentargets/genetics-l2g-scoring/blob/master/1_feature_engineering/1_prepare_inputs.py
```

In [None]:
%%bash

# Copy the Ensembl 96 gene lookup table from the bucket into the current directory.
GENE_LUT_URI="gs://genetics-portal-data/lut/homo_sapiens_core_96_38_genes.json"
gsutil cp -r "${GENE_LUT_URI}" .


In [53]:
# Read the line-delimited JSON gene set into a dataframe and discard the
# columns that are not used below.
gene_set = pd.read_json(
    'homo_sapiens_core_96_38_genes.json',
    orient='records',
    lines=True,
).drop(columns=['description', 'biotype', 'fwdstrand', 'exons'])

gene_set.head()

Unnamed: 0,gene_id,gene_name,chr,tss,start,end
0,ENSG00000223972,DDX11L1,1,11869,11869,14409
1,ENSG00000227232,WASH7P,1,29570,14404,29570
2,ENSG00000278267,MIR6859-1,1,17436,17369,17436
3,ENSG00000243485,MIR1302-2HG,1,29554,29554,31109
4,ENSG00000284332,MIR1302-2,1,30366,30366,30503


In [57]:
# Look up every failing target in the Ensembl 96 gene set; keep only the gene
# name and chromosome from the lookup and print the table as markdown.
print(
    failed_w_status
    .merge(gene_set, how='left', left_on='targetFromSourceId', right_on='gene_id')
    .drop(columns=['gene_id', 'tss', 'start', 'end'])
    .to_markdown()
)

|    | targetFromSourceId   |   evidenceCount | status   |   version | gene_name   |   chr |
|---:|:---------------------|----------------:|:---------|----------:|:------------|------:|
|  0 | ENSG00000269881      |              48 | retired  |        97 | AC004754.1  |    16 |
|  1 | ENSG00000130489      |              45 | retired  |        97 | SCO2        |    22 |
|  2 | ENSG00000254462      |              43 | retired  |        99 | TMX2-CTNND1 |    11 |
|  3 | ENSG00000283932      |              42 | retired  |        97 | AL121722.1  |    20 |
|  4 | ENSG00000278272      |              36 | retired  |        97 | HIST1H3C    |     6 |
|  5 | ENSG00000270898      |              34 | retired  |        97 | GPR75-ASB3  |     2 |
|  6 | ENSG00000278500      |              32 | retired  |        97 | AC009336.2  |     2 |
|  7 | ENSG00000261833      |              23 | retired  |        97 | AC104151.1  |    16 |
|  8 | ENSG00000274267      |              22 | retired  |        97 |