# Convert an Ensembl gene id to a gene symbol

This is a code snippet I find really useful when analysing RNA-seq data. I usually end up with a counts table keyed by Ensembl ids, and when I perform differential expression I usually have to convert those ids to gene symbols.

Below I show both a Python and an R implementation of how this can be performed.

# Python

In [3]:
import mygene
import pandas as pd

# Load the differential-expression table; header=0 takes the column names
# from the first row of the CSV.
infile = pd.read_csv(
    "data/gene_list.csv",
    header=0,
)

# A bare expression as the last line of a cell makes the notebook render it.
infile



Unnamed: 0,Ensembl_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,ENSG00000115825,1349.126255,2.188251,0.224357,9.753452,1.78e-22,1.63e-18
1,ENSG00000129521,268.021678,2.765093,0.283863,9.740928,2.0200000000000002e-22,1.63e-18
2,ENSG00000174697,18166.48633,3.543247,0.375474,9.436718,3.85e-21,2.0700000000000002e-17
3,ENSG00000123999,2094.498832,1.849762,0.204355,9.051715,1.4099999999999999e-19,5.69e-16
4,ENSG00000111961,4006.239649,2.415983,0.269737,8.956815,3.3399999999999997e-19,1.08e-15
5,ENSG00000126803,460.390284,1.358336,0.152744,8.892897,5.949999999999999e-19,1.61e-15
6,ENSG00000169495,2369.954096,3.105755,0.35996,8.628059,6.24e-18,1.44e-14
7,ENSG00000111859,851.687123,1.611828,0.193185,8.343454,7.220000000000001e-17,1.46e-13
8,ENSG00000165105,772.740123,2.161641,0.260942,8.284002,1.19e-16,2.14e-13


In [19]:
import mygene

# Look up every Ensembl gene id of the table on the MyGene.info service.
# Only the two fields used below are requested.
mg = mygene.MyGeneInfo()
ens = infile["Ensembl_id"]
print(ens)
ginfo = mg.querymany(ens.tolist(), scopes="ensembl.gene", fields="symbol,name")

# Index the hits by the id that was queried instead of relying on their
# position: an id can come back as "notfound", lack a symbol/name, or produce
# several hits, and any of those would misalign a purely positional list.
gene_symbol = {}
gene_name = {}
for hit in ginfo:
    if hit.get("notfound"):
        continue
    query = hit["query"]
    # Keep the first hit returned for each id.
    gene_symbol.setdefault(query, hit.get("symbol"))
    gene_name.setdefault(query, hit.get("name"))

# Map the annotations back onto the id column; ids without a hit become NaN
# rather than shifting the remaining rows.
infile["Symbol"] = ens.map(gene_symbol)
infile["Name"] = ens.map(gene_name)

infile

0    ENSG00000115825
1    ENSG00000129521
2    ENSG00000174697
3    ENSG00000123999
4    ENSG00000111961
5    ENSG00000126803
6    ENSG00000169495
7    ENSG00000111859
8    ENSG00000165105
Name: Ensembl_id, dtype: object
querying 1-9...done.
Finished.


Unnamed: 0,Ensembl_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,Symbol,Name
0,ENSG00000115825,1349.126255,2.188251,0.224357,9.753452,1.78e-22,1.63e-18,PRKD3,protein kinase D3
1,ENSG00000129521,268.021678,2.765093,0.283863,9.740928,2.0200000000000002e-22,1.63e-18,EGLN3,egl-9 family hypoxia inducible factor 3
2,ENSG00000174697,18166.48633,3.543247,0.375474,9.436718,3.85e-21,2.0700000000000002e-17,LEP,leptin
3,ENSG00000123999,2094.498832,1.849762,0.204355,9.051715,1.4099999999999999e-19,5.69e-16,INHA,inhibin alpha subunit
4,ENSG00000111961,4006.239649,2.415983,0.269737,8.956815,3.3399999999999997e-19,1.08e-15,SASH1,SAM and SH3 domain containing 1
5,ENSG00000126803,460.390284,1.358336,0.152744,8.892897,5.949999999999999e-19,1.61e-15,HSPA2,heat shock protein family A (Hsp70) member 2
6,ENSG00000169495,2369.954096,3.105755,0.35996,8.628059,6.24e-18,1.44e-14,HTRA4,HtrA serine peptidase 4
7,ENSG00000111859,851.687123,1.611828,0.193185,8.343454,7.220000000000001e-17,1.46e-13,NEDD9,"neural precursor cell expressed, developmental..."
8,ENSG00000165105,772.740123,2.161641,0.260942,8.284002,1.19e-16,2.14e-13,RASEF,RAS and EF-hand domain containing


# R

In [25]:
# Add R magic command from rpy2 to allow R commands to run in the notebook
%load_ext rpy2.ipython

In [60]:
%%R 
# %%R allows the whole cell to run as an R script

library(org.Hs.eg.db)


infile <- read.csv("data/gene_list.csv")
data <- infile[, "Ensembl_id"]

# typeof() can report "integer" here: on R < 4.0 read.csv() reads text columns
# as factors, and a factor is stored as integer codes. Convert the ids to a
# plain character vector before passing them to the annotation mapping.

print(typeof(data))
data <- as.character(data)

# Using org.Hs.eg.db we set up the mapping info - if you look at the
# documentation you can also obtain other keytypes.
# select() is called through the AnnotationDbi namespace so the cell still
# works when dplyr (which exports its own select()) is attached, and the keys
# are de-duplicated so a repeated id is only looked up once.

annots <- AnnotationDbi::select(org.Hs.eg.db, keys = unique(data),
                                columns = "SYMBOL", keytype = "ENSEMBL")

# all.x = TRUE keeps every input row even if no annotation row came back for
# its id (SYMBOL is then NA).
result <- merge(infile, annots, by.x = "Ensembl_id", by.y = "ENSEMBL",
                all.x = TRUE)

print(result)


[1] "integer"
       Ensembl_id   baseMean log2FoldChange     lfcSE     stat   pvalue
1 ENSG00000111859   851.6871       1.611828 0.1931847 8.343454 7.22e-17
2 ENSG00000111961  4006.2396       2.415983 0.2697368 8.956815 3.34e-19
3 ENSG00000115825  1349.1263       2.188251 0.2243566 9.753452 1.78e-22
4 ENSG00000123999  2094.4988       1.849762 0.2043549 9.051715 1.41e-19
5 ENSG00000126803   460.3903       1.358336 0.1527440 8.892897 5.95e-19
6 ENSG00000129521   268.0217       2.765093 0.2838634 9.740928 2.02e-22
7 ENSG00000165105   772.7401       2.161641 0.2609417 8.284002 1.19e-16
8 ENSG00000169495  2369.9541       3.105755 0.3599599 8.628059 6.24e-18
9 ENSG00000174697 18166.4863       3.543247 0.3754745 9.436718 3.85e-21
      padj SYMBOL
1 1.46e-13  NEDD9
2 1.08e-15  SASH1
3 1.63e-18  PRKD3
4 5.69e-16   INHA
5 1.61e-15  HSPA2
6 1.63e-18  EGLN3
7 2.14e-13  RASEF
8 1.44e-14  HTRA4
9 2.07e-17    LEP
