In [1]:
import os
import gzip
import wget
import requests

Using only Python3

In [2]:
# https://www.ensembl.org/Homo_sapiens/Info/Index
# http://ftp.ensembl.org/pub/release-104/gtf/homo_sapiens/
# chr.gtf.gz without GL and GI unlocalized regions

# Fetch the Ensembl release-104 GTF only when it is not already cached on
# disk, so the notebook runs top to bottom on a fresh checkout and repeated
# runs do not download the file again.
url = 'http://ftp.ensembl.org/pub/release-104/gtf/homo_sapiens/Homo_sapiens.GRCh38.104.chr.gtf.gz'
filename = "./data/Homo_sapiens.GRCh38.104.chr.gtf.gz"
if not os.path.exists(filename):
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    r = requests.get(url, allow_redirects=True)
    r.raise_for_status()  # fail loudly instead of caching an HTTP error page
    # Write under a temporary name first so an interrupted download is never
    # mistaken for a complete file on the next run.
    partial = filename + ".part"
    with open(partial, "wb") as fh:
        fh.write(r.content)
    os.replace(partial, filename)

In [3]:
# Show the first 10 raw lines of the compressed file.
# Iterating over the handle decompresses lazily; readlines() would inflate
# every line of the file into memory only to keep ten of them. zip() stops
# after ten items (or earlier if the file is shorter).
with gzip.open(filename) as f:
    print([line for _, line in zip(range(10), f)])

[b'#!genome-build GRCh38.p13\n', b'#!genome-version GRCh38\n', b'#!genome-date 2013-12\n', b'#!genome-build-accession GCA_000001405.28\n', b'#!genebuild-last-updated 2021-03\n', b'1\tensembl_havana\tgene\t685679\t686673\t.\t-\t.\tgene_id "ENSG00000284662"; gene_version "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding";\n', b'1\tensembl_havana\ttranscript\t685679\t686673\t.\t-\t.\tgene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";\n', b'1\tensembl_havana\texon\t685679\t686673\t.\t-\t.\tgene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; exon_number "1"; gene_name "

Using bash commands through the Jupyter magic commands `%%bash` and `!`

In [4]:
#%%bash

#mkdir -p data/

#wget -q -P data/ http://ftp.ensembl.org/pub/release-104/gtf/homo_sapiens/Homo_sapiens.GRCh38.104.chr.gtf.gz

In [5]:
#! zcat < data/Homo_sapiens.GRCh38.104.chr.gtf.gz | head

In [6]:
import pandas as pd

The GTF (General Transfer Format) is an extension of GFF version 2 
and used to represent transcription models. GFF (General Feature Format) 
consists of one line per feature, each containing 9 columns of data. 

**Fields**

Fields are tab-separated. Also, all but the final field in each 
feature line must contain a value; "empty" columns are denoted 
with a '.'

    seqname   - name of the chromosome or scaffold; chromosome names 
                without a 'chr' 
    source    - name of the program that generated this feature, or 
                the data source (database or project name)
    feature   - feature type name. Current allowed features are
                {gene, transcript, exon, CDS, Selenocysteine, start_codon,
                stop_codon and UTR}
    start     - start position of the feature, with sequence numbering 
                starting at 1.
    end       - end position of the feature, with sequence numbering 
                starting at 1.
    score     - a floating point value indicating the score of a feature
    strand    - defined as + (forward) or - (reverse).
    frame     - one of '0', '1' or '2'. Frame indicates the number of base pairs
                before you encounter a full codon. '0' indicates the feature 
                begins with a whole codon. '1' indicates there is an extra
                base (the 3rd base of the prior codon) at the start of this feature.
                '2' indicates there are two extra bases (2nd and 3rd base of the 
                prior exon) before the first codon. All values are given with
                relation to the 5' end.
    attribute - a semicolon-separated list of tag-value pairs (separated by a space), 
                providing additional information about each feature. A key can be
                repeated multiple times.

**Attributes**

The following attributes are available. All attributes are semi-colon
separated pairs of keys and values.

    - gene_id: The stable identifier for the gene
    - gene_version: The stable identifier version for the gene
    - gene_name: The official symbol of this gene
    - gene_source: The annotation source for this gene
    - gene_biotype: The biotype of this gene
    - transcript_id: The stable identifier for this transcript
    - transcript_version: The stable identifier version for this transcript
    - transcript_name: The symbol for this transcript derived from the gene name
    - transcript_source: The annotation source for this transcript
    - transcript_biotype: The biotype for this transcript
    - exon_id: The stable identifier for this exon
    - exon_version: The stable identifier version for this exon
    - exon_number: Position of this exon in the transcript
    - ccds_id: CCDS identifier linked to this transcript
    - protein_id: Stable identifier for this transcript's protein
    - protein_version: Stable identifier version for this transcript's protein
    - tag: A collection of additional key value tags
    - transcript_support_level: Ranking to assess how well a transcript is supported (from 1 to 5)

**Tags**

Tags are additional flags used to indicate attributes of the transcript.

    - CCDS: Flags this transcript as one linked to a CCDS record
    - seleno: Flags this transcript has a Selenocysteine edit.
    - cds_end_NF: the coding region end could not be confirmed
    - cds_start_NF: the coding region start could not be confirmed
    - mRNA_end_NF: the mRNA end could not be confirmed
    - mRNA_start_NF: the mRNA start could not be confirmed.
    - basic: the transcript is part of the gencode basic geneset

In [7]:
# Load the GTF into a DataFrame, one row per feature line.
# - `filename` is the path set in the download cell, reused here so the
#   location of the data file is defined in exactly one place.
# - comment='#' makes pandas skip the '#!' header lines at the top of the file.
# - dtype=str keeps all nine columns as text; numeric conversion of the
#   coordinates is done explicitly further down.
df = pd.read_csv(filename,
            sep='\t',
            comment='#',
            header=None,
            dtype=str,
            names=['seqname', 'source', 'feature', 'start',
                   'end', 'score', 'strand', 'frame',
                   'attribute'])

In [8]:
df.shape

(3145418, 9)

In [9]:
df.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,1,ensembl_havana,gene,685679,686673,.,-,.,"gene_id ""ENSG00000284662""; gene_version ""1""; g..."
1,1,ensembl_havana,transcript,685679,686673,.,-,.,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."
2,1,ensembl_havana,exon,685679,686673,.,-,.,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."
3,1,ensembl_havana,CDS,685719,686654,.,-,0,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."
4,1,ensembl_havana,start_codon,686652,686654,.,-,0,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."


In [10]:
df.tail()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
3145413,MT,insdc,transcript,15888,15953,.,+,.,"gene_id ""ENSG00000210195""; gene_version ""2""; t..."
3145414,MT,insdc,exon,15888,15953,.,+,.,"gene_id ""ENSG00000210195""; gene_version ""2""; t..."
3145415,MT,insdc,gene,15956,16023,.,-,.,"gene_id ""ENSG00000210196""; gene_version ""2""; g..."
3145416,MT,insdc,transcript,15956,16023,.,-,.,"gene_id ""ENSG00000210196""; gene_version ""2""; t..."
3145417,MT,insdc,exon,15956,16023,.,-,.,"gene_id ""ENSG00000210196""; gene_version ""2""; t..."


In [11]:
df.dtypes

seqname      object
source       object
feature      object
start        object
end          object
score        object
strand       object
frame        object
attribute    object
dtype: object

In [12]:
df.isna().sum()

seqname      0
source       0
feature      0
start        0
end          0
score        0
strand       0
frame        0
attribute    0
dtype: int64

In [13]:
df.isna().sum().sum()

0

In [14]:
df.columns

Index(['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand',
       'frame', 'attribute'],
      dtype='object')

In [15]:
df['seqname'].unique()

array(['1', '2', '3', '4', '5', '6', '7', 'X', '8', '9', '11', '10', '12',
       '13', '14', '15', '16', '17', '18', '20', '19', 'Y', '22', '21',
       'MT'], dtype=object)

In [16]:
# Number of distinct sequence names (missing values, if any, count as one).
df['seqname'].nunique(dropna=False)

25

In [17]:
# Print every column name next to its number of distinct values.
for column in df.columns:
    n_distinct = df[column].nunique(dropna=False)
    print(column, n_distinct)

seqname 25
source 7
feature 9
start 639387
end 639576
score 1
strand 2
frame 4
attribute 2794860


In [18]:
# Same counts as above, collected as [column, n_distinct] pairs.
[
    [column, df[column].nunique(dropna=False)]
    for column in df.columns
]

[['seqname', 25],
 ['source', 7],
 ['feature', 9],
 ['start', 639387],
 ['end', 639576],
 ['score', 1],
 ['strand', 2],
 ['frame', 4],
 ['attribute', 2794860]]

In [19]:
# For columns with fewer than 30 distinct values, print the values themselves.
for column in df.columns:
    distinct = df[column].unique()
    if len(distinct) >= 30:
        continue
    print(column, distinct)

seqname ['1' '2' '3' '4' '5' '6' '7' 'X' '8' '9' '11' '10' '12' '13' '14' '15'
 '16' '17' '18' '20' '19' 'Y' '22' '21' 'MT']
source ['ensembl_havana' 'havana' 'ensembl' 'havana_tagene'
 'ensembl_havana_tagene' 'mirbase' 'insdc']
feature ['gene' 'transcript' 'exon' 'CDS' 'start_codon' 'stop_codon'
 'five_prime_utr' 'three_prime_utr' 'Selenocysteine']
score ['.']
strand ['-' '+']
frame ['.' '0' '2' '1']


In [20]:
# [column, distinct values] pairs for every column with fewer than 30
# distinct values; the inner generator computes unique() once per column.
[
    [column, distinct]
    for column, distinct in ((name, df[name].unique()) for name in df.columns)
    if len(distinct) < 30
]

[['seqname',
  array(['1', '2', '3', '4', '5', '6', '7', 'X', '8', '9', '11', '10', '12',
         '13', '14', '15', '16', '17', '18', '20', '19', 'Y', '22', '21',
         'MT'], dtype=object)],
 ['source',
  array(['ensembl_havana', 'havana', 'ensembl', 'havana_tagene',
         'ensembl_havana_tagene', 'mirbase', 'insdc'], dtype=object)],
 ['feature',
  array(['gene', 'transcript', 'exon', 'CDS', 'start_codon', 'stop_codon',
         'five_prime_utr', 'three_prime_utr', 'Selenocysteine'],
        dtype=object)],
 ['score', array(['.'], dtype=object)],
 ['strand', array(['-', '+'], dtype=object)],
 ['frame', array(['.', '0', '2', '1'], dtype=object)]]

In [21]:
df.groupby('feature')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fae2ec3a310>

In [22]:
df.groupby('feature').count()

Unnamed: 0_level_0,seqname,source,start,end,score,strand,frame,attribute
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CDS,824576,824576,824576,824576,824576,824576,824576,824576
Selenocysteine,119,119,119,119,119,119,119,119
exon,1498154,1498154,1498154,1498154,1498154,1498154,1498154,1498154
five_prime_utr,163233,163233,163233,163233,163233,163233,163233,163233
gene,60605,60605,60605,60605,60605,60605,60605,60605
start_codon,92911,92911,92911,92911,92911,92911,92911,92911
stop_codon,86172,86172,86172,86172,86172,86172,86172,86172
three_prime_utr,182795,182795,182795,182795,182795,182795,182795,182795
transcript,236853,236853,236853,236853,236853,236853,236853,236853


In [23]:
df.groupby('feature')['feature'].count()

feature
CDS                 824576
Selenocysteine         119
exon               1498154
five_prime_utr      163233
gene                 60605
start_codon          92911
stop_codon           86172
three_prime_utr     182795
transcript          236853
Name: feature, dtype: int64

In [24]:
# Row counts per value, for each column with fewer than 30 distinct values.
[
    df.groupby(column)[column].count()
    for column in df.columns
    if df[column].nunique(dropna=False) < 30
]

[seqname
 1     286554
 10    120959
 11    189276
 12    179072
 13     50861
 14    114641
 15    114177
 16    142494
 17    187948
 18     55262
 19    179208
 2     238910
 20     71221
 21     35568
 22     65887
 3     200212
 4     130868
 5     141226
 6     149532
 7     149512
 8     116505
 9     114334
 MT       144
 X     102923
 Y       8124
 Name: seqname, dtype: int64,
 source
 ensembl                   186372
 ensembl_havana            707584
 ensembl_havana_tagene        152
 havana                   2115992
 havana_tagene             129543
 insdc                        144
 mirbase                     5631
 Name: source, dtype: int64,
 feature
 CDS                 824576
 Selenocysteine         119
 exon               1498154
 five_prime_utr      163233
 gene                 60605
 start_codon          92911
 stop_codon           86172
 three_prime_utr     182795
 transcript          236853
 Name: feature, dtype: int64,
 score
 .    3145418
 Name: score, dtype: int

Crear un archivo bed para genes y otro para transcritos

In [25]:
df.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,1,ensembl_havana,gene,685679,686673,.,-,.,"gene_id ""ENSG00000284662""; gene_version ""1""; g..."
1,1,ensembl_havana,transcript,685679,686673,.,-,.,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."
2,1,ensembl_havana,exon,685679,686673,.,-,.,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."
3,1,ensembl_havana,CDS,685719,686654,.,-,0,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."
4,1,ensembl_havana,start_codon,686652,686654,.,-,0,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."


In [26]:
df['attribute'][0:10]

0    gene_id "ENSG00000284662"; gene_version "1"; g...
1    gene_id "ENSG00000284662"; gene_version "1"; t...
2    gene_id "ENSG00000284662"; gene_version "1"; t...
3    gene_id "ENSG00000284662"; gene_version "1"; t...
4    gene_id "ENSG00000284662"; gene_version "1"; t...
5    gene_id "ENSG00000284662"; gene_version "1"; t...
6    gene_id "ENSG00000284662"; gene_version "1"; t...
7    gene_id "ENSG00000284662"; gene_version "1"; t...
8    gene_id "ENSG00000186827"; gene_version "11"; ...
9    gene_id "ENSG00000186827"; gene_version "11"; ...
Name: attribute, dtype: object

In [27]:
# more examples of slicing

In [28]:
[x for x in df['attribute'][0:10]]

['gene_id "ENSG00000284662"; gene_version "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding";',
 'gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";',
 'gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; exon_number "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; exon_id "ENSE00002324228"; exon_version "3"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";',
 'gene_i

In [29]:
df['attribute'] == 'ENSG00000284662'

0          False
1          False
2          False
3          False
4          False
           ...  
3145413    False
3145414    False
3145415    False
3145416    False
3145417    False
Name: attribute, Length: 3145418, dtype: bool

In [30]:
df['attribute'].str.contains('ENSG00000284662')

0           True
1           True
2           True
3           True
4           True
           ...  
3145413    False
3145414    False
3145415    False
3145416    False
3145417    False
Name: attribute, Length: 3145418, dtype: bool

In [31]:
# Use the boolean mask from the previous cell to keep only the rows whose
# attribute text mentions this gene identifier.
df.loc[df['attribute'].str.contains('ENSG00000284662')]

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,1,ensembl_havana,gene,685679,686673,.,-,.,"gene_id ""ENSG00000284662""; gene_version ""1""; g..."
1,1,ensembl_havana,transcript,685679,686673,.,-,.,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."
2,1,ensembl_havana,exon,685679,686673,.,-,.,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."
3,1,ensembl_havana,CDS,685719,686654,.,-,0,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."
4,1,ensembl_havana,start_codon,686652,686654,.,-,0,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."
5,1,ensembl_havana,stop_codon,685716,685718,.,-,0,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."
6,1,ensembl_havana,five_prime_utr,686655,686673,.,-,.,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."
7,1,ensembl_havana,three_prime_utr,685679,685715,.,-,.,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."


In [32]:
# Keep only the 'gene' and 'transcript' records and renumber the rows from 0.
# This rebinds `df`, so outputs shown above describe the unfiltered table.
df = (
    df[df['feature'].isin(['gene', 'transcript'])]
    .reset_index(drop=True)
)

In [33]:
df.head(10)

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,1,ensembl_havana,gene,685679,686673,.,-,.,"gene_id ""ENSG00000284662""; gene_version ""1""; g..."
1,1,ensembl_havana,transcript,685679,686673,.,-,.,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."
2,1,ensembl_havana,gene,1211340,1214153,.,-,.,"gene_id ""ENSG00000186827""; gene_version ""11""; ..."
3,1,ensembl_havana,transcript,1211340,1214153,.,-,.,"gene_id ""ENSG00000186827""; gene_version ""11""; ..."
4,1,havana,transcript,1211340,1214138,.,-,.,"gene_id ""ENSG00000186827""; gene_version ""11""; ..."
5,1,havana,transcript,1212019,1213498,.,-,.,"gene_id ""ENSG00000186827""; gene_version ""11""; ..."
6,1,ensembl_havana,gene,1203508,1206592,.,-,.,"gene_id ""ENSG00000186891""; gene_version ""14""; ..."
7,1,ensembl_havana,transcript,1203508,1206571,.,-,.,"gene_id ""ENSG00000186891""; gene_version ""14""; ..."
8,1,ensembl_havana,transcript,1203508,1206592,.,-,.,"gene_id ""ENSG00000186891""; gene_version ""14""; ..."
9,1,havana,transcript,1203844,1205680,.,-,.,"gene_id ""ENSG00000186891""; gene_version ""14""; ..."


In [34]:
# Full attribute strings (as a plain list, so nothing is truncated in the
# display) for every remaining row that mentions this gene identifier.
df.loc[df['attribute'].str.contains('ENSG00000186827'), 'attribute'].tolist()

['gene_id "ENSG00000186827"; gene_version "11"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding";',
 'gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";',
 'gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2";',
 'gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana

In [35]:
# Restrict the table to records whose source is 'ensembl_havana' and
# renumber the rows from 0. `df` is rebound again here.
df = (
    df.loc[df['source'].eq('ensembl_havana')]
    .reset_index(drop=True)
)

In [36]:
df.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,1,ensembl_havana,gene,685679,686673,.,-,.,"gene_id ""ENSG00000284662""; gene_version ""1""; g..."
1,1,ensembl_havana,transcript,685679,686673,.,-,.,"gene_id ""ENSG00000284662""; gene_version ""1""; t..."
2,1,ensembl_havana,gene,1211340,1214153,.,-,.,"gene_id ""ENSG00000186827""; gene_version ""11""; ..."
3,1,ensembl_havana,transcript,1211340,1214153,.,-,.,"gene_id ""ENSG00000186827""; gene_version ""11""; ..."
4,1,ensembl_havana,gene,1203508,1206592,.,-,.,"gene_id ""ENSG00000186891""; gene_version ""14""; ..."


In [37]:
df['seqname'].unique()

array(['1', '2', '3', '4', '5', '6', '7', 'X', '8', '9', '11', '10', '12',
       '13', '14', '15', '16', '17', '18', '20', '19', 'Y', '22', '21'],
      dtype=object)

In [38]:
df['source'].unique()

array(['ensembl_havana'], dtype=object)

In [39]:
# Re-check the low-cardinality columns (< 30 distinct values) after filtering.
[
    [column, df[column].unique()]
    for column in df.columns
    if df[column].nunique(dropna=False) < 30
]

[['seqname',
  array(['1', '2', '3', '4', '5', '6', '7', 'X', '8', '9', '11', '10', '12',
         '13', '14', '15', '16', '17', '18', '20', '19', 'Y', '22', '21'],
        dtype=object)],
 ['source', array(['ensembl_havana'], dtype=object)],
 ['feature', array(['gene', 'transcript'], dtype=object)],
 ['score', array(['.'], dtype=object)],
 ['strand', array(['-', '+'], dtype=object)],
 ['frame', array(['.'], dtype=object)]]

In [40]:
# Spread the attribute string of every transcript row into columns:
#   1. drop the trailing ';' so the split does not yield an empty last field,
#   2. split on ';' into positional columns,
#   3. keep only the text inside the double quotes of each field.
# Columns are positional, not keyed: rows with fewer attributes are padded on
# the right, so a given column number can hold different keys in different rows.
(
    df.loc[df['feature'] == 'transcript', 'attribute']
    .str.replace(';$', '', regex=True)
    .str.split(';', expand=True)
    .replace('.*"(.*?)"', '\\1', regex=True)
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
1,ENSG00000284662,1,ENST00000332831,4,OR4F16,ensembl_havana,protein_coding,OR4F16-201,ensembl_havana,protein_coding,CCDS,CCDS41221,basic,NA (assigned to previous version 3),
3,ENSG00000186827,11,ENST00000379236,4,TNFRSF4,ensembl_havana,protein_coding,TNFRSF4-201,ensembl_havana,protein_coding,CCDS,CCDS11,basic,1 (assigned to previous version 3),
5,ENSG00000186891,14,ENST00000328596,10,TNFRSF18,ensembl_havana,protein_coding,TNFRSF18-201,ensembl_havana,protein_coding,CCDS,CCDS9,basic,1,
6,ENSG00000186891,14,ENST00000379268,7,TNFRSF18,ensembl_havana,protein_coding,TNFRSF18-203,ensembl_havana,protein_coding,CCDS,CCDS10,basic,1 (assigned to previous version 6),
8,ENSG00000160072,20,ENST00000673477,1,ATAD3B,ensembl_havana,protein_coding,ATAD3B-206,ensembl_havana,protein_coding,CCDS,CCDS30,basic,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45850,ENSG00000155313,16,ENST00000351097,9,USP25,ensembl_havana,protein_coding,USP25-203,ensembl_havana,protein_coding,basic,1,,,
45851,ENSG00000155313,16,ENST00000285681,6,USP25,ensembl_havana,protein_coding,USP25-202,ensembl_havana,protein_coding,CCDS,CCDS63337,basic,1,
45852,ENSG00000155313,16,ENST00000400183,7,USP25,ensembl_havana,protein_coding,USP25-204,ensembl_havana,protein_coding,CCDS,CCDS63336,basic,1 (assigned to previous version 6),
45854,ENSG00000276076,5,ENST00000619537,5,CRYAA2,ensembl_havana,protein_coding,CRYAA2-201,ensembl_havana,protein_coding,CCDS,CCDS82651,basic,1 (assigned to previous version 4),


In [41]:
# Same expansion for the gene rows: strip the trailing ';', split the
# attribute string on ';' into positional columns, then reduce every field to
# the text between its double quotes.
(
    df.loc[df['feature'] == 'gene', 'attribute']
    .str.replace(';$', '', regex=True)
    .str.split(';', expand=True)
    .replace('.*"(.*?)"', '\\1', regex=True)
)

Unnamed: 0,0,1,2,3,4
0,ENSG00000284662,1,OR4F16,ensembl_havana,protein_coding
2,ENSG00000186827,11,TNFRSF4,ensembl_havana,protein_coding
4,ENSG00000186891,14,TNFRSF18,ensembl_havana,protein_coding
7,ENSG00000160072,20,ATAD3B,ensembl_havana,protein_coding
9,ENSG00000041988,15,THAP3,ensembl_havana,protein_coding
...,...,...,...,...,...
45843,ENSG00000159200,18,RCAN1,ensembl_havana,protein_coding
45846,ENSG00000142197,12,DOP1B,ensembl_havana,protein_coding
45848,ENSG00000155313,16,USP25,ensembl_havana,protein_coding
45853,ENSG00000276076,5,CRYAA2,ensembl_havana,protein_coding


In [43]:
del df

### BED genes

In [42]:
df_gene = df[df['feature'] == 'gene'].reset_index(drop=True)

In [44]:
# Split each gene's attribute string into its raw 'key "value"' fields
# (trailing ';' removed first so no empty last column is produced).
(
    df_gene['attribute']
    .str.replace(';$', '', regex=True)
    .str.split(';', expand=True)
)

Unnamed: 0,0,1,2,3,4
0,"gene_id ""ENSG00000284662""","gene_version ""1""","gene_name ""OR4F16""","gene_source ""ensembl_havana""","gene_biotype ""protein_coding"""
1,"gene_id ""ENSG00000186827""","gene_version ""11""","gene_name ""TNFRSF4""","gene_source ""ensembl_havana""","gene_biotype ""protein_coding"""
2,"gene_id ""ENSG00000186891""","gene_version ""14""","gene_name ""TNFRSF18""","gene_source ""ensembl_havana""","gene_biotype ""protein_coding"""
3,"gene_id ""ENSG00000160072""","gene_version ""20""","gene_name ""ATAD3B""","gene_source ""ensembl_havana""","gene_biotype ""protein_coding"""
4,"gene_id ""ENSG00000041988""","gene_version ""15""","gene_name ""THAP3""","gene_source ""ensembl_havana""","gene_biotype ""protein_coding"""
...,...,...,...,...,...
19297,"gene_id ""ENSG00000159200""","gene_version ""18""","gene_name ""RCAN1""","gene_source ""ensembl_havana""","gene_biotype ""protein_coding"""
19298,"gene_id ""ENSG00000142197""","gene_version ""12""","gene_name ""DOP1B""","gene_source ""ensembl_havana""","gene_biotype ""protein_coding"""
19299,"gene_id ""ENSG00000155313""","gene_version ""16""","gene_name ""USP25""","gene_source ""ensembl_havana""","gene_biotype ""protein_coding"""
19300,"gene_id ""ENSG00000276076""","gene_version ""5""","gene_name ""CRYAA2""","gene_source ""ensembl_havana""","gene_biotype ""protein_coding"""


In [45]:
# zcat Homo_sapiens.GRCh38.104.chr.gtf.gz | awk 'OFS="\t" {if ($3=="gene") {print $1,$4-1,$5,$10,$14,$7}}' | tr -d '";' > hg38.genes.bed 
# zcat Homo_sapiens.GRCh38.104.chr.gtf.gz | awk 'OFS="\t" {if ($3=="transcript") {print $1,$4-1,$5,$10,$24,$7}}' | tr -d '";' > hg38.transcripts.bed 

In [46]:
# Attach the first three attribute fields to df_gene as new columns.
# The assignment is positional: field 0 -> gene_id, 1 -> gene_version,
# 2 -> gene_name. At this point the values still carry their key and quotes
# (e.g. 'gene_id "ENSG..."'); they are cleaned in a later cell.
new_columns = ['gene_id', 'gene_version', 'gene_name']
df_gene[new_columns] = (
    df_gene['attribute']
    .str.replace(';$', '', regex=True)
    .str.split(';', expand=True)
    .iloc[:, :3]
)

In [47]:
df_gene.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute,gene_id,gene_version,gene_name
0,1,ensembl_havana,gene,685679,686673,.,-,.,"gene_id ""ENSG00000284662""; gene_version ""1""; g...","gene_id ""ENSG00000284662""","gene_version ""1""","gene_name ""OR4F16"""
1,1,ensembl_havana,gene,1211340,1214153,.,-,.,"gene_id ""ENSG00000186827""; gene_version ""11""; ...","gene_id ""ENSG00000186827""","gene_version ""11""","gene_name ""TNFRSF4"""
2,1,ensembl_havana,gene,1203508,1206592,.,-,.,"gene_id ""ENSG00000186891""; gene_version ""14""; ...","gene_id ""ENSG00000186891""","gene_version ""14""","gene_name ""TNFRSF18"""
3,1,ensembl_havana,gene,1471765,1497848,.,+,.,"gene_id ""ENSG00000160072""; gene_version ""20""; ...","gene_id ""ENSG00000160072""","gene_version ""20""","gene_name ""ATAD3B"""
4,1,ensembl_havana,gene,6624866,6635586,.,+,.,"gene_id ""ENSG00000041988""; gene_version ""15""; ...","gene_id ""ENSG00000041988""","gene_version ""15""","gene_name ""THAP3"""


In [48]:
# Reduce df_gene to the columns needed for the BED output and turn the two
# attribute-derived columns into bare values.
#
# After the split on ';' these columns look like 'gene_id "ENSG..."' and
# ' gene_name "OR4F16"' (the second keeps the space that followed the ';').
# Deleting only the quotes and the key therefore leaves leading whitespace in
# every identifier and name. Here the key prefix and quotes are removed and the
# result is stripped, and the clean-up touches only these two columns instead
# of running the substitutions over every column of the frame.
# The prefix pattern is anchored, so re-running the cell on already clean
# values changes nothing.
df_gene = (df_gene[['seqname', 'start', 'end',
                    'gene_id', 'score', 'strand', 'gene_name']]
                 .assign(
                     gene_id=lambda bed: (bed['gene_id']
                                          .str.replace(r'^\s*gene_id\s+', '', regex=True)
                                          .str.replace('"', '', regex=False)
                                          .str.strip()),
                     gene_name=lambda bed: (bed['gene_name']
                                            .str.replace(r'^\s*gene_name\s+', '', regex=True)
                                            .str.replace('"', '', regex=False)
                                            .str.strip()),
                 ))

In [49]:
pd.to_numeric(df_gene['start'])

0          685679
1         1211340
2         1203508
3         1471765
4         6624866
           ...   
19297    34513142
19298    36156782
19299    15729982
19300     6499203
19301    31118416
Name: start, Length: 19302, dtype: int64

In [50]:
df_gene['start'].astype('int64').head()

0     685679
1    1211340
2    1203508
3    1471765
4    6624866
Name: start, dtype: int64

In [51]:
pd.to_numeric(df_gene['start'])-1 

0          685678
1         1211339
2         1203507
3         1471764
4         6624865
           ...   
19297    34513141
19298    36156781
19299    15729981
19300     6499202
19301    31118415
Name: start, Length: 19302, dtype: int64

In [52]:
# Convert the start coordinate from text to int64 and shift it down by one.
# GTF positions are numbered from 1 (see the field description above) and the
# awk one-liners above likewise write $4-1 into the .bed files; presumably
# this is the usual 0-based BED start convention (confirm against the BED spec).
# NOTE: this cell is not idempotent. It reads and overwrites the same column,
# so every additional run subtracts another 1. Run it exactly once per kernel.
df_gene['start'] = df_gene['start'].astype('int64') - 1 

In [53]:
df_gene.head()

Unnamed: 0,seqname,start,end,gene_id,score,strand,gene_name
0,1,685678,686673,ENSG00000284662,.,-,OR4F16
1,1,1211339,1214153,ENSG00000186827,.,-,TNFRSF4
2,1,1203507,1206592,ENSG00000186891,.,-,TNFRSF18
3,1,1471764,1497848,ENSG00000160072,.,+,ATAD3B
4,1,6624865,6635586,ENSG00000041988,.,+,THAP3


In [54]:
df_gene.shape

(19302, 7)

#### Atributos de Pandas
- `df.shape`

#### Métodos de pandas

- `df.head()`
- `df.tail()`
- `df.isna()`
- `df.sum()`
- `df.groupby()`
- `df.count()`
- `df[x].unique()`
- `df.reset_index()`
- `df[x].str.contains()`
- `df.isin()`
- `df.replace()`
- `df[x].str.replace()`
- `df[x].str.split()`
- `pd.to_numeric()`




##### Funciones built-in
- `len()`
- `print()`
