In [1]:
import pyranges as pr
import pandas as pd
import numpy as np
import sqlite3
import gffutils
import time
import pybedtools
import gtf_to_df
import os

### Reading GTF into pandas dataframe using pyranges

In [2]:
%timeit human_df = pr.read_gtf("Homo_sapiens.GRCh38.112.chr.gtf",as_df=True)

1min 48s ± 4.59 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Reading GTF into pandas dataframe using custom Python function

In [2]:
%timeit human_df2 = gtf_to_df.dataframe("Homo_sapiens.GRCh38.112.chr.gtf") # 341s

### Converting pandas dataframe to in-memory SQLite database

In [3]:
human_df = pr.read_gtf("Homo_sapiens.GRCh38.112.chr.gtf",as_df=True)
conn = sqlite3.connect(":memory:")
%timeit human_df.to_sql("human", conn, if_exists="replace", index=False)
cur = conn.cursor()

47.5 s ± 3.56 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Compiling and running the C++ program that reads the GTF into an in-memory SQLite database

In [5]:
# Build the C++ loader first and stop if the compiler reports a failure, so
# that a stale or missing ./a.out is never executed by mistake.
compile_status = os.system("g++ gtf_to_sql.cpp -lsqlite3")
if compile_status != 0:
    raise RuntimeError(f"g++ gtf_to_sql.cpp failed with status {compile_status}")
# Last expression of the cell: its return status is displayed as the cell output.
os.system("./a.out")
# %timeit os.system("./a.out")

0

In [2]:
# Manual GTF load: read the nine tab-separated columns with pandas, then
# replace the Attribute column with the columns built by pr.readers.to_rows.
names = "Chromosome Source Feature Start End Score Strand Frame Attribute".split()
dtypes = {"Chromosome": "category", "Feature": "category", "Strand": "category"}
human_df3 = pd.read_csv(
    "Homo_sapiens.GRCh38.112.chr.gtf",
    sep="\t",
    header=None,
    names=names,
    dtype=dtypes,  # type: ignore
    skiprows=pr.readers.skiprows("Homo_sapiens.GRCh38.112.chr.gtf"),
    nrows=None,
)
_to_rows = pr.readers.to_rows
# Align the parsed attributes with the main frame's index before joining.
extra = _to_rows(human_df3.Attribute, ignore_bad=False).set_index(human_df3.index)
human_df3 = pd.concat(
    [human_df3.drop("Attribute", axis=1), extra],
    axis=1,
    sort=False,
)  # 100s

In [2]:
# Read the raw GTF columns only; the Attribute column is parsed in later cells.
gtf_path = "Homo_sapiens.GRCh38.112.chr.gtf"
names = "Chromosome Source Feature Start End Score Strand Frame Attribute".split()
# The three listed columns are loaded with the "category" dtype.
dtypes = dict.fromkeys(("Chromosome", "Feature", "Strand"), "category")
human_df4 = pd.read_csv(
    gtf_path,
    sep="\t",
    header=None,
    names=names,
    dtype=dtypes,  # type: ignore
    skiprows=pr.readers.skiprows(gtf_path),
    nrows=None,
)

In [3]:
def _attribute_dict(attribute_field):
    """Turn one Attribute string into a dict built from the key/value pairs
    returned by pr.readers.parse_kv_fields."""
    return dict(pr.readers.parse_kv_fields(attribute_field))


# One dict per GTF row, assembled into a DataFrame.
loldf = pd.DataFrame.from_records(human_df4.Attribute.apply(_attribute_dict))

In [4]:
# Give the parsed attributes the same index as human_df4, then swap them in
# for the raw Attribute column.
loldf = loldf.set_index(human_df4.index)
human_df4 = pd.concat(
    [human_df4.drop(columns="Attribute"), loldf],
    axis=1,
    sort=False,
)  # 100s