Skip to content

Commit

Permalink
Add bedtabix tests for plaintext bed adapter
Browse files Browse the repository at this point in the history
  • Loading branch information
cmdcolin committed Jun 8, 2022
1 parent 8df7c74 commit 29134a1
Show file tree
Hide file tree
Showing 20 changed files with 1,374 additions and 204 deletions.
2 changes: 2 additions & 0 deletions plugins/bed/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@
},
"dependencies": {
"@babel/runtime": "^7.17.9",
"@flatten-js/interval-tree": "^1.0.15",
"@gmod/bbi": "^1.0.35",
"@gmod/bed": "^2.0.6",
"@gmod/bgzf-filehandle": "^1.4.3",
"@gmod/tabix": "^1.5.2"
},
"peerDependencies": {
Expand Down
170 changes: 170 additions & 0 deletions plugins/bed/src/BedAdapter/BedAdapter.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
import { toArray } from 'rxjs/operators'
import BedAdapter from './BedAdapter'
import MyConfigSchema from './configSchema'

import { TextDecoder } from 'web-encoding'
if (!window.TextDecoder) {
window.TextDecoder = TextDecoder
}

test('adapter can fetch features from volvox-bed12.bed', async () => {
const adapter = new BedAdapter(
MyConfigSchema.create({
bedLocation: {
localPath: require.resolve('./test_data/volvox-bed12.bed'),
locationType: 'LocalPathLocation',
},
}),
)

const features = adapter.getFeatures({
refName: 'ctgA',
start: 0,
end: 20000,
assemblyName: 'volvox',
})
expect(await adapter.hasDataForRefName('ctgA')).toBe(true)
expect(await adapter.hasDataForRefName('ctgB')).toBe(false)

const featuresArray = await features.pipe(toArray()).toPromise()
const featuresJsonArray = featuresArray.map(f => f.toJSON())
expect(featuresJsonArray.slice(0, 10)).toMatchSnapshot()
})

test('adapter can fetch features from volvox.sort.bed simple bed3', async () => {
const adapter = new BedAdapter(
MyConfigSchema.create({
bedLocation: {
localPath: require.resolve('./test_data/volvox.sort.bed'),
locationType: 'LocalPathLocation',
},
}),
)

const features = adapter.getFeatures({
refName: 'contigA',
start: 0,
end: 20000,
assemblyName: 'volvox',
})
expect(await adapter.hasDataForRefName('contigA')).toBe(true)
expect(await adapter.hasDataForRefName('ctgB')).toBe(false)

const featuresArray = await features.pipe(toArray()).toPromise()
const featuresJsonArray = featuresArray.map(f => f.toJSON())
expect(featuresJsonArray.slice(0, 10)).toMatchSnapshot()
})

test('adapter can fetch features bed with autosql', async () => {
const adapter = new BedAdapter(
MyConfigSchema.create({
bedLocation: {
localPath: require.resolve('./test_data/volvox-autosql.bed'),
locationType: 'LocalPathLocation',
},

autoSql: `table gdcCancer
"somatic variants converted from MAF files obtained through the NCI GDC"
(
string chrom; "Chromosome (or contig, scaffold, etc.)"
uint chromStart; "Start position in chromosome"
uint chromEnd; "End position in chromosome"
string name; "Name of item"
uint score; "Score from 0-1000"
char[1] strand; "+ or -"
uint thickStart; "Start of where display should be thick (start codon)"
uint thickEnd; "End of where display should be thick (stop codon)"
uint reserved; "Used as itemRgb as of 2004-11-22"
int blockCount; "Number of blocks"
int[blockCount] blockSizes; "Comma separated list of block sizes"
int[blockCount] chromStarts; "Start positions relative to chromStart"
string sampleCount; "Number of samples with this variant"
string freq; "Variant frequency"
lstring Hugo_Symbol; "Hugo symbol"
lstring Entrez_Gene_Id; "Entrez Gene Id"
lstring Variant_Classification; "Class of variant"
lstring Variant_Type; "Type of variant"
lstring Reference_Allele; "Reference allele"
lstring Tumor_Seq_Allele1; "Tumor allele 1"
lstring Tumor_Seq_Allele2; "Tumor allele 2"
lstring dbSNP_RS; "dbSNP RS number"
lstring dbSNP_Val_Status; "dbSNP validation status"
lstring days_to_death; "Number of days till death"
lstring cigarettes_per_day; "Number of cigarettes per day"
lstring weight; "Weight"
lstring alcohol_history; "Any alcohol consumption?"
lstring alcohol_intensity; "Frequency of alcohol consumption"
lstring bmi; "Body mass index"
lstring years_smoked; "Number of years smoked"
lstring height; "Height"
lstring gender; "Gender"
lstring project_id; "TCGA Project id"
lstring ethnicity; "Ethnicity"
lstring Tumor_Sample_Barcode; "Tumor sample barcode"
lstring Matched_Norm_Sample_Barcode; "Matcheds normal sample barcode"
lstring case_id; "Case ID number"
)`,
}),
)
const features = adapter.getFeatures({
refName: 'ctgA',
start: 0,
end: 20000,
assemblyName: 'volvox',
})
expect(await adapter.hasDataForRefName('ctgA')).toBe(true)
expect(await adapter.hasDataForRefName('ctgB')).toBe(false)

const featuresArray = await features.pipe(toArray()).toPromise()
const featuresJsonArray = featuresArray.map(f => f.toJSON())
expect(featuresJsonArray.slice(0, 10)).toMatchSnapshot()
})

test('adapter can fetch bed with header', async () => {
const adapter = new BedAdapter(
MyConfigSchema.create({
bedLocation: {
localPath: require.resolve('./test_data/volvox.sort.with.header.bed'),
locationType: 'LocalPathLocation',
},
}),
)

const features = adapter.getFeatures({
refName: 'contigA',
start: 0,
end: 20000,
assemblyName: 'volvox',
})
expect(await adapter.hasDataForRefName('contigA')).toBe(true)
expect(await adapter.hasDataForRefName('ctgB')).toBe(false)

const featuresArray = await features.pipe(toArray()).toPromise()
const featuresJsonArray = featuresArray.map(f => f.toJSON())
expect(featuresJsonArray.slice(0, 10)).toMatchSnapshot()
})

test('adapter can use gwas header', async () => {
const adapter = new BedAdapter(
MyConfigSchema.create({
bedLocation: {
localPath: require.resolve('./test_data/gwas.bed'),
locationType: 'LocalPathLocation',
},
colRef: 0,
colStart: 1,
colEnd: 1,
}),
)

const features = adapter.getFeatures({
refName: '1',
start: 0,
end: 100_000,
assemblyName: 'hg19',
})

const featuresArray = await features.pipe(toArray()).toPromise()
const featuresJsonArray = featuresArray.map(f => f.toJSON())
expect(featuresJsonArray.slice(0, 10)).toMatchSnapshot()
})
172 changes: 172 additions & 0 deletions plugins/bed/src/BedAdapter/BedAdapter.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
import BED from '@gmod/bed'
import {
BaseFeatureDataAdapter,
BaseOptions,
} from '@jbrowse/core/data_adapters/BaseAdapter'
import { openLocation } from '@jbrowse/core/util/io'
import { ObservableCreate } from '@jbrowse/core/util/rxjs'
import { Region, Feature } from '@jbrowse/core/util'
import { featureData } from '../util'
import IntervalTree from '@flatten-js/interval-tree'
import { unzip } from '@gmod/bgzf-filehandle'

function isGzip(buf: Buffer) {
return buf[0] === 31 && buf[1] === 139 && buf[2] === 8
}

export default class BedAdapter extends BaseFeatureDataAdapter {
protected bedFeatures?: Promise<{
header: string
features: Record<string, string[]>
parser: typeof BED
columnNames: string[]
scoreColumn: string
colRef: number
colStart: number
colEnd: number
}>

protected intervalTrees: {
[key: string]: Promise<IntervalTree | undefined> | undefined
} = {}

public static capabilities = ['getFeatures', 'getRefNames']

private async loadDataP(opts: BaseOptions = {}) {
const pm = this.pluginManager
const bedLoc = this.getConf('bedLocation')
const buf = await openLocation(bedLoc, pm).readFile(opts)
const buffer = isGzip(buf) ? await unzip(buf) : buf
// 512MB max chrome string length is 512MB
if (buffer.length > 536_870_888) {
throw new Error('Data exceeds maximum string length (512MB)')
}
const data = new TextDecoder('utf8', { fatal: true }).decode(buffer)
const lines = data.split('\n').filter(f => !!f)
const headerLines = []
let i = 0
for (; i < lines.length && lines[i].startsWith('#'); i++) {
headerLines.push(lines[i])
}
const header = headerLines.join('\n')
const features = {} as Record<string, string[]>
for (; i < lines.length; i++) {
const line = lines[i]
const tab = line.indexOf('\t')
const refName = line.slice(0, tab)
if (!features[refName]) {
features[refName] = []
}
features[refName].push(line)
}

const autoSql = this.getConf('autoSql') as string
const parser = new BED({ autoSql })
const columnNames = this.getConf('columnNames')
const scoreColumn = this.getConf('scoreColumn')
const colRef = this.getConf('colRef')
const colStart = this.getConf('colStart')
const colEnd = this.getConf('colEnd')

return {
header,
features,
parser,
columnNames,
scoreColumn,
colRef,
colStart,
colEnd,
}
}

private async loadData(opts: BaseOptions = {}) {
if (!this.bedFeatures) {
this.bedFeatures = this.loadDataP(opts).catch(e => {
this.bedFeatures = undefined
throw e
})
}

return this.bedFeatures
}

public async getRefNames(opts: BaseOptions = {}) {
const { features } = await this.loadData(opts)
return Object.keys(features)
}

async getHeader(opts: BaseOptions = {}) {
const { header } = await this.loadData(opts)
return header
}

async getNames() {
const { header, columnNames } = await this.loadData()
if (columnNames.length) {
return columnNames
}
const defs = header.split('\n').filter(f => !!f)
const defline = defs[defs.length - 1]
return defline?.includes('\t')
? defline
.slice(1)
.split('\t')
.map(field => field.trim())
: undefined
}

private async loadFeatureIntervalTreeHelper(refName: string) {
const { colRef, colStart, colEnd, features, parser, scoreColumn } =
await this.loadData()
const lines = features[refName]
if (!lines) {
return undefined
}
const names = await this.getNames()

const intervalTree = new IntervalTree()
const ret = lines.map((f, i) => {
const uniqueId = `${this.id}-${refName}-${i}`
return featureData(
f,
colRef,
colStart,
colEnd,
scoreColumn,
parser,
uniqueId,
names,
)
})

for (let i = 0; i < ret.length; i++) {
const obj = ret[i]
intervalTree.insert([obj.get('start'), obj.get('end')], obj)
}
return intervalTree
}

private async loadFeatureIntervalTree(refName: string) {
if (!this.intervalTrees[refName]) {
this.intervalTrees[refName] = this.loadFeatureIntervalTreeHelper(
refName,
).catch(e => {
this.intervalTrees[refName] = undefined
throw e
})
}
return this.intervalTrees[refName]
}

public getFeatures(query: Region, opts: BaseOptions = {}) {
return ObservableCreate<Feature>(async observer => {
const { start, end, refName } = query
const intervalTree = await this.loadFeatureIntervalTree(refName)
intervalTree?.search([start, end]).forEach(f => observer.next(f))
observer.complete()
}, opts.signal)
}

public freeResources(): void {}
}
Loading

0 comments on commit 29134a1

Please sign in to comment.