In [2]:
# Imports
from pathlib import Path
import pandas as pd
from GenomeSigInfer import download
from GenomeSigInfer.sbs import SBSMatrixGenerator

In [3]:
# Notebook configuration
# Name of the reference genome build used by the cells below
ref_genome = "GRCh37"
# Directory the reference genome is downloaded into
folder_ref_genome_download = Path("../ref_genome")

# Download the reference genome 'GRCh37'
Download the GRCh37 reference genome into the `../ref_genome` folder. According to the library's log output, this can take up to 40 minutes.

In [5]:
# Fetch the reference genome build named by `ref_genome` into `folder_ref_genome_download`.
# The library's own log message warns that this can take up to 40 minutes.
download.download_ref_genome(folder_ref_genome_download, genome=ref_genome)

2023-12-03 18:44:29 - INFO - Beginning downloading of reference GRCh37. This may take up to 40 minutes to complete.
2023-12-03 18:49:17 - INFO - Finished downloading the file
2023-12-03 18:49:46 - INFO - Finished downloading GRCh37 to '..\ref_genome'!


# Create SBS Files

Create multiple SBS files with increasing sequence context.

The sbs.96.txt file contains all of the following pyrimidine single nucleotide variants: N[{C > A, G, or T} or {T > A, G, or C}]N. 4 possible starting nucleotides x 6 pyrimidine variants x 4 possible ending nucleotides = 96 total combinations.

The sbs.1536.txt file contains all of the following pyrimidine single nucleotide variants: NN[{C > A, G, or T} or {T > A, G, or C}]NN. 16 (4x4) possible starting dinucleotides x 6 pyrimidine variants x 16 (4x4) possible ending dinucleotides = 1536 total combinations.

The sbs.24576.txt file contains all of the following pyrimidine single nucleotide variants: NNN[{C > A, G, or T} or {T > A, G, or C}]NNN. 64 (4x4x4) possible starting trinucleotides x 6 pyrimidine variants x 64 (4x4x4) possible ending trinucleotides = 24576 total combinations.

In [4]:
# Input VCF files that the SBS matrices are built from
vcf_files = [
    "../../data/vcf/WES_Other.20180327.simple",
    "../../data/vcf/WGS_Other.20180413.simple",
]
# Output folder for the generated SBS files
folder_sbs = "../SBS"
# Location of the downloaded genome files: <download folder>/<genome name>
folder_ref_genome = folder_ref_genome_download / ref_genome

In [4]:
# Build the SBS matrices from the VCF files; the result files are read back from `folder_sbs` below.
# NOTE: long-running — the logged run below spent roughly 30-40 minutes per chromosome.
SBSMatrixGenerator.generate_sbs_matrix(folder_sbs, vcf_files, folder_ref_genome, ref_genome)

2023-12-03 18:56:57 - INFO - Creating SBS matrices!
2023-12-03 18:56:57 - INFO - Processing VCF files: ..\..\data\vcf\WES_Other.20180327.simple, ..\..\data\vcf\WGS_Other.20180413.simple
2023-12-03 18:57:45 - INFO - Created a large VCF containing 13093659 mutations
2023-12-03 18:58:00 - INFO - Starting on chromosome 1
100%|██████████| 1014649/1014649 [35:34<00:00, 475.41it/s]
2023-12-03 19:33:35 - INFO - Finished chromosome 1

2023-12-03 19:33:35 - INFO - Starting on chromosome 2
100%|██████████| 1114031/1114031 [38:48<00:00, 478.49it/s]
2023-12-03 20:12:23 - INFO - Finished chromosome 2

2023-12-03 20:12:23 - INFO - Starting on chromosome 3
100%|██████████| 928869/928869 [32:21<00:00, 478.36it/s]
2023-12-03 20:44:45 - INFO - Finished chromosome 3

2023-12-03 20:44:45 - INFO - Starting on chromosome 4
100%|██████████| 953436/953436 [33:24<00:00, 475.64it/s]
2023-12-03 21:18:10 - INFO - Finished chromosome 4

2023-12-03 21:18:10 - INFO - Starting on chromosome 5
100%|██████████| 871986/8

In [27]:
# Load the 96-context SBS matrix, keep the first column in front and
# put the remaining columns in sorted order.
df = pd.read_parquet(Path(folder_sbs) / 'sbs.96.parquet')
column_order = [df.columns[0], *sorted(df.columns[1:])]
df = df.loc[:, column_order]
# Preview: first five rows of the leading column and the last three columns
df.head(5).iloc[:, [0, -3, -2, -1]]

Unnamed: 0,MutationType,Thy-AdenoCa::PTC-73C,Thy-AdenoCa::PTC-7C,Thy-AdenoCa::PTC-88C
0,A[C>A]A,8.0,35.0,6.0
1,A[C>A]C,2.0,26.0,2.0
2,A[C>A]G,0.0,19.0,2.0
3,A[C>A]T,2.0,20.0,1.0
4,A[C>G]A,5.0,36.0,6.0


In [28]:
# Load the 1536-context SBS matrix, keep the first column in front and
# put the remaining columns in sorted order.
df = pd.read_parquet(Path(folder_sbs) / 'sbs.1536.parquet')
column_order = [df.columns[0], *sorted(df.columns[1:])]
df = df.loc[:, column_order]
# Preview: first five rows of the leading column and the last three columns
df.head(5).iloc[:, [0, -3, -2, -1]]

Unnamed: 0,MutationType,Thy-AdenoCa::PTC-73C,Thy-AdenoCa::PTC-7C,Thy-AdenoCa::PTC-88C
0,AA[C>A]AA,3.0,7.0,3.0
1,AA[C>A]AC,0.0,1.0,0.0
2,AA[C>A]AG,0.0,1.0,0.0
3,AA[C>A]AT,0.0,1.0,0.0
4,AA[C>A]CA,0.0,2.0,0.0


In [29]:
# Load the 24576-context SBS matrix, keep the first column in front and
# put the remaining columns in sorted order.
df = pd.read_parquet(Path(folder_sbs) / 'sbs.24576.parquet')
column_order = [df.columns[0], *sorted(df.columns[1:])]
df = df.loc[:, column_order]
# Preview: first five rows of the leading column and the last three columns
df.head(5).iloc[:, [0, -3, -2, -1]]

Unnamed: 0,MutationType,Thy-AdenoCa::PTC-73C,Thy-AdenoCa::PTC-7C,Thy-AdenoCa::PTC-88C
0,AAA[C>A]AAA,1.0,4.0,1.0
1,AAA[C>A]AAC,0.0,0.0,0.0
2,AAA[C>A]AAG,0.0,1.0,0.0
3,AAA[C>A]AAT,0.0,1.0,0.0
4,AAA[C>A]ACA,0.0,0.0,0.0
