# Process Individual Genome Builds

## Step 1: Environment Setup

### Import Libraries

In [1]:
import os
import re
import warnings

import pandas as pd
from snps import SNPs
from tqdm import tqdm

from utils_io import capture_print

### Configure Environment

In [2]:
warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)

## Step 2: Process Individual Genome Builds

### Define Paths

In [3]:
# Input directory: raw genome files whose build has not been determined yet.
mixed_path = '../data/ind_genomes/raw/mixed/'

# Output directories, one per genome build, resolved to absolute paths.
build36_path = os.path.abspath('../data/ind_genomes/raw/36/')
build37_path = os.path.abspath('../data/ind_genomes/raw/37/')
build38_path = os.path.abspath('../data/ind_genomes/raw/38/')

# Keep only the .txt inputs, leaving out anything named as an exome VCF.
mixed_files = [
    name
    for name in os.listdir(mixed_path)
    if name.endswith('.txt') and 'exome-vcf' not in name
]

### Detect Each File's Build and Export It as CSV

In [4]:
# Sort every raw genome file into the directory of its detected build.
#
# For each file in ``mixed_files``: pull the user and file IDs out of the
# filename, load it with ``SNPs``, read the detected build, and write it as
# ``user.<user_id>.file.<file_id>.csv`` into the matching build directory.
# Files that cannot be handled are skipped, but each one is recorded in
# ``skipped_files`` with a reason instead of being dropped silently.

# Compile the ID patterns once rather than on every iteration.
user_id_re = re.compile(r'.*user(\d+)')
file_id_re = re.compile(r'.*file(\d+)')

# Destination directory for each build that is kept; files with any other
# detected build value are skipped.
build_dirs = {36: build36_path, 37: build37_path, 38: build38_path}

# (filename, reason) for every input that produced no output file.
skipped_files = []

for f in tqdm(mixed_files):
    user_match = user_id_re.match(f)
    file_match = file_id_re.match(f)
    if user_match is None or file_match is None:
        # Subscripting a failed match raises TypeError, which would abort
        # the whole (hours-long) run because of one oddly named file.
        skipped_files.append((f, 'filename lacks user/file ID'))
        continue
    user_id = int(user_match[1])
    file_id = int(file_match[1])

    try:
        # capture_print comes from utils_io; presumably it swallows what the
        # loader prints -- confirm against its definition.
        with capture_print():
            snps = SNPs(os.path.join(mixed_path, f))
    except Exception as e:
        # Best effort: one unreadable file must not stop the batch.
        skipped_files.append((f, f'read failed: {e!r}'))
        continue

    build = snps.build
    out_dir = build_dirs.get(build)
    if out_dir is None:
        skipped_files.append((f, f'unsupported build: {build!r}'))
        continue
    out_path = f'{out_dir}/user.{user_id}.file.{file_id}.csv'

    try:
        with capture_print():
            snps.to_csv(out_path)
    except Exception as e:
        skipped_files.append((f, f'write failed: {e!r}'))
        continue

# Surface the failures once at the end; details stay in skipped_files.
if skipped_files:
    print(f'Skipped {len(skipped_files)} of {len(mixed_files)} files; see skipped_files')

100%|██████████| 6820/6820 [3:47:55<00:00,  2.01s/it]   
