In [23]:
import io
import os
import numpy as np
import pandas as pd
import gzip

In [24]:
!pwd

/root/autodl-tmp/SNP


In [25]:
def get_vcf_names(vcf_path):
    with gzip.open(vcf_path, "rt") as ifile:
          for line in ifile:
            if line.startswith("#CHROM"):
                vcf_names = [x for x in line.split('\t')]
                break
    ifile.close()
    return vcf_names


def read_vcf(path):
    with open(path, 'r') as f:
        lines = [l for l in f if not l.startswith('##')]
    return pd.read_csv(
        io.StringIO(''.join(lines)),
        dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
               'QUAL': str, 'FILTER': str, 'INFO': str},
        sep='\t'
    ).rename(columns={'#CHROM': 'CHROM'})

def in_between(position, relevent):
    appears = False
    for i in range(len(relevent)):
        row = relevent.iloc[i]
        if (position >= relevent.iloc[i].start) and (position <= relevent.iloc[i].end):
            appears = True
    return appears

In [26]:
!rm -rf filtered_vcfs/.ipynb_checkpoints

In [27]:
files = os.listdir("filtered_vcfs/")

files

['ADNI.808_indiv.minGQ_21.pass.ADNI_ID.chr23_filtered.recode.vcf.gz',
 'ADNI.808_indiv.minGQ_21.pass.ADNI_ID.chr8_filtered.recode.vcf.gz',
 'ADNI.808_indiv.minGQ_21.pass.ADNI_ID.chr9_filtered.recode.vcf.gz']

In [28]:
# !gzip -c filtered_vcfs/ADNI.808_indiv.minGQ_21.pass.ADNI_ID.chr1_filtered.recode.vcf > filtered_vcfs/ADNI.808_indiv.minGQ_21.pass.ADNI_ID.chr1_filtered.recode.vcf.gz

In [29]:
genes = pd.read_csv("gene_list.csv")
files = os.listdir("filtered_vcfs/")

for vcf_file in files:
    file_name = "filtered_vcfs/" + vcf_file
    
    with open('log.txt', 'a') as output_file:
        output_file.write(file_name + "\n")
    
    names = get_vcf_names(file_name)
    
    vcf = pd.read_csv(file_name, compression='gzip', comment='#', chunksize=10000, delim_whitespace=True, header=None, names=names)
    vcf = pd.concat(vcf, ignore_index=True)
    
    start = vcf_file.find("ADNI_ID.") + len("ADNI_ID.")
    end = vcf_file.find("_filtered")
    substring = vcf_file[start:end]
    
    relevent = genes[genes["chrom"] == substring]
    relevent = relevent.reset_index()
    
    positions = vcf["POS"]
    
    indexes = []
    for i in range(len(positions)):
        
        boo = in_between(positions[i], relevent)
        if i % 500 == 0:
            output_file = open('log.txt','a')
            output_file.write(" " + str(boo) + " ")
            output_file.close()
        if boo:
            indexes.append(i)
    print(indexes)
    if len(indexes) != 0:
        df = vcf.iloc[indexes]
        df.to_pickle(vcf_file[:-7] + ".pkl")

  vcf = pd.read_csv(file_name, compression='gzip', comment='#', chunksize=10000, delim_whitespace=True, header=None, names=names)


[]


  vcf = pd.read_csv(file_name, compression='gzip', comment='#', chunksize=10000, delim_whitespace=True, header=None, names=names)


[91869, 91870, 91871, 91872, 91873, 91874, 91875, 91876, 91877, 91878, 91879, 91880, 91881, 91882, 91883, 91884, 91885, 91886, 91887, 91888, 91889, 91890, 91891, 91892, 91893, 91894, 91895, 91896, 91897, 91898, 91899, 91900, 91901, 91902, 91903, 91904, 91905, 91906, 91907, 91908, 91909, 91910, 91911, 91912, 91913, 91914, 91915, 91916, 91917, 91918, 91919, 91920, 91921, 91922, 91923, 91924, 91925, 91926, 91927, 91928, 91929, 91930, 91931, 91932, 91933, 91934, 91935, 91936, 91937, 91938, 91939, 91940, 91941, 91942, 91943, 91944, 91945, 91946, 91947, 91948, 91949, 91950, 91951, 91952, 91953, 91954, 91955, 91956, 91957, 91958, 91959, 91960, 91961, 91962, 91963, 91964, 91965, 91966, 91967, 91968, 91969, 91970, 91971, 91972, 91973, 91974, 91975, 91976, 91977, 91978, 91979, 91980, 91981, 91982, 91983, 91984, 91985, 91986, 91987, 91988, 91989, 91990, 91991, 91992, 91993, 91994, 91995, 91996, 91997, 91998, 91999, 92000, 92001, 92002, 92003, 92004, 92005, 92006, 92007, 92008, 92009, 92010, 92011

  vcf = pd.read_csv(file_name, compression='gzip', comment='#', chunksize=10000, delim_whitespace=True, header=None, names=names)


[12301, 12302, 12303, 12304, 12305, 12306, 12307, 12308, 12309, 12310, 12311, 12312, 12313, 12314, 12315, 12316, 12317, 12318, 12319, 12320, 12321, 12322, 12323, 12324, 12325, 12326, 12327, 12328, 12329, 12330, 12331, 12332, 12333, 12334, 12335, 12336, 12337, 12338, 12339, 12340, 12341, 12342, 12343, 12344, 12345, 12346, 12347, 12348, 12349, 12350, 12351, 12352, 12353, 12354, 12355, 12356, 12357, 12358, 12359, 12360, 12361, 12362, 12363, 12364, 12365, 12366, 12367, 12368, 12369, 12370, 12371, 12372, 12373, 12374, 12375, 12376, 12377, 12378, 12379, 12380, 12381, 12382, 12383, 12384, 12385, 12386, 12387, 12388, 12389, 12390, 12391, 12392, 12393, 12394, 12395, 12396, 12397, 12398, 12399, 12400, 12401, 12402, 12403, 12404, 12405, 12406, 12407, 12408, 12409, 12410, 12411, 12412, 12413, 12414, 12415, 12416, 12417, 12418, 12419, 12420, 12421, 12422, 12423, 12424, 12425, 12426, 12427, 12428, 12429, 12430, 12431, 12432, 12433, 12434, 12435, 12436, 12437, 12438, 12439, 12440, 12441, 12442, 12443