In [None]:
import sys
import os
import pandas as pd
from tqdm import tqdm

In [None]:
# Working directory of the notebook; the absolute paths below are built from it.
main_path = os.path.abspath('')
data_dir = os.path.join(main_path, "data")

# Phenotype tag substituted into the per-phenotype file names.
pheno = "bmi"

# ---- paths given relative to the working directory ----
vcf_path = f"./data/ext_prs.90k.{pheno}.vcf"
target_path = f"./data/phenotype.{pheno}.unordered"
ordered_target_path = f"./data/phenotype.{pheno}.ordered"
unordered_covariates_path = "./data/cov.unordered"
ordered_covariates_path = f"./data/cov.{pheno}.ordered"
snp_input_path = f"./data/snps.{pheno}.tsv"

# ---- absolute paths under <main_path>/data ----
target_output_path = os.path.join(data_dir, f"target_{pheno}.csv")
temporary_matrix_path = os.path.join(data_dir, f"temp_feature_matrix_{pheno}.csv")
transposed_feature_matrix_path = os.path.join(data_dir, f"feature_matrix_{pheno}.csv")
feature_cov_path = os.path.join(data_dir, f"feature_cov_matrix_{pheno}.csv")
feature_cov_hla_path = os.path.join(data_dir, f"feature_cov_hla_matrix_{pheno}.csv")
snps_output_path = os.path.join(data_dir, f"snps_found_{pheno}.csv")


# Number of rows handed to the chunked CSV readers per chunk.
chunksize = 10_000

# Collect the sample IDs from the "#CHROM" header line of the VCF.
# Only the leading run of "#"-prefixed lines is scanned.
patient_columns = []
with open(vcf_path, "r") as f:
    for line in f:
        if not line.startswith("#"):
            break
        if line.startswith("#CHROM"):
            # Strip the line terminator before splitting; otherwise the last
            # sample ID keeps a trailing "\n" whenever it contains no "_".
            vcf_columns = line.rstrip("\r\n").split("\t")
            # Fields from index 9 onwards are the per-sample columns; keep only
            # the part of each sample name before the first "_".
            patient_columns = [sample_name.split("_")[0]
                               for sample_name in vcf_columns[9:]]

print(patient_columns[:5], patient_columns[-5:])
num_of_people = len(patient_columns)
print(num_of_people)

In [None]:
# Load the GWAS SNP table and keep the column labelled 3 as a plain list.
# NOTE(review): presumably these are SNP IDs (cf. the commented-out `ID` filter
# in feature_prep_staight) — confirm against the input file.
snp_table = pd.read_csv(snp_input_path, header=None, comment="#", sep=" ")
filter_snps = list(snp_table[3])
filter_snps[:5]

In [None]:
def feature_prep_staight(vcf_path, temporary_matrix_path, num_of_people, chunksize,
                         snps_out_path=None):
    """
    Recode the genotype columns of a VCF into an allele-count matrix on disk.

    The VCF is read in chunks of ``chunksize`` rows. For every chunk the
    (CHROM, ID, POS) triplets are appended to the SNP list file, and the
    genotype strings are recoded ("0/0" -> 0, "0/1" and "1/0" -> 1,
    "1/1" -> 2), cast to int8 and appended to ``temporary_matrix_path`` as
    header-less CSV in (snps, people) layout.

    Parameters
    ----------
    vcf_path : str
        Whitespace-delimited VCF; lines starting with "#" are skipped.
    temporary_matrix_path : str
        Output CSV for the recoded matrix. Removed first if it exists.
    num_of_people : int
        Number of sample columns following the 9 fixed VCF columns.
    chunksize : int
        Number of VCF rows read per chunk.
    snps_out_path : str, optional
        Output CSV for the SNP list. Defaults to the notebook-level
        ``snps_output_path``. Removed first if it exists.

    Returns
    -------
    int
        Number of SNP rows written.

    Raises
    ------
    ValueError
        Genotype strings other than the four listed above are left unchanged
        by the recoding, so the int8 cast fails on them.
    """

    print("Processing VCF and filtering SNPs based GWAS statistics")

    if snps_out_path is None:
        # Backward-compatible default: the path defined at notebook level.
        snps_out_path = snps_output_path

    # Both outputs are written in append mode, so make sure neither exists.
    for stale_path in (temporary_matrix_path, snps_out_path):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    # The column layout is the same for every chunk, so build it only once.
    # Notation for patient "i" is "P{i}".
    column_names = ["CHROM", "POS", "ID", "REF",
                    "ALT", "QUAL", "FILTER", "INFO", "FORMAT"]
    column_names += [f"P{i}" for i in range(1, num_of_people + 1)]
    genotype_codes = {"0/0": 0, "0/1": 1, "1/0": 1, "1/1": 2}

    snps_processed = 0
    total_snps_found = 0

    # sep=r"\s+" replaces the deprecated delim_whitespace=True.
    with pd.read_csv(vcf_path, comment='#', header=None, sep=r"\s+",
                     chunksize=chunksize) as reader:

        for df in reader:
            rows_in_chunk = df.shape[0]

            df.columns = column_names
            df = df.drop(["REF", "ALT", "QUAL", "FILTER",
                          "INFO", "FORMAT"], axis=1)

            # dont filter, filter was applied earlier by position
            # df = df[df['ID'].isin(filter_snps)]
            if not df.empty:
                total_snps_found += df.shape[0]

                # save selected SNPs
                df[["CHROM", "ID", "POS"]].to_csv(snps_out_path, mode='a',
                                                  index=False, header=False)

                df = df.drop(["ID", "CHROM", "POS"], axis=1)
                df = df.replace(genotype_codes)
                df = df.astype('int8')

                df.to_csv(temporary_matrix_path, mode='a',
                          index=False, header=False)

            # Count the rows actually read: the last chunk may be shorter
            # than `chunksize`.
            snps_processed += rows_in_chunk
            if snps_processed % 50_000 == 0:
                print(f"{snps_processed} SNPs processed so far")

    print(f"There are {total_snps_found} SNPs from GWAS filter")

    return total_snps_found


def feature_transpose(temporary_matrix_path, transposed_feature_matrix_path, num_of_selected_SNPs, chunksize):
    """
    Transpose the on-disk (snps, people) matrix into (people, snps) layout.

    The input is read in chunks of ``chunksize`` SNP rows. The output uses a
    fixed-width layout: every value is exactly one character followed by one
    separator byte, so the row of person ``i`` occupies bytes
    ``[2 * N * i, 2 * N * (i + 1))`` with ``N = num_of_selected_SNPs``. Each
    chunk is written into its slot of every row via ``seek``; the rest of the
    row is padded with commas and a newline until later chunks overwrite it.

    Parameters
    ----------
    temporary_matrix_path : str
        Header-less CSV of int8 values in (snps, people) layout.
    transposed_feature_matrix_path : str
        Output CSV path. Removed first if it exists.
    num_of_selected_SNPs : int
        Total number of rows in the input file.
    chunksize : int
        Number of input rows read per chunk.

    Returns
    -------
    tuple
        ``(num_rows, num_cols)`` of the transposed matrix.

    Raises
    ------
    ValueError
        If a value is not a single digit (0-9); such a value would not fit
        the fixed-width layout and would corrupt the output.
    """
    # make sure the file doesn't exist before it is written
    if os.path.exists(transposed_feature_matrix_path):
        os.remove(transposed_feature_matrix_path)

    num_rows = 0
    num_cols = 0

    print("Transposing feature matrix")
    # Ceiling division so that a trailing partial chunk is counted too.
    total_chunks = -(-num_of_selected_SNPs // chunksize)

    with tqdm(total=total_chunks, position=0, leave=True) as pbar:
        with pd.read_csv(temporary_matrix_path, header=None, dtype='int8',
                         chunksize=chunksize) as reader:
            # Binary mode: seeking to computed offsets is only well defined for
            # binary files, and it rules out newline translation, which would
            # break the two-bytes-per-value layout.
            with open(transposed_feature_matrix_path, 'wb') as trans_matrix:

                for chunk_start, df in enumerate(reader):
                    if ((df < 0) | (df > 9)).to_numpy().any():
                        raise ValueError(
                            "feature_transpose needs single-digit values (0-9) "
                            "for its fixed-width layout")

                    chunk = df.T
                    snps_before = chunksize * chunk_start

                    # Placeholder for the SNPs of later chunks; a negative
                    # count (last, partial chunk) yields no padding.
                    padding = b"," * (2 * (num_of_selected_SNPs -
                                           (chunksize + snps_before)))

                    for i, line in chunk.iterrows():
                        # Byte offset of this chunk's slot in the row of person i.
                        new_pos = 2 * (num_of_selected_SNPs * i + snps_before)
                        trans_matrix.seek(new_pos)
                        trans_matrix.write(
                            ",".join(map(str, line)).encode("ascii")
                            + padding + b"\n")

                    pbar.update(1)

                    num_rows = chunk.shape[0]
                    num_cols += chunk.shape[1]

    return (num_rows, num_cols)

In [None]:
# Stage 1: VCF -> temporary allele-count matrix in (snps, people) layout.
num_of_selected_SNPs = feature_prep_staight(
    vcf_path=vcf_path,
    temporary_matrix_path=temporary_matrix_path,
    num_of_people=num_of_people,
    chunksize=chunksize,
)

# Stage 2: transpose the temporary matrix into (people, snps) layout.
feature_matrix_shape = feature_transpose(
    temporary_matrix_path=temporary_matrix_path,
    transposed_feature_matrix_path=transposed_feature_matrix_path,
    num_of_selected_SNPs=num_of_selected_SNPs,
    chunksize=chunksize,
)

In [None]:
# (rows, columns) of the transposed matrix, as returned by feature_transpose
feature_matrix_shape

## Ordering

In [None]:
# Target preparation
# IIDs are read as strings so they compare equal to the sample names taken
# from the VCF header (which are always strings).
target = pd.read_csv(target_path, sep=r"\s+", dtype={"IID": str})
target = target[target["IID"].isin(patient_columns)]
target = target.set_index("IID")
# Put the rows in VCF sample order; samples missing from the file become NaN rows.
target = target.reindex(patient_columns)
target = target.reset_index()
target.to_csv(ordered_target_path, sep="\t", index=False)

# Saving to simple csv
target = pd.read_csv(ordered_target_path, sep="\t")
target = target["bmi_gt25"]
# Fail with a readable message instead of crashing inside int() on NaN.
num_missing = int(target.isna().sum())
if num_missing:
    raise ValueError(
        f"{num_missing} VCF samples have no bmi_gt25 value in {target_path}")
target = target.apply(int)
print(target.value_counts())
target.to_csv(target_output_path, index=False, header=False)

In [None]:
# Covariates preparation
# The ID column (label 0) is read as strings so it compares equal to the
# sample names taken from the VCF header.
cov = pd.read_csv(unordered_covariates_path, sep=r"\s+", header=None,
                  dtype={0: str})
cov = cov[cov[0].isin(patient_columns)]
cov = cov.set_index(0)
# Put the rows in VCF sample order; samples missing from the file become NaN rows.
cov = cov.reindex(patient_columns)
cov = cov.reset_index()
# CHANGEME: leaving only gender
# NOTE(review): two columns (labels 1 and 2) are kept here, not one — confirm
# which of them holds gender.
cov = cov[[1, 2]]
cov.to_csv(ordered_covariates_path, sep=",", index=False, header=False)

## HLA

In [None]:
# HLA files live in <main_path>/data.
hla_dir = os.path.join(main_path, "data")
hla_path = os.path.join(hla_dir, "haplo-parsed-target-2columns.csv")
hla_output_path = os.path.join(hla_dir, "hla_diabet.csv")

In [None]:
# Load the HLA table from hla_path (pandas defaults: comma-separated, first row as header)
df = pd.read_csv(hla_path)

In [None]:
# Drop the "HLA-DQA1"/"HLA-DQB1" columns and rename 'Unnamed: 0' to 'tubeid'.
df = (
    df
    .drop(columns=["HLA-DQA1", "HLA-DQB1"])
    .rename(columns={'Unnamed: 0': 'tubeid'})
)

In [None]:
# Keep only patients present in the VCF and put them in VCF sample order;
# patients without an HLA record end up as NaN rows.
in_vcf = df["tubeid"].isin(patient_columns)
df = (
    df[in_vcf]
    .set_index("tubeid")
    .reindex(patient_columns)
    .reset_index()
)

In [None]:
# Gene prefixes; the allele columns are addressed as "<gene>_1" and "<gene>_2"
genes = ["HLA-DQA1", "HLA-DQB1"]

In [None]:
# A haplotype is kept when its count exceeds this share of all allele entries
# (0.00 keeps every haplotype that occurs at least once).
freq_threshold = 0.00

dfs = []
for gene in genes:
    first_col = gene + "_1"
    second_col = gene + "_2"

    # Pool both allele columns to count how often each haplotype occurs.
    pooled = pd.concat([df[first_col], df[second_col]])
    pooled_counts = pooled.value_counts()
    print(gene, pooled.shape[0], pooled_counts)

    min_count = pooled.shape[0] * freq_threshold
    good_hps = list(pooled_counts[pooled_counts > min_count].index)

    # tubeid -> number of copies (0, 1 or 2) of each kept haplotype,
    # listed in the order of good_hps.
    counts_by_patient = {}
    for _, patient in df.iterrows():
        alleles = (patient[first_col], patient[second_col])
        counts_by_patient[patient["tubeid"]] = [
            sum(1 for allele in alleles if hp == allele)
            for hp in good_hps
        ]

    # The dict keys become columns, so transpose to get one row per patient.
    dfs.append(pd.DataFrame(counts_by_patient).T)

In [None]:
# Place the per-gene count tables side by side (one block of columns per gene)
total_hla_df = pd.concat(dfs, axis=1)

In [None]:
# Inspect the combined HLA count table
total_hla_df

In [None]:
# Restrict to patients present in the VCF, then order the rows like the VCF samples.
known_patients = total_hla_df.index.isin(patient_columns)
total_hla_df = total_hla_df.loc[known_patients].reindex(patient_columns)

In [None]:
# Inspect the table after aligning it to patient_columns
total_hla_df

In [None]:
# NOTE(review): this rebinds hla_path, which earlier pointed at the HLA input
# file; re-running the earlier read cell after this one would load this output
# instead. hla_output_path is defined above but not used here — confirm intent.
hla_path = "./data/hla.diab.csv"
# Write the HLA count matrix without index column or header row
total_hla_df.to_csv(hla_path, index=False, header=False)

## Merging

In [None]:
## Добавим ковариаты в конец матрицы фичей
!paste {transposed_feature_matrix_path} {ordered_covariates_path} | sed 's/\t/,/' > {feature_cov_path}

In [None]:
## Добавим HLA в конец матрицы фичей
!paste {feature_cov_path} {hla_path} | sed 's/\t/,/' > {feature_cov_hla_path}