#### Notebook to plot feature quantification by genotype allele for QTL result
i.e., visualize a genotype's effect on a feature for specific variant(s)

In [None]:
!date

#### import libraries

In [None]:
from pandas import read_csv
import torch
from tensorqtl import read_phenotype_bed
from tensorqtl import pgen
print('PyTorch {}'.format(torch.__version__))
from seaborn import boxenplot, stripplot, lmplot
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# parameters
# Empty-string placeholders for the run-specific settings of this notebook.
# NOTE(review): presumably overridden per run by a notebook parameterization
# tool (e.g. papermill) — confirm how these are injected.
modality = ''  # used in set_name and in the sample-info file name
day = ''  # used in set_name and to filter the interaction-term table by day
feature_name = ''  # used in the figure file name and the plot titles
feature_id = ''  # row label selected from the phenotype table; plotted on the y-axis
variant = ''  # row label selected from the genotype table; used in the figure file name
interaction_term = ''  # covariate column to plot against; '' means no interaction plot
chrom = ''  # suffix of the genotype pfile prefix that gets loaded

In [None]:
# naming
cohort = 'foundin'
set_name = f'{cohort}_{day}_{modality}'

# directories
wrk_dir = '/labshare/raph/datasets/foundin_qtl'
geno_dir = f'{wrk_dir}/genotypes'
quants_dir = f'{wrk_dir}/quants'
figures_dir = f'{wrk_dir}/figures'

# interaction term: an empty parameter means no interaction analysis was
# requested, so normalize it to None and skip the sample-info file entirely
if interaction_term == '':
    interaction_term = None
    term_file = None
else:
    # derived from wrk_dir and cohort so the location is defined in one place
    term_file = f'{wrk_dir}/sample_info/{cohort}_{modality}_sample_info.csv'

# input files
quants_bed_file = f'{quants_dir}/{set_name}.scaled.adj.bed.gz'
pfile_prefix_path = f'{geno_dir}/{cohort}.amppdv1.{chrom}'

# output files
figure_file = f'{figures_dir}/{set_name}.{feature_name}_{variant}.png'

# variables
DEBUG = False
dpi_value = 100

if DEBUG:
    print(f'term_file = {term_file}')
    print(f'quants_bed_file = {quants_bed_file}')
    print(f'pfile_prefix_path = {pfile_prefix_path}')
    print(f'figure_file = {figure_file}')

### load plink pfiles

In [None]:
%%time
# open the plink2 pfile set and pull the genotype table out of it
pgr = pgen.PgenReader(pfile_prefix_path)
genotype_df = pgr.load_genotypes()

# variant annotation table, keyed by variant id and trimmed to the
# position and allele columns
variants_by_id = pgr.pvar_df.set_index('id')
variant_df = variants_by_id[['chrom', 'pos', 'ref', 'alt']]

#### fix the PP- to PPMI prefix

In [None]:
def _to_ppmi_id(sample_id):
    """Swap a leading 'PP-' on a sample id for 'PPMI'; other ids pass through."""
    if sample_id.startswith('PP-'):
        return 'PPMI' + sample_id[len('PP-'):]
    return sample_id


# rename the genotype sample columns using the helper above
genotype_df = genotype_df.rename(columns=_to_ppmi_id)

In [None]:
# report table dimensions; preview the tables only when debugging
for _label, _frame in (('genotypes', genotype_df), ('variants', variant_df)):
    print(f'{_label}: {_frame.shape}')
if DEBUG:
    for _frame in (genotype_df, variant_df):
        display(_frame.head())

#### load phenotypes and covariates (if needed)

In [None]:
%%time

# read the quantified features and their positions from the bed file
phenotype_df, phenotype_pos_df = read_phenotype_bed(quants_bed_file)

# report table dimensions; preview the tables only when debugging
for _label, _frame in (('phenotype_df', phenotype_df),
                       ('phenotype_pos_df', phenotype_pos_df)):
    print(f'{_label} {_frame.shape}')
if DEBUG:
    for _frame in (phenotype_df, phenotype_pos_df):
        display(_frame.head())

In [None]:
# Load the interaction term (when one was requested) into term_df, which has
# the columns 'day', 'sampleid' and the interaction term; otherwise term_df
# is None.
if interaction_term is not None:
    covs_df = read_csv(term_file, index_col=0)
    print(f'covariates_df {covs_df.shape}')
    # go ahead and subset to specified term of interest
    # have two different covariate and interaction term formats to deal with;
    # tell them apart by the columns present instead of catching any error
    if {'day', 'sampleid'}.issubset(covs_df.columns):
        # long format: one row per sample and day, with explicit columns
        term_df = covs_df.loc[(covs_df['day'] == day) &
                              (covs_df['sampleid'].isin(phenotype_df.columns)),
                              ['day', 'sampleid', interaction_term]]
    else:
        # indexed format: rows are keyed by sample id and carry no day column
        term_df = covs_df.loc[covs_df.index.isin(phenotype_df.columns),
                              [interaction_term]].copy()
        # add the missing columns so both formats yield the same layout
        term_df['day'] = day
        term_df['sampleid'] = term_df.index
    print(f'term_df {term_df.shape}')
    if DEBUG:
        display(term_df.head())
else:
    term_df = None

#### subset dataframes to relevant features and transpose

In [None]:
# keep only the row(s) for the requested feature, then flip so that the
# samples become rows and the feature becomes a column
is_feature_row = phenotype_df.index == feature_id
feature_pheno_df = phenotype_df.loc[is_feature_row].transpose()
print(feature_pheno_df.shape)
if DEBUG:
    display(feature_pheno_df.head())

In [None]:
# keep only the row(s) for the requested variant, then flip so that the
# samples become rows and the variant becomes a column
is_variant_row = genotype_df.index == variant
variant_geno_df = genotype_df.loc[is_variant_row].transpose()
print(variant_geno_df.shape)
if DEBUG:
    display(variant_geno_df.head())

### sort out whether the ref or alt allele is the minor allele and recode dosages with alleles for plotting

In [None]:
# Count samples per dosage value for the variant and replace the numeric
# dosages (0/1/2) with allele-pair labels for plotting.
allele_counts = variant_geno_df[variant].value_counts()
var_info = variant_df.loc[variant]
# a homozygous class can be absent entirely (e.g. low-frequency variants);
# treat a missing class as a zero count instead of raising KeyError
hom_zero_count = allele_counts.get(0, 0)
hom_two_count = allele_counts.get(2, 0)
# NOTE(review): the else branch labels dosage 0 as alt/alt. If the loaded
# dosages count the alt allele, dosage 0 is ref/ref whichever allele is
# minor, and those labels would be swapped — confirm the dosage encoding
# of the genotype loader before relying on the labels.
if hom_zero_count > hom_two_count:
    # alt is minor allele
    aa_geno = f'{var_info.ref}/{var_info.ref}'
    ab_geno = f'{var_info.ref}/{var_info.alt}'
    bb_geno = f'{var_info.alt}/{var_info.alt}'
else:
    # ref is minor allele
    aa_geno = f'{var_info.alt}/{var_info.alt}'
    ab_geno = f'{var_info.alt}/{var_info.ref}'
    bb_geno = f'{var_info.ref}/{var_info.ref}'
# do the alleles replacement
variant_geno_df = variant_geno_df.replace({0: aa_geno, 1: ab_geno, 2: bb_geno})
if DEBUG:
    display(allele_counts)
    display(var_info)
    display(variant_geno_df.head())
    display(variant_geno_df[variant].value_counts())

#### merge the data frames

In [None]:
# join genotype labels and feature values on their shared sample index
merged_df = variant_geno_df.merge(feature_pheno_df,
                                  left_index=True, right_index=True,
                                  how='inner')
print(merged_df.shape)

# attach the interaction term when one was loaded, matching the sample
# index against the term table's 'sampleid' column
if term_df is not None:
    merged_df = merged_df.merge(term_df, left_index=True,
                                right_on='sampleid', how='inner')

print(merged_df.shape)
if DEBUG:
    display(merged_df.head())

#### generate the plot(s)

In [None]:
# feature quantification per genotype: boxen plot with the individual
# samples overlaid as a strip plot, saved to figure_file and then shown
figure_rc = {'figure.figsize': (9, 9), 'figure.dpi': dpi_value}
with rc_context(figure_rc):
    plt.style.use('seaborn-v0_8-bright')

    # both layers share the same axes mapping and data
    shared_args = dict(x=variant, y=feature_id, data=merged_df)
    boxenplot(scale='exponential', k_depth='trustworthy', color='purple',
              **shared_args)
    grsplt = stripplot(alpha=0.75, jitter=True, color='darkgrey',
                       **shared_args)

    # re-apply the current tick labels on the strip plot axes
    loc, labels = plt.xticks()
    grsplt.set_xticklabels(labels)

    plt.title(f'{feature_name} quantification by {variant} genotype',
              fontsize='large')
    plt.xlabel('Genotype')
    plt.ylabel('Quantification')
    plt.savefig(figure_file, dpi=dpi_value, transparent=True,
                bbox_inches='tight', pad_inches=1)
    plt.show()

In [None]:
# regression plot of the feature against the interaction term, one fitted
# line per genotype; only drawn when an interaction term was requested
if interaction_term is not None:
    figure_rc = {'figure.figsize': (9, 9), 'figure.dpi': dpi_value}
    with rc_context(figure_rc):
        plt.style.use('seaborn-v0_8-bright')
        lmplot(data=merged_df, x=interaction_term, y=feature_id,
               hue=variant, palette='colorblind', height=12)
        plt.title(f'{feature_name} quantification by {variant} genotype and {interaction_term} interaction',
                  fontsize='large')
        plt.xlabel(f'{interaction_term}')
        plt.ylabel('Quantification')
        plt.show()

In [None]:
!date