In [10]:
import pandas as pd
from scipy.stats import pearsonr

### 1. Import confident patients

In [6]:
MSS_only = 'MSS_only'
remove_sex_specific = 'remove_sex_specific.'
use_expression = True
remove_homozygous_patients = ''# 'rm_homozygous_patients.'

In [32]:
confident_patients_I = pd.read_csv('/cellar/users/andreabc/Data/hla_sex/generated_data/confident_patient_I.age_sex_disease.{}.{}txt'.format(MSS_only, remove_sex_specific), sep='\t', index_col=0)
confident_patients_II = pd.read_csv('/cellar/users/andreabc/Data/hla_sex/generated_data/confident_patient_II.age_sex_disease.{}.{}txt'.format(MSS_only, remove_sex_specific), sep='\t', index_col=0)

# set age threshold
young_thresh = 52 # 30th percentile: 52, 40th: 57
old_thresh = 68   # 70th percentile: 68, 60th: 65

confident_patients_I['age_categorical_3070'] = pd.cut(confident_patients_I['age'], [0, young_thresh, old_thresh, confident_patients_I['age'].max()], labels=['younger', 'middle', 'older'])
confident_patients_II['age_categorical_3070'] = pd.cut(confident_patients_II['age'], [0, young_thresh, old_thresh, confident_patients_II['age'].max()], labels=['younger', 'middle', 'older'])

### 2. Import MHC types

In [2]:
mhcI_types = pd.read_csv('/cellar/users/andreabc/Data/TCGA/hla_types/all_mhc_i_types.txt', sep='\t', index_col=0)
mhcII_types = pd.read_csv('/cellar/users/andreabc/Data/TCGA/hla_types/all_mhc_ii_types.rachel.txt', sep='\t', index_col=0)

In [13]:
# subset types
mhcI_types = mhcI_types[mhcI_types.index.isin(confident_patients_I.index)]
mhcII_types = mhcII_types[mhcII_types.index.isin(confident_patients_II.index)]
print(len(mhcI_types), len(mhcII_types))

2554 2681


### 3. Check allele frequency
- for sex and age

In [35]:
# get female/male patients
female_I = confident_patients_I[confident_patients_I['sex']=='female'].index.unique()
male_I = confident_patients_I[confident_patients_I['sex']=='male'].index.unique()

female_II = confident_patients_II[confident_patients_II['sex']=='female'].index.unique()
male_II = confident_patients_II[confident_patients_II['sex']=='male'].index.unique()

# get young/old patients
young_I = confident_patients_I[confident_patients_I['age_categorical_3070']=='younger'].index.unique()
old_I = confident_patients_I[confident_patients_I['age_categorical_3070']=='older'].index.unique()

young_II = confident_patients_II[confident_patients_II['age_categorical_3070']=='younger'].index.unique()
old_II = confident_patients_II[confident_patients_II['age_categorical_3070']=='older'].index.unique()

In [36]:
# subset MHC types for sex
female_I_types = mhcI_types[mhcI_types.index.isin(female_I)].values.reshape(-1).tolist()
male_I_types = mhcI_types[mhcI_types.index.isin(male_I)].values.reshape(-1).tolist()

female_II_types = mhcII_types[mhcII_types.index.isin(female_I)].values.reshape(-1).tolist()
male_II_types = mhcII_types[mhcII_types.index.isin(male_I)].values.reshape(-1).tolist()

# subset MHC types for age
young_I_types = mhcI_types[mhcI_types.index.isin(young_I)].values.reshape(-1).tolist()
old_I_types = mhcI_types[mhcI_types.index.isin(old_I)].values.reshape(-1).tolist()

young_II_types = mhcII_types[mhcII_types.index.isin(young_I)].values.reshape(-1).tolist()
old_II_types = mhcII_types[mhcII_types.index.isin(old_I)].values.reshape(-1).tolist()

In [37]:
# get frequencies for sex
male_I_freq = pd.Series(male_I_types, name='male_freq')
male_I_freq = pd.DataFrame(male_I_freq.value_counts() / len(male_I_freq))

female_I_freq = pd.Series(female_I_types, name='female_freq')
female_I_freq = pd.DataFrame(female_I_freq.value_counts() / len(female_I_freq))

male_II_freq = pd.Series(male_II_types, name='male_freq')
male_II_freq = pd.DataFrame(male_II_freq.value_counts() / len(male_II_freq))

female_II_freq = pd.Series(female_II_types, name='female_freq')
female_II_freq = pd.DataFrame(female_II_freq.value_counts() / len(female_II_freq))

In [38]:
# get frequencies for age
old_I_freq = pd.Series(old_I_types, name='old_freq')
old_I_freq = pd.DataFrame(old_I_freq.value_counts() / len(old_I_freq))

young_I_freq = pd.Series(young_I_types, name='young_freq')
young_I_freq = pd.DataFrame(young_I_freq.value_counts() / len(young_I_freq))

old_II_freq = pd.Series(old_II_types, name='old_freq')
old_II_freq = pd.DataFrame(old_II_freq.value_counts() / len(old_II_freq))

young_II_freq = pd.Series(young_II_types, name='young_freq')
young_II_freq = pd.DataFrame(young_II_freq.value_counts() / len(young_II_freq))

In [44]:
# get outputs - sex
mhcI_output_df = pd.merge(male_I_freq, female_I_freq, left_index=True, right_index=True)
mhcII_output_df = pd.merge(male_II_freq, female_II_freq, left_index=True, right_index=True)

print('MHC-I: Sex')
print(pearsonr(mhcI_output_df['male_freq'], 
               mhcI_output_df['female_freq']))

print('MHC-II: Sex')
print(pearsonr(mhcII_output_df['male_freq'], 
               mhcII_output_df['female_freq']))

MHC-I: Sex
(0.9903178691861415, 4.445861410177758e-115)
MHC-II: Sex
(0.995529185579491, 3.450166374983716e-297)


In [45]:
# get outputs - age
mhcI_output_df = pd.merge(old_I_freq, young_I_freq, left_index=True, right_index=True)
mhcII_output_df = pd.merge(old_II_freq, young_II_freq, left_index=True, right_index=True)

print('MHC-I: Age')
print(pearsonr(mhcI_output_df['old_freq'], 
               mhcI_output_df['young_freq']))

print('MHC-II: Age')
print(pearsonr(mhcII_output_df['old_freq'], 
               mhcII_output_df['young_freq']))

MHC-I: Age
(0.9822583531682022, 4.348509930088831e-89)
MHC-II: Age
(0.9946494005261352, 2.017693232470517e-253)
