### Notebook
- Analysis of MHC genotype frequencies (in Intro)

In [1]:
import pandas as pd
from scipy.stats import pearsonr

### 1. Load data

##### 1A. Load MHC types

In [7]:
mhcI_types = pd.read_csv('../data/confidently_typed_patients.class_i.csv', index_col=0)
mhcII_types = pd.read_csv('../data/confidently_typed_patients.class_ii.csv', index_col=0)

##### 1B. Load patients 
- Remove sex-specific cancers for sex analyses

In [2]:
MSS_only = 'MSS_only'
remove_sex_specific = 'remove_sex_specific.'

In [3]:
confident_patients_rm_sex_I = pd.read_csv('../generated_data/confident_patient_I.age_sex_disease.{}.{}tsv'.format(MSS_only, remove_sex_specific), sep='\t', index_col=0)
confident_patients_rm_sex_II = pd.read_csv('../generated_data/confident_patient_II.age_sex_disease.{}.{}tsv'.format(MSS_only, remove_sex_specific), sep='\t', index_col=0)

# set age threshold
young_thresh = 52 # 30th percentile: 52, 40th: 57
old_thresh = 68   # 70th percentile: 68, 60th: 65

confident_patients_rm_sex_I['age_categorical_3070'] = pd.cut(confident_patients_rm_sex_I['age'], [0, young_thresh, old_thresh, confident_patients_rm_sex_I['age'].max()], labels=['younger', 'middle', 'older'])
confident_patients_rm_sex_II['age_categorical_3070'] = pd.cut(confident_patients_rm_sex_II['age'], [0, young_thresh, old_thresh, confident_patients_rm_sex_II['age'].max()], labels=['younger', 'middle', 'older'])

##### 1C. Load patients
- Keep sex-specific cancers for age analyses

In [4]:
MSS_only = 'MSS_only'
remove_sex_specific = 'kept_sex_specific.'

In [5]:
confident_patients_kept_sex_I = pd.read_csv('../generated_data/confident_patient_I.age_sex_disease.{}.{}tsv'.format(MSS_only, remove_sex_specific), sep='\t', index_col=0)
confident_patients_kept_sex_II = pd.read_csv('../generated_data/confident_patient_II.age_sex_disease.{}.{}tsv'.format(MSS_only, remove_sex_specific), sep='\t', index_col=0)

# set age threshold
young_thresh = 52 # 30th percentile: 52, 40th: 57
old_thresh = 68   # 70th percentile: 68, 60th: 65

confident_patients_kept_sex_I['age_categorical_3070'] = pd.cut(confident_patients_kept_sex_I['age'], [0, young_thresh, old_thresh, confident_patients_kept_sex_I['age'].max()], labels=['younger', 'middle', 'older'])
confident_patients_kept_sex_II['age_categorical_3070'] = pd.cut(confident_patients_kept_sex_II['age'], [0, young_thresh, old_thresh, confident_patients_kept_sex_II['age'].max()], labels=['younger', 'middle', 'older'])

### 2. Format data

##### 2A. Sex-specific analyses 
- Remove sex-specific cancers

In [8]:
# subset types
mhcI_types_rm_sex = mhcI_types[mhcI_types.index.isin(confident_patients_rm_sex_I.index)]
mhcII_types_rm_sex = mhcII_types[mhcII_types.index.isin(confident_patients_rm_sex_II.index)]
print(len(mhcI_types_rm_sex), len(mhcII_types_rm_sex))

2554 2681


##### 2B. Age-specific analyses 
- Keep sex-specific cancers

In [9]:
# subset types
mhcI_types_kept_sex = mhcI_types[mhcI_types.index.isin(confident_patients_kept_sex_I.index)]
mhcII_types_kept_sex = mhcII_types[mhcII_types.index.isin(confident_patients_kept_sex_II.index)]
print(len(mhcI_types_kept_sex), len(mhcII_types_kept_sex))

3166 3036


### 3. Compare allele frequency for each group

In [10]:
# get female/male patients
female_I = confident_patients_rm_sex_I[confident_patients_rm_sex_I['sex']=='female'].index.unique()
male_I = confident_patients_rm_sex_I[confident_patients_rm_sex_I['sex']=='male'].index.unique()

female_II = confident_patients_rm_sex_II[confident_patients_rm_sex_II['sex']=='female'].index.unique()
male_II = confident_patients_rm_sex_II[confident_patients_rm_sex_II['sex']=='male'].index.unique()

# get young/old patients
young_I = confident_patients_kept_sex_I[confident_patients_kept_sex_I['age_categorical_3070']=='younger'].index.unique()
old_I = confident_patients_kept_sex_I[confident_patients_kept_sex_I['age_categorical_3070']=='older'].index.unique()

young_II = confident_patients_kept_sex_II[confident_patients_kept_sex_II['age_categorical_3070']=='younger'].index.unique()
old_II = confident_patients_kept_sex_II[confident_patients_kept_sex_II['age_categorical_3070']=='older'].index.unique()

In [11]:
# subset MHC types for sex
female_I_types = mhcI_types_rm_sex[mhcI_types_rm_sex.index.isin(female_I)].values.reshape(-1).tolist()
male_I_types = mhcI_types_rm_sex[mhcI_types_rm_sex.index.isin(male_I)].values.reshape(-1).tolist()

female_II_types = mhcII_types_rm_sex[mhcII_types_rm_sex.index.isin(female_I)].values.reshape(-1).tolist()
male_II_types = mhcII_types_rm_sex[mhcII_types_rm_sex.index.isin(male_I)].values.reshape(-1).tolist()

# subset MHC types for age
young_I_types = mhcI_types_kept_sex[mhcI_types_kept_sex.index.isin(young_I)].values.reshape(-1).tolist()
old_I_types = mhcI_types_kept_sex[mhcI_types_kept_sex.index.isin(old_I)].values.reshape(-1).tolist()

young_II_types = mhcII_types_kept_sex[mhcII_types_kept_sex.index.isin(young_I)].values.reshape(-1).tolist()
old_II_types = mhcII_types_kept_sex[mhcII_types_kept_sex.index.isin(old_I)].values.reshape(-1).tolist()

In [12]:
# get frequencies for sex
male_I_freq = pd.Series(male_I_types, name='male_freq')
male_I_freq = pd.DataFrame(male_I_freq.value_counts() / len(male_I_freq))

female_I_freq = pd.Series(female_I_types, name='female_freq')
female_I_freq = pd.DataFrame(female_I_freq.value_counts() / len(female_I_freq))

male_II_freq = pd.Series(male_II_types, name='male_freq')
male_II_freq = pd.DataFrame(male_II_freq.value_counts() / len(male_II_freq))

female_II_freq = pd.Series(female_II_types, name='female_freq')
female_II_freq = pd.DataFrame(female_II_freq.value_counts() / len(female_II_freq))

In [13]:
# get frequencies for age
old_I_freq = pd.Series(old_I_types, name='old_freq')
old_I_freq = pd.DataFrame(old_I_freq.value_counts() / len(old_I_freq))

young_I_freq = pd.Series(young_I_types, name='young_freq')
young_I_freq = pd.DataFrame(young_I_freq.value_counts() / len(young_I_freq))

old_II_freq = pd.Series(old_II_types, name='old_freq')
old_II_freq = pd.DataFrame(old_II_freq.value_counts() / len(old_II_freq))

young_II_freq = pd.Series(young_II_types, name='young_freq')
young_II_freq = pd.DataFrame(young_II_freq.value_counts() / len(young_II_freq))

In [14]:
# get outputs - sex
mhcI_output_df = pd.merge(male_I_freq, female_I_freq, left_index=True, right_index=True)
mhcII_output_df = pd.merge(male_II_freq, female_II_freq, left_index=True, right_index=True)

print('MHC-I: Sex')
print(pearsonr(mhcI_output_df['male_freq'], 
               mhcI_output_df['female_freq']))

print('MHC-II: Sex')
print(pearsonr(mhcII_output_df['male_freq'], 
               mhcII_output_df['female_freq']))

MHC-I: Sex
(0.9910125920979792, 1.4104563651723024e-114)
MHC-II: Sex
(0.9977357306032135, 2.848642999557856e-127)


In [15]:
# get outputs - age
mhcI_output_df = pd.merge(old_I_freq, young_I_freq, left_index=True, right_index=True)
mhcII_output_df = pd.merge(old_II_freq, young_II_freq, left_index=True, right_index=True)

print('MHC-I: Age')
print(pearsonr(mhcI_output_df['old_freq'], 
               mhcI_output_df['young_freq']))

print('MHC-II: Age')
print(pearsonr(mhcII_output_df['old_freq'], 
               mhcII_output_df['young_freq']))

MHC-I: Age
(0.9841260895416479, 3.2026728001264255e-94)
MHC-II: Age
(0.9976420118620409, 3.587760253578673e-118)
