In [1]:
import pandas as pd
import numpy as np
import math as m
import random as rand
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime
from sklearn import linear_model as lm, metrics, ensemble as ens
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.svm import SVC
from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
import random
from collections import defaultdict
from itertools import islice

In [2]:
#DEFINING A FUNCTION TO UPDATE COLUMN NAMES LATER
def lower_no_space(word): 
    
    word = re.sub(' ', '_', word) 
    
    word = re.sub(r'\'', '', word) 
    
    word = re.sub(r'\(', '', word)
    
    word = re.sub(r'\)', '', word)
    
    word = re.sub('\?', '', word)
    
    word = re.sub('/', '_', word)
    
    word = word.lower()
    
    return word

In [3]:
#READ IN ORIGINAL CLINICAL DATA FOR LATER USE (CONVERTED TO .csv IN GOOGLE SHEETS)
df_clin = pd.read_csv("Homebase.csv", header = 1)

In [4]:
#RENAMING COLUMNS
df_clin = df_clin.rename(mapper = lower_no_space, axis = 1) 
df_clin.rename(columns={'subject_sample_id':'sample_id'}, inplace=True)

In [5]:
df_gen = pd.read_csv("full_gen.csv") #SEE GEN_DF_SETUP

In [6]:
df_gen = df_gen[df_gen['chrom'] != 'Chromosome']
df_gen = df_gen[df_gen['gene_symbol'] != '#version 2.4'] #WEIRD OBS IN DATA
df_gen['chrom'] = df_gen['chrom'].apply(lambda x: '19' if 'gl000209' in str(x) else str(x)) #WEIRD OBS AGAIN
df_gen = df_gen.drop(columns = ['phred']) #USING ONLY RAWSCORE FOR NOW

In [7]:
df_gen['pos'] = df_gen['pos'].astype('float')
df_gen = df_gen[~df_gen['pos'].isna()]


In [8]:
# num_sections = m.ceil(np.max((df_gen['pos'].values.tolist()))/10000000)
#^^CAN USE TO DOUBLE CHECK THAT WE HAVE THE RIGHT NUMBER OF SECTIONS

df_gen['section'] = df_gen['pos'].apply(lambda x: m.ceil(x/1000000))
df_gen['section'] = df_gen['section'].astype('string')
df_gen['section'] = df_gen['section'].astype('string') + '_' + 'chrom' + '_' + df_gen['chrom']


In [9]:
#CHECK GENERAL SHAPE
df_gen.head()

Unnamed: 0.1,Unnamed: 0,gene_symbol,genome_build,chrom,pos,end_position,variant_classification,variant_type,ref,alt,amino_acid_change,sample_id,outcome,mutation,rawscore,section
0,0,GRXCR1,hg19,4,43032391.0,43032391,Missense_Mutation,SNP,C,T,P236L,SS_L1__almeida__SS,1,GRXCR1_Missense_Mutation,2.492629,44_chrom_4
1,1,KCNMA1,hg19,10,78704608.0,78704608,Missense_Mutation,SNP,T,A,"E834V,E884V,E888V,E942V,E925V,E887V",SS_L1__almeida__SS,1,KCNMA1_Missense_Mutation,4.520175,79_chrom_10
2,2,LRRC4C,hg19,11,40136362.0,40136362,Missense_Mutation,SNP,G,A,"T494I,T494I",SS_L1__almeida__SS,1,LRRC4C_Missense_Mutation,1.41389,41_chrom_11
3,3,ABCA9,hg19,17,66986044.0,66986044,Missense_Mutation,SNP,G,A,R1289W,SS_L1__almeida__SS,1,ABCA9_Missense_Mutation,4.432835,67_chrom_17
4,4,BCL7C,hg19,16,30903934.0,30903934,Nonsense_Mutation,SNP,G,A,"Q139X,Q139X",SS_L1__almeida__SS,1,BCL7C_Nonsense_Mutation,7.716384,31_chrom_16


In [10]:
# CREATING DICTIONARY TO COMPARE # OF PEOPLE W/ MUTATIONS FOR A GIVEN GENE IN EACH COHORT
genes_comp = {}

for chrom in ['11', '6', '17', '16', '5', '1', '2', '10', '7', '3', '19', '12', '4']: #chroms come from FI in classifier notebook
    for gene in set(df_gen[df_gen['chrom'] == chrom]['gene_symbol']):
        single_dict = {}
        single_dict['ss'] = len(set(df_gen[(df_gen['outcome'] == 1) & (df_gen['gene_symbol'] == gene)]['sample_id']))
        single_dict['mf'] = len(set(df_gen[(df_gen['outcome'] == 0) & (df_gen['gene_symbol'] == gene)]['sample_id']))
        single_dict['diff'] = single_dict['ss'] - single_dict['mf']
        genes_comp[gene] = single_dict

In [11]:
# ORDER BY DIFF
genes_comp_ordered = {k: v for k, v in sorted(genes_comp.items(), key=lambda item: item[1]['diff'], reverse = True)}


In [12]:
# TAKE A LOOK
genes_comp_ordered

{'OBSCN': {'ss': 28, 'mf': 1, 'diff': 27},
 'TP53': {'ss': 24, 'mf': 1, 'diff': 23},
 'IGFN1': {'ss': 22, 'mf': 1, 'diff': 21},
 'PCLO': {'ss': 22, 'mf': 3, 'diff': 19},
 'MUC4': {'ss': 20, 'mf': 1, 'diff': 19},
 'DNAH14': {'ss': 18, 'mf': 2, 'diff': 16},
 'SSPO': {'ss': 17, 'mf': 1, 'diff': 16},
 'ADGRV1': {'ss': 14, 'mf': 0, 'diff': 14},
 'MEGF6': {'ss': 14, 'mf': 0, 'diff': 14},
 'USH2A': {'ss': 17, 'mf': 3, 'diff': 14},
 'HSPG2': {'ss': 15, 'mf': 1, 'diff': 14},
 'CROCC': {'ss': 14, 'mf': 0, 'diff': 14},
 'MUC6': {'ss': 14, 'mf': 1, 'diff': 13},
 'DNAH5': {'ss': 16, 'mf': 3, 'diff': 13},
 'KIAA1614': {'ss': 13, 'mf': 0, 'diff': 13},
 'FAT4': {'ss': 15, 'mf': 2, 'diff': 13},
 'HLA-B': {'ss': 12, 'mf': 0, 'diff': 12},
 'EYS': {'ss': 13, 'mf': 1, 'diff': 12},
 'MYOM3': {'ss': 12, 'mf': 0, 'diff': 12},
 'ZNF695': {'ss': 12, 'mf': 0, 'diff': 12},
 'PCSK9': {'ss': 12, 'mf': 0, 'diff': 12},
 'PRDM16': {'ss': 15, 'mf': 3, 'diff': 12},
 'SRSF4': {'ss': 12, 'mf': 0, 'diff': 12},
 'LRP1B': {'

In [13]:
# PROPORTIONS WOULD BE BETTER
genes_comp_pct_ordered = genes_comp_ordered.copy()

for gene in genes_comp_pct_ordered.keys():
    genes_comp_pct_ordered[gene]['ss'] /= len(set(df_gen[df_gen['outcome'] == 1]['sample_id']))
    genes_comp_pct_ordered[gene]['mf'] /= len(set(df_gen[df_gen['outcome'] == 0]['sample_id']))
    genes_comp_pct_ordered[gene]['diff'] = genes_comp_pct_ordered[gene]['ss'] - genes_comp_pct_ordered[gene]['mf']

In [14]:
# SHOULDN'T NEED TO RE-SORT, BUT JUST IN CASE

genes_comp_pct_ordered = {k: v for k, v in sorted(genes_comp_pct_ordered.items(), key=lambda item: item[1]['diff'], reverse = True)}
genes_comp_pct_ordered



{'OBSCN': {'ss': 0.22950819672131148,
  'mf': 0.05263157894736842,
  'diff': 0.17687661777394306},
 'TP53': {'ss': 0.19672131147540983,
  'mf': 0.05263157894736842,
  'diff': 0.14408973252804141},
 'IGFN1': {'ss': 0.18032786885245902,
  'mf': 0.05263157894736842,
  'diff': 0.1276962899050906},
 'ADGRV1': {'ss': 0.11475409836065574, 'mf': 0.0, 'diff': 0.11475409836065574},
 'MEGF6': {'ss': 0.11475409836065574, 'mf': 0.0, 'diff': 0.11475409836065574},
 'CROCC': {'ss': 0.11475409836065574, 'mf': 0.0, 'diff': 0.11475409836065574},
 'MUC4': {'ss': 0.16393442622950818,
  'mf': 0.05263157894736842,
  'diff': 0.11130284728213977},
 'KIAA1614': {'ss': 0.10655737704918032,
  'mf': 0.0,
  'diff': 0.10655737704918032},
 'HLA-B': {'ss': 0.09836065573770492, 'mf': 0.0, 'diff': 0.09836065573770492},
 'MYOM3': {'ss': 0.09836065573770492, 'mf': 0.0, 'diff': 0.09836065573770492},
 'ZNF695': {'ss': 0.09836065573770492, 'mf': 0.0, 'diff': 0.09836065573770492},
 'PCSK9': {'ss': 0.09836065573770492, 'mf': 0

In [15]:
# MAKE A DF
impt_genes = pd.DataFrame(columns = ['gene', 'ss', 'mf', 'diff'])
for i in range(len(genes_comp_pct_ordered)):
    gene = list(genes_comp_pct_ordered.keys())[i]
    ss = genes_comp_pct_ordered[list(genes_comp_pct_ordered.keys())[i]]['ss']
    mf = genes_comp_pct_ordered[list(genes_comp_pct_ordered.keys())[i]]['mf']
    diff = genes_comp_pct_ordered[list(genes_comp_pct_ordered.keys())[i]]['diff']
    impt_genes.loc[i] = [gene, ss, mf, diff]

In [16]:
# SAVE CSV
impt_genes.to_csv('gene_cohort_comparison.csv')

note:

chi-square of OBSCN mutation rates was significant at 10%:
https://www.socscistatistics.com/tests/chisquare/default2.aspx

but not sig. for Fisher Exact Test (probs more meaningful for now):
https://www.socscistatistics.com/tests/fisher/default2.aspx

so, potentially something there with a larger sample, but for now that's conjecture