In [None]:
import pandas as pd
import shap
from sklearn.ensemble import RandomForestClassifier
from pycaret.classification import *
import numpy as np
import seaborn as sns
from sklearn.model_selection import cross_val_score, RepeatedKFold, StratifiedKFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from tsfresh import extract_features, select_features
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu,ttest_ind
from statsmodels.stats.multitest import multipletests
import gzip
import io
from matplotlib.patches import Circle

In [None]:
# Notebook-wide switch: when True, the three base random forests are set up on the
# sex-balanced training subsets (balanced_trainingG / S / P) built in the cells below;
# when False they are set up on the full training splits.
balanced = True

In [None]:
# Input tables (contents described per the file names; one table per data modality):
#   gene_matrix          - normalised per-gene mutation matrix
#   gene_matrix_non_norm - the same matrix without normalisation
#   snp_matrix           - genotype export (.raw) of the LD-pruned SNPs
#   prs_matrix           - genotype export (.raw) of the PRS SNPs
gene_matrix = pd.read_csv(
    'FinalData/SFARI_LDfirst_mutation_matrix_normalised_rnd_noXY.txt',
    sep=',',
)
gene_matrix_non_norm = pd.read_csv(
    'FinalData/SFARI_LDfirst_mutation_matrix_noXY.txt',
    sep=',',
)
snp_matrix = pd.read_csv(
    'FinalData/small_LDpruned_allSNPs5e-5_training_inc_noID.raw',
    sep=' ',
)
prs_matrix = pd.read_csv(
    'FinalData/all_CG_PRS_SNPs.raw',
    sep=' ',
)

In [None]:
# Split gene data into 56:24:20 (training : test : unseen test)
# Keep only individuals whose PHENOTYPE is 1 or 2. A boolean mask is used instead of
# looking rows up by position, so the cell does not depend on a default RangeIndex and
# can safely be re-run after gene_matrix has already been filtered.
has_label = gene_matrix['PHENOTYPE'].isin([1, 2])
gene_matrix = gene_matrix.loc[has_label].fillna(0)
y = (gene_matrix['PHENOTYPE'] - 1).tolist()  # recode 1/2 -> 0/1
X = gene_matrix.drop(columns=['PHENOTYPE', 'SEX', 'IID'])

# 80/20 split first, then 70/30 of the 80% -> 56% training, 24% test, 20% unseen test
X_trainG, X_testG, y_trainG, y_testG = train_test_split(X, y, test_size=0.2, random_state=10)
X_train2G, X_test2G, y_train2G, y_test2G = train_test_split(X_trainG, y_trainG, test_size=0.3, random_state=10)

# Re-attach the labels so each frame can be handed to the classification experiment
trainingG = X_train2G.copy()
trainingG['PHENOTYPE'] = y_train2G
testG = X_test2G.copy()
testG['PHENOTYPE'] = y_test2G

unseen_testG = X_testG.copy()
unseen_testG['PHENOTYPE'] = y_testG

In [None]:
# Sex-balance the class-1 (autistic) part of the gene training split, if requested:
# males are randomly subsampled down to the number of females; class-0 rows are kept as-is.
if balanced:
    is_autistic = trainingG['PHENOTYPE'] == 1
    trainingG_aut = trainingG[is_autistic]
    trainingG_neuro = trainingG[~is_autistic]  # everything that is not class 1

    # SEX was dropped from the feature matrix, so look it up in gene_matrix by row label
    sex_of_aut = gene_matrix.loc[trainingG_aut.index, 'SEX']
    males_aut = trainingG_aut[sex_of_aut == 1]
    females_aut = trainingG_aut[sex_of_aut == 2]
    n_males = len(males_aut)

    males_aut_balanced = males_aut.sample(n=len(females_aut), random_state=10)
    balanced_aut_data = pd.concat([males_aut_balanced, females_aut])
    balanced_trainingG = pd.concat([balanced_aut_data, trainingG_neuro])

In [None]:
## Replace '.' names from Fisher SNPs with the position
traw = pd.read_csv('FinalData/small_LDpruned_allSNPs5e-5_training_inc_noID.traw', sep = '\t')

# Column k of snp_matrix is matched to row k - 6 of the .traw table, i.e. the first six
# columns of snp_matrix are assumed to be non-genotype columns - TODO confirm against the files.
traw_chr = traw['CHR'].values
traw_pos = traw['POS'].values
traw_counted = traw['COUNTED'].values

mapping = {}
for colno, old_name in enumerate(snp_matrix.columns):
    if old_name.split('_')[0] == '.':
        # unnamed variant: label it CHR_POS_COUNTED instead
        row = colno - 6
        mapping[old_name] = f"{traw_chr[row]}_{traw_pos[row]}_{traw_counted[row]}"
    else:
        mapping[old_name] = old_name
snp_matrix.rename(columns=mapping, inplace=True)

# Split SNP data into 56:24:20 (training : test : unseen test)
# Keep only individuals whose PHENOTYPE is 1 or 2. The boolean mask replaces the
# positional lookup loop, so the cell no longer needs a default RangeIndex and can be
# re-run after snp_matrix has already been filtered.
snp_has_label = snp_matrix['PHENOTYPE'].isin([1, 2])
snp_matrix = snp_matrix.loc[snp_has_label]
snp_y = (snp_matrix['PHENOTYPE'] - 1).tolist()  # recode 1/2 -> 0/1
snp_X = snp_matrix.drop(columns=['FID', 'IID', 'PAT', 'MAT', 'PHENOTYPE', 'SEX']).fillna(0)

# Same split sizes and seed as the gene data
X_trainS, X_testS, y_trainS, y_testS = train_test_split(snp_X, snp_y, test_size=0.2, random_state=10)
X_train2S, X_test2S, y_train2S, y_test2S = train_test_split(X_trainS, y_trainS, test_size=0.3, random_state=10)
trainingS = X_train2S.copy()
trainingS['PHENOTYPE'] = y_train2S
testS = X_test2S.copy()
testS['PHENOTYPE'] = y_test2S

unseen_testS = X_testS.copy()
unseen_testS['PHENOTYPE'] = y_testS

if balanced:
    # Reuse the individuals chosen for the balanced gene training set. This relies on the
    # SNP and gene matrices sharing row labels and hence the same split - TODO confirm.
    balanced_trainingS = trainingS.loc[balanced_trainingG.index]

In [None]:
# Split PRS data into 56:24:20 (training : test : unseen test)
# Keep only individuals whose PHENOTYPE is 1 or 2. The boolean mask replaces the
# positional lookup loop, so the cell no longer needs a default RangeIndex and can be
# re-run after prs_matrix has already been filtered.
prs_has_label = prs_matrix['PHENOTYPE'].isin([1, 2])
prs_matrix = prs_matrix.loc[prs_has_label]
prs_y = (prs_matrix['PHENOTYPE'] - 1).tolist()  # recode 1/2 -> 0/1
# SEX is deliberately kept as a feature here; later cells read it back from the predictions
prs_X = prs_matrix.drop(columns=['FID', 'IID', 'PAT', 'MAT', 'PHENOTYPE']).fillna(0)

# Same split sizes and seed as the gene data
X_trainP, X_testP, y_trainP, y_testP = train_test_split(prs_X, prs_y, test_size=0.2, random_state=10)
X_train2P, X_test2P, y_train2P, y_test2P = train_test_split(X_trainP, y_trainP, test_size=0.3, random_state=10)
trainingP = X_train2P.copy()
trainingP['PHENOTYPE'] = y_train2P
testP = X_test2P.copy()
testP['PHENOTYPE'] = y_test2P

unseen_testP = X_testP.copy()
unseen_testP['PHENOTYPE'] = y_testP

if balanced:
    # Reuse the individuals chosen for the balanced gene training set. This relies on the
    # PRS and gene matrices sharing row labels and hence the same split - TODO confirm.
    balanced_trainingP = trainingP.loc[balanced_trainingG.index]

In [None]:
# Random forest on the gene data: set up, fit, tune for F1, then score every split
gene = ClassificationExperiment()
gene_fit_data = balanced_trainingG if balanced else trainingG
gene.setup(gene_fit_data, target='PHENOTYPE', session_id=10, test_data=testG)

rf = gene.create_model('rf')
tuned_rf = gene.tune_model(rf, optimize='F1')

pred_gene = gene.predict_model(tuned_rf)                           # no explicit data passed
pred_test_gene = gene.predict_model(tuned_rf, data=unseen_testG)   # unseen 20% split
pred_train_gene = gene.predict_model(tuned_rf, data=trainingG)     # full (unbalanced) training split

In [None]:
# Feature selection on the Fisher SNP training split (Mann-Whitney U, FDR level 0.05);
# the test split is then restricted to the same columns, both in sorted column order.
X_train_mwu = select_features(
    X_train2S,
    np.array(y_train2S),
    test_for_binary_target_real_feature='mann',
    test_for_real_target_binary_feature='mann',
    fdr_level=0.05,
)
shared_cols = X_test2S.columns.intersection(X_train_mwu.columns)
X_test_mwu = X_test2S[shared_cols]
X_test_mwu = X_test_mwu.reindex(columns=sorted(X_test_mwu.columns))
X_train_mwu = X_train_mwu.reindex(columns=sorted(X_train_mwu.columns))

# Replace the SNP training/test frames with their feature-selected versions
trainingS = X_train_mwu
trainingS['PHENOTYPE'] = y_train2S
testS = X_test_mwu
testS['PHENOTYPE'] = y_test2S

if balanced:
    balanced_trainingS = balanced_trainingS.loc[:, trainingS.columns]

# Random forest on the Fisher SNP data: set up, fit, tune for F1, then score every split
snp = ClassificationExperiment()
snp_fit_data = balanced_trainingS if balanced else trainingS
snp.setup(snp_fit_data, target='PHENOTYPE', session_id=10, test_data=testS)

rfS = snp.create_model('rf')
tuned_rfS = snp.tune_model(rfS, optimize='F1')

pred_snp = snp.predict_model(tuned_rfS)                           # no explicit data passed
pred_test_snp = snp.predict_model(tuned_rfS, data=unseen_testS)   # unseen 20% split
pred_train_snp = snp.predict_model(tuned_rfS, data=trainingS)     # full (unbalanced) training split

In [None]:
# Random forest on the PRS data: set up, fit, tune for F1, then score every split
prs = ClassificationExperiment()
prs_fit_data = balanced_trainingP if balanced else trainingP
prs.setup(prs_fit_data, target='PHENOTYPE', session_id=10, test_data=testP)

rfP = prs.create_model('rf')
tuned_rfP = prs.tune_model(rfP, optimize='F1')

pred_prs = prs.predict_model(tuned_rfP)                           # no explicit data passed
pred_test_prs = prs.predict_model(tuned_rfP, data=unseen_testP)   # unseen 20% split
pred_train_prs = prs.predict_model(tuned_rfP, data=trainingP)     # full (unbalanced) training split

In [None]:
# create datasets for ensemble model made up of the predictive scores from the previous three rf and biological sex
def signed_scores(pred):
    """Return the prediction scores of ``pred`` with a sign that encodes the predicted class.

    ``pred`` is a prediction frame with 'prediction_score' and 'prediction_label'
    columns. Scores of rows predicted as class 0 (neurotypical) are negated; all other
    scores are returned unchanged.

    The operation is vectorised: the previous element-wise chained assignment
    (``frame['col'][i] = ...``) triggers pandas' chained-assignment warnings and does not
    modify the frame at all when Copy-on-Write is active.
    """
    score = pred['prediction_score']
    return score.where(pred['prediction_label'] != 0, -score)


def ensemble_frame(gene_pred, snp_pred, prs_pred):
    """Assemble one ensemble dataset from the three base models' prediction frames.

    Columns: signed 'gene', 'snp' and 'prs' scores, 'SEX' (taken from the PRS
    predictions) and 'PHENOTYPE' (taken from the SNP predictions). The three frames
    must describe the same individuals in the same order and share the same index.
    """
    frame = pd.DataFrame({'gene': signed_scores(gene_pred)})
    frame['snp'] = signed_scores(snp_pred)
    frame['prs'] = signed_scores(prs_pred)
    frame['SEX'] = prs_pred['SEX']
    frame['PHENOTYPE'] = snp_pred['PHENOTYPE']
    return frame


#dataset that will be used for ensemble training
pred_gene = pred_gene.reset_index(drop=True)
pred_snp = pred_snp.reset_index(drop=True)
pred_prs = pred_prs.reset_index(drop=True)
comb_pred = ensemble_frame(pred_gene, pred_snp, pred_prs)

#dataset for unseen test
pred_test_gene = pred_test_gene.reset_index(drop=True)
pred_test_snp = pred_test_snp.reset_index(drop=True)
pred_test_prs = pred_test_prs.reset_index(drop=True)
comb_test = ensemble_frame(pred_test_gene, pred_test_snp, pred_test_prs)

#dataset that the og models were trained on
pred_train_gene = pred_train_gene.reset_index(drop=True)
pred_train_snp = pred_train_snp.reset_index(drop=True)
pred_train_prs = pred_train_prs.reset_index(drop=True)
comb_train = ensemble_frame(pred_train_gene, pred_train_snp, pred_train_prs)

In [None]:
# train and predict using ensemble model
# The ensemble is a random forest over the signed base-model scores plus SEX.
# It is set up on comb_pred, with comb_test passed as the experiment's test data.
comb = ClassificationExperiment()
comb.setup(comb_pred, target = 'PHENOTYPE', session_id=10, test_data = comb_test, index=False)
rfC = comb.create_model('rf')
tuned_rfC = comb.tune_model(rfC, optimize = 'F1')  # tuning optimises F1
# No data argument: presumably scores the experiment's test data (comb_test) - confirm against the PyCaret docs
predC = comb.predict_model(tuned_rfC)
# Despite the name, this scores comb_pred, i.e. the data the ensemble was set up on
predC_test = comb.predict_model(tuned_rfC, data=comb_pred)
# Scores built from the base models' predictions on their own training split
predC_train = comb.predict_model(tuned_rfC, data=comb_train)

In [None]:
# how well does it predict for each sex?
# predC holds the ensemble's predictions for comb_test, whose rows were re-indexed
# positionally, whereas snp_matrix is still indexed by the original row labels.
# Assigning snp_matrix['SEX'] therefore aligned on unrelated labels and attached the
# wrong individuals' sex. SEX is already one of the ensemble features, so use that.
if 'SEX' not in predC.columns:
    # Fallback: copy positionally from the scored test data
    # (assumes predict_model keeps comb_test's row order - confirm).
    predC['SEX'] = comb_test['SEX'].to_numpy()


def share_correct(n_correct, n_predicted):
    """Fraction of predictions that were correct, or NaN when nothing was predicted."""
    return n_correct / n_predicted if n_predicted else float('nan')


# Individuals predicted autistic (label 1) and the subset that truly are
aut_pred = predC[predC['prediction_label'] == 1]
aut_corr = aut_pred[aut_pred['PHENOTYPE']==1]
print('Males predicted autistic:\t',len(aut_pred[aut_pred['SEX'] == 1]))
print('What percentage of these are correct:\t', share_correct(len(aut_corr[aut_corr['SEX']==1]), len(aut_pred[aut_pred['SEX'] == 1])))
print('Females predicted autistic:',len(aut_pred[aut_pred['SEX'] == 2]))
print('What percentage of these are correct:\t', share_correct(len(aut_corr[aut_corr['SEX']==2]), len(aut_pred[aut_pred['SEX'] == 2])))

# Individuals predicted neurotypical (label 0) and the subset that truly are
naut_pred = predC[predC['prediction_label'] == 0]
naut_corr = naut_pred[naut_pred['PHENOTYPE']==0]
print('Males predicted neurotypical:\t',len(naut_pred[naut_pred['SEX'] == 1]))
print('What percentage of these are correct:\t', share_correct(len(naut_corr[naut_corr['SEX']==1]), len(naut_pred[naut_pred['SEX'] == 1])))
print('Females predicted neurotypical:',len(naut_pred[naut_pred['SEX'] == 2]))
print('What percentage of these are correct:\t', share_correct(len(naut_corr[naut_corr['SEX']==2]), len(naut_pred[naut_pred['SEX'] == 2])))

In [None]:
# get shap values for overall model
save = False  # set to True to also write the figure to shap_summary.png
explainer = shap.Explainer(tuned_rfC)
X_test_comb = comb_test.drop(columns='PHENOTYPE')  # ensemble features of the unseen test set
shap_values = explainer.shap_values(X_test_comb)
# Index 1 on the last axis is taken as the SHAP values for class 1; this assumes a
# (samples, features, classes) array - confirm for the installed shap version.
shap.summary_plot(shap_values[:,:,1], X_test_comb, show=False)
# show=False keeps the figure open so the axis limits can be fixed before saving/showing
ax = plt.gca()
ax.set_xlim(-0.28,0.17)
if save:
    plt.savefig('shap_summary.png', dpi=700)
plt.show()

In [None]:
# dependence plot of the Fisher SNP score's SHAP values, coloured by biological sex
save = False  # set to True to also write the figure to snp_sex_dependence.png
shap.dependence_plot("snp", shap_values[:,:,1], X_test_comb,interaction_index="SEX", show=False)
# show=False keeps the figure open so the axis limits can be fixed before saving/showing
ax = plt.gca()
ax.set_xlim(-1,1)
ax.set_ylim(-0.28,0.18)
if save:
    plt.savefig('snp_sex_dependence.png', dpi=700)
plt.show()

In [None]:
## Find the locations of all the SNPs in SNP matrix. Since some are '.' need to download the matrix with the locations
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True
positions = pd.read_csv('FinalData/plink.assoc1711LD0.5_small.fisher', sep=r'\s+')
chosen_snps = pd.read_csv('FinalData/small_LDpruned_allSNPs5e-5_training_inc_noID.txt', sep = '\t', header=None)

# Keep the first two columns (chromosome, base-pair position) and attach the
# CHR / SNP / BP information from the association results
chosen_snps = chosen_snps.drop(columns=[2,3])
chosen_snps.columns = ['chrom', 'bp']
chosen_snps = pd.merge(chosen_snps, positions, left_on=['chrom', 'bp'], right_on=['CHR', 'BP'], how='left').drop(columns=['chrom', 'bp', 'A1', 'F_A', 'F_U', 'A2', 'P', 'OR'])

# Give unnamed SNPs ('.') a CHR_BP identifier. A single .loc assignment replaces the
# per-row chained assignment, which pandas warns about and which has no effect under
# Copy-on-Write.
unnamed = chosen_snps['SNP'] == '.'
chosen_snps.loc[unnamed, 'SNP'] = (
    chosen_snps.loc[unnamed, 'CHR'].astype(str) + '_' + chosen_snps.loc[unnamed, 'BP'].astype(str)
)

# SHAP values of the SNP random forest on its test split, then the mean absolute
# value per SNP (last-axis index 1), sorted from most to least influential
X_test_S = testS.drop(columns=['PHENOTYPE'])
explainerS = shap.Explainer(tuned_rfS)
shap_valuesS = explainerS.shap_values(X_test_S)

snp_shap_class1 = pd.DataFrame(shap_valuesS[:,:,1], columns=X_test_S.columns)
mean_shap_feature_values = snp_shap_class1.abs().mean(axis=0).sort_values(ascending=False)

In [None]:
# Turn the per-SNP mean |SHAP| series into a two-column table (snp, shap) and add
# 'snp_clean': the column name with its final '_'-separated suffix removed
shap_scoresS = mean_shap_feature_values.to_frame().reset_index()
shap_scoresS = shap_scoresS.rename(columns={'index':'snp', 0:'shap'})
shap_scoresS['snp_clean'] = shap_scoresS['snp'].str.rsplit('_', n=1).str[0]

def replace_with_chr_bp(row):
    """Translate an rs identifier into a 'CHR_BP' string using ``chosen_snps``.

    ``row`` is a single SNP name. Names that do not start with 'rs', and rs
    identifiers with no entry in ``chosen_snps``, are returned unchanged. When several
    entries match, the first one is used.
    """
    if not row.startswith('rs'):
        return row
    hits = chosen_snps[chosen_snps['SNP'] == row]
    if hits.empty:
        return row
    first_hit = hits.iloc[0]
    return f"{first_hit['CHR']}_{first_hit['BP']}"
shap_scoresS['snp_clean'] = shap_scoresS['snp_clean'].apply(replace_with_chr_bp)

# Split the SNP SHAP values by sex and plot the ten most contributory features.
# SEX is taken positionally from testP, which assumes testP and testS list the same
# individuals in the same order - TODO confirm.
pos_class = shap_valuesS[:,:,1]
shap_df = pd.DataFrame(pos_class, columns=X_test_S.columns)
shap_df['sex'] = testP['SEX'].values

sex_codes = shap_df['sex']
shap_female = shap_df.loc[sex_codes == 2].drop(columns='sex')
shap_male = shap_df.loc[sex_codes == 1].drop(columns='sex')

# Mean absolute SHAP value per SNP within each sex
mean_shap_female = shap_female.abs().mean()
mean_shap_male = shap_male.abs().mean()

shap_comparison = pd.DataFrame({
    'female': mean_shap_female,
    'male': mean_shap_male
})

# Features are ranked by their importance in males
top_n = 10
top_by_male = shap_comparison.sort_values(by='male', ascending=False).head(top_n)
top_by_male.plot(kind='bar', figsize=(10, 6))
plt.title(f'Top {top_n} Feature Importance Comparison (Female vs Male)')
plt.ylabel('Mean |SHAP Value|')
plt.show()

In [None]:
save = False
# Top 10% of SNPs by mean |SHAP|, computed separately for females and males
mean_shap_female = mean_shap_female.sort_values(ascending=False)
mean_shap_male = mean_shap_male.sort_values(ascending=False)
shap_female_top10 = mean_shap_female.iloc[:len(mean_shap_female) // 10]
shap_male_top10 = mean_shap_male.iloc[:len(mean_shap_male) // 10]

# Which SNPs are shared between the sexes and which are specific to one?
female_top_names = set(shap_female_top10.index)
male_top_names = set(shap_male_top10.index)
common_top10 = female_top_names & male_top_names
fem_only_top10 = female_top_names - male_top_names
mal_only_top10 = male_top_names - female_top_names
def top10_pos(top10, typ, snp_table=None):
    """Attach position information to a collection of SNP column names.

    Parameters
    ----------
    top10 : iterable of str
        SNP column names; the final '_'-separated suffix of each name is removed
        before matching.
    typ : str
        Label written to the 'Phenotype' column of every row.
    snp_table : pandas.DataFrame, optional
        Lookup table with an 'SNP' column. Defaults to the notebook-level
        ``chosen_snps`` table.

    Returns
    -------
    pandas.DataFrame
        One row per input SNP that is found in ``snp_table`` (inner join on 'SNP'),
        with columns 'SNP', 'Phenotype' and the remaining lookup columns.

    The names are sorted first because sets have no stable iteration order, which
    made the row order of the result vary between sessions. An empty input yields
    an empty frame instead of raising.
    """
    if snp_table is None:
        snp_table = chosen_snps
    top10 = pd.DataFrame({'SNP': pd.Series(sorted(top10), dtype='object')})
    top10['Phenotype'] = typ
    top10['SNP'] = top10['SNP'].str.rsplit('_', n=1).str[0]
    return top10.merge(snp_table, how='inner', on='SNP')
# Convert each name set into a positioned table (note: the set variables are replaced
# by DataFrames here), then stack them into one table with a fresh 0..n-1 index
common_top10 = top10_pos(common_top10, 'Shared')
fem_only_top10 = top10_pos(fem_only_top10, 'Female Only')
mal_only_top10 = top10_pos(mal_only_top10, 'Male Only')
all_top10_shap = pd.concat(
    [fem_only_top10, mal_only_top10, common_top10],
    ignore_index=True,
)
if save:
    all_top10_shap.to_csv('top10percent_snps_from_model.txt', sep = '\t', index=False)
##Phenogram made from this using https://visualization.ritchielab.org/phenograms/create

From here on, the analyses were run only for the balanced model (`balanced = True`).

In [None]:
# Welch's t-test (unequal variances) on |SHAP| of rs58741612_A, females vs males
snp_of_interest = 'rs58741612_A'
female_abs = shap_female[snp_of_interest].abs()
male_abs = shap_male[snp_of_interest].abs()
t_stat, p_value = ttest_ind(female_abs, male_abs, equal_var=False)
print(f"T-test p-value: {p_value}")

In [None]:
# Identify SNPs in chr16p11 then check for sig locus
# Each SNP is looked up by the prefix of its column name (first match is used)
locus_prefixes = ['rs73545273', '16_34145852', '16_34120022']
snp1, snp2, snp3 = [
    [col for col in shap_female.columns if col.startswith(prefix)][0]
    for prefix in locus_prefixes
]

# Absolute value of the summed SHAP contribution of the three SNPs, per individual
fem_sum = (shap_female[snp1] + shap_female[snp2] + shap_female[snp3]).abs()
mal_sum = (shap_male[snp1] + shap_male[snp2] + shap_male[snp3]).abs()

# is there a difference in contribution for males vs females?
t_stat, p_value = ttest_ind(fem_sum, mal_sum, equal_var=False)
print(f"T-test p-value: {p_value}")

Analysing gene mutation rates

In [None]:
def sig_genes(genes_aut, genes_neuro):
    """Compare every gene column between two cohorts with a Mann-Whitney U test.

    ``genes_aut`` and ``genes_neuro`` are DataFrames with one column per gene; the
    columns of ``genes_aut`` decide which genes are tested. Returns a DataFrame
    indexed by gene with the columns 'statistic', 'p_val', 'p_val_corr'
    (Benjamini-Hochberg corrected p-value) and 'sig_corr' (the rejection flag
    returned by the correction).
    """
    def _test_gene(gene):
        stat, p_value = mannwhitneyu(genes_aut[gene], genes_neuro[gene], alternative='two-sided')
        return {"statistic": stat, "p_val": p_value}

    mwu = {gene: _test_gene(gene) for gene in genes_aut.columns}
    mwu_res = pd.DataFrame(mwu).T

    # multipletests returns (reject flags, corrected p-values, ...)
    correction = multipletests(mwu_res['p_val'], method='fdr_bh')
    mwu_res["p_val_corr"] = correction[1]
    mwu_res["sig_corr"] = correction[0]
    return mwu_res

def loeuf_score_calc(aut_genes, neuro_genes):
    """Split genes into significant / non-significant and attach their LOEUF scores.

    Runs ``sig_genes`` on the two cohorts, prints the number of significant genes,
    then left-joins the 'oe_lof_upper' and 'chromosome' columns of the notebook-level
    ``loeuf_scores`` table onto each group and drops genes without an 'oe_lof_upper'
    value. Returns ``(signif_genes, insignif_genes)``.
    """
    all_mwu_res = sig_genes(aut_genes, neuro_genes)
    loeuf_cols = loeuf_scores[['oe_lof_upper','chromosome']]

    def _with_loeuf(group):
        annotated = group.join(loeuf_cols, how='left')
        return annotated[annotated['oe_lof_upper'].notna()]

    signif_genes = all_mwu_res[all_mwu_res['sig_corr']==True]
    insignif_genes = all_mwu_res[all_mwu_res['sig_corr']==False]
    print(len(signif_genes))  # count before genes without a LOEUF score are removed

    return _with_loeuf(signif_genes), _with_loeuf(insignif_genes)

In [None]:
# LOEUF scores downloaded from gnomad
# The .bgz file is opened through the gzip module in text mode and indexed by gene
file_path = 'FinalData/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz'
with gzip.open(file_path, mode='rt') as f:
    loeuf_scores = pd.read_csv(f, sep='\t').set_index(['gene'])

In [None]:
# including X-chrom, get all the gene info from both datasets
illuminagenes = pd.read_csv('FinalData/Illumina_SFARI_mutation_matrix.txt', sep=',')
CG_genes = pd.read_csv('FinalData/SFARIgene_mutation_matrix.txt', sep=',').drop(columns=['Unnamed: 0'])
# Stack the two cohorts on their shared columns, with a fresh 0..n-1 index
both_genes = pd.concat([illuminagenes, CG_genes], join='inner', ignore_index=True)

# PHENOTYPE uses the raw coding here: 2 is treated as autistic, 1 as neurotypical
all_genes_aut = both_genes[both_genes['PHENOTYPE']==2]
all_genes_neuro = both_genes[both_genes['PHENOTYPE']==1]

# One gene-only matrix per phenotype x sex group (SEX: 2 = female, 1 = male per the variable names)
non_gene_cols = ['PHENOTYPE', 'SEX', 'IID']
all_genes_aut_fem = all_genes_aut[all_genes_aut['SEX']==2].drop(columns=non_gene_cols)
all_genes_aut_mal = all_genes_aut[all_genes_aut['SEX']==1].drop(columns=non_gene_cols)
all_genes_neuro_fem = all_genes_neuro[all_genes_neuro['SEX']==2].drop(columns=non_gene_cols)
all_genes_neuro_mal = all_genes_neuro[all_genes_neuro['SEX']==1].drop(columns=non_gene_cols)
matrices = [all_genes_aut_fem, all_genes_aut_mal, all_genes_neuro_fem, all_genes_neuro_mal]

# Significant / non-significant genes (with LOEUF scores) within each sex
mal_sig_genes, mal_insig_genes = loeuf_score_calc(all_genes_aut_mal, all_genes_neuro_mal)
fem_sig_genes, fem_insig_genes = loeuf_score_calc(all_genes_aut_fem, all_genes_neuro_fem)

In [None]:
# plot a violin plot of the distribution of genes that are significant (male or female) and those that aren't
def loeuf_plot(signif_genes, insignif_genes, typ):
    """Plot and test the LOEUF scores of significant vs. non-significant genes.

    Parameters
    ----------
    signif_genes, insignif_genes : pandas.DataFrame
        Gene tables with an 'oe_lof_upper' column. They are not modified: the group
        label is added to copies, so the caller's frames do not silently gain a
        'group' column (and no SettingWithCopyWarning is raised for filtered inputs).
    typ : str
        Suffix for the plot title (e.g. 'male').

    Returns
    -------
    float
        Two-sided Mann-Whitney U p-value comparing the two groups' LOEUF scores.
    """
    stat, pval_loeuf = mannwhitneyu(signif_genes['oe_lof_upper'], insignif_genes['oe_lof_upper'], alternative='two-sided')

    genes_loeuf = pd.concat([
        insignif_genes.assign(group='Insignificant genes'),
        signif_genes.assign(group='Significant genes'),
    ])

    # Violin for the distribution, with a narrow unfilled box plot drawn on top
    sns.violinplot(x='group', y='oe_lof_upper', data=genes_loeuf, inner='box')
    sns.boxplot(x='group', y='oe_lof_upper', data=genes_loeuf, width=0.1, showcaps=True,
            boxprops={'facecolor':'none', 'edgecolor':'black'},
            whiskerprops={'color':'black', 'linewidth':2},
            capprops={'color':'black', 'linewidth':2},
            medianprops={'color':'black', 'linewidth':2},
            saturation=0.75)
    plt.title('LOEUF scores of SFARI genes' + ' - ' + typ)
    plt.xlabel('Genes')
    plt.ylabel('LOEUF scores')
    plt.show()
    return pval_loeuf

In [None]:
# Keep both p-values: a notebook cell only displays its last expression, so the male
# result used to be computed and then thrown away.
loeuf_pval_male = loeuf_plot(mal_sig_genes, mal_insig_genes, 'male')
loeuf_pval_female = loeuf_plot(fem_sig_genes, fem_insig_genes, 'female')
print(f"LOEUF Mann-Whitney U p-value (male): {loeuf_pval_male}")
print(f"LOEUF Mann-Whitney U p-value (female): {loeuf_pval_female}")

In [None]:
# find which genes are both male and female or just one
fm_gene_overlap = set(fem_sig_genes.index.values.tolist()) & set(mal_sig_genes.index.values.tolist())
fm_only_gene = set(fem_sig_genes.index.values.tolist()) - set(mal_sig_genes.index.values.tolist())
ml_only_gene = set(mal_sig_genes.index.values.tolist()) - set(fem_sig_genes.index.values.tolist())
both_genes = fem_sig_genes[~fem_sig_genes.index.isin(fm_only_gene)]
mal_only_genes = mal_sig_genes.loc[list(ml_only_gene)]
fem_only_genes = fem_sig_genes.loc[list(fm_only_gene)]

In [None]:
# save the significant genes for ABAEnrichment analysis in R
# Each file contains one gene name per line (the table's index), without a header.
save = False
if save:
    fem_sig_genes.index.to_series().to_csv('ABA_enrichment/female_sig_SFARI_genes.txt', index=False, header=False)
    mal_sig_genes.index.to_series().to_csv('ABA_enrichment/male_sig_SFARI_genes.txt', index=False, header=False)
    # NOTE(review): `all_sig_genes` is not defined anywhere in the visible notebook, so
    # this line raises NameError when save is True unless it is created elsewhere.
    # TODO: define it (e.g. from a test on both sexes combined) or drop this export.
    all_sig_genes.index.to_series().to_csv('ABA_enrichment/all_sig_SFARI_genes.txt', index=False, header=False)

In [None]:
# SHAP values of the gene random forest on its test split, summarised as the mean
# absolute value per gene (last-axis index 1), most influential first
X_test_G = testG.drop(columns=['PHENOTYPE'])
explainerG = shap.Explainer(tuned_rf)
shap_valuesG = explainerG.shap_values(X_test_G)
shap_class_valuesG = shap_valuesG[:, :, 1]

mean_shap_feature_valuesG = (
    pd.DataFrame(shap_class_valuesG, columns=X_test_G.columns)
    .abs()
    .mean(axis=0)
    .sort_values(ascending=False)
    .reset_index()
)
mean_shap_feature_valuesG.columns = ['gene', 'SHAP']

In [None]:
# Compare the gene model's SHAP values between males and females.
# SEX is taken positionally from testP, which assumes testP and testG list the same
# individuals in the same order - TODO confirm.
pos_classG = shap_valuesG[:,:,1]
shapG = pd.DataFrame(pos_classG, columns=X_test_G.columns)
shapG['sex'] = testP['SEX'].values

shap_femaleG = shapG.loc[shapG['sex'] == 2].drop(columns='sex')
shap_maleG = shapG.loc[shapG['sex'] == 1].drop(columns='sex')

# Mean absolute SHAP value per gene within each sex (unsorted, indexed by gene)
mean_shap_femaleGenes = shap_femaleG.abs().mean()
mean_shap_maleGenes = shap_maleG.abs().mean()

# The same values as ranked two-column tables (gene, SHAP)
mean_shap_femaleG = mean_shap_femaleGenes.sort_values(ascending=False).reset_index()
mean_shap_femaleG.columns = ['gene', 'SHAP']

mean_shap_maleG = mean_shap_maleGenes.sort_values(ascending=False).reset_index()
mean_shap_maleG.columns = ['gene', 'SHAP']

#plot the top 5 contributing genes (ranked by their importance in males)
shap_comparisonG = pd.DataFrame({
    'female': mean_shap_femaleGenes,
    'male': mean_shap_maleGenes
})

top_n = 5
top_genes_by_male = shap_comparisonG.sort_values(by='male', ascending=False).head(top_n)
top_genes_by_male.plot(kind='bar', figsize=(10, 6))
plt.title(f'Top {top_n} Contributing Genes (Female vs Male)')
plt.ylabel('Mean abs(SHAP Value)')
save = False
if save:
    plt.savefig('top_genes_balanced.png')
plt.show()