In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import anndata 
import my_describes

In [2]:
sum_EGAD = pd.read_csv('../data/EAGD/EGAD_sum.csv')
bulk_EGAD = pd.read_csv('../data/EAGD/EGAD_bulk.csv')
avg_EGAD = pd.read_csv('../data/EAGD/EGAD.csv')

In [3]:
sum_EGAD

Unnamed: 0,auc,avg_node_degree,degree_null_auc
GO:0002250,0.859971,5307.379533,0.216926
GO:0045087,0.694435,7381.768526,0.408815
GO:0071918,0.455685,5677.674758,0.234813
GO:0006955,0.756960,6516.232412,0.307746
GO:0050830,0.729795,5711.458055,0.270138
...,...,...,...
GO:1902983,0.500000,8849.766000,0.473873
GO:0033316,0.500000,7707.735000,0.390301
GO:0031445,0.500000,10482.470000,0.634693
GO:0031282,0.500000,10126.800000,0.578442


In [4]:
bulk_EGAD

Unnamed: 0,auc,avg_node_degree,degree_null_auc
GO:0002250,0.799935,494.519817,0.238547
GO:0045087,0.670182,893.203658,0.379591
GO:0071918,0.823148,76.504642,0.144572
GO:0006955,0.754855,536.131997,0.249467
GO:0050830,0.721034,501.303302,0.290244
...,...,...,...
GO:1902983,,,
GO:0033316,0.500000,1424.382744,0.466725
GO:0031445,,,
GO:0031282,0.500000,2858.328857,0.909190


In [5]:
avg_EGAD

Unnamed: 0,auc,avg_node_degree,degree_null_auc
GO:0002250,0.698722,-36.622420,0.337597
GO:0045087,0.572079,4.684482,0.470169
GO:0071918,0.535595,-83.280470,0.421289
GO:0006955,0.642805,-21.098645,0.404564
GO:0050830,0.649765,3.562382,0.342652
...,...,...,...
GO:1902983,0.500000,205.923307,0.718325
GO:0033316,0.500000,7.085241,0.036387
GO:0031445,0.500000,-213.266224,0.733596
GO:0031282,0.500000,41.515715,0.188797


# Filter for top 10% GO annotations

In [6]:
def filter_by_percentile(df:pd.DataFrame, by:str, percentage:float, ascending:bool):

    n_row = len(df)

    return df.sort_values(by = by, ascending = ascending).head(int(n_row*percentage))



In [7]:
sum_top_GO = filter_by_percentile(sum_EGAD, by = 'auc', percentage=0.1, ascending=False)

In [8]:
sum_top_GO

Unnamed: 0,auc,avg_node_degree,degree_null_auc
GO:0050817,0.999901,1901.794000,0.012945
GO:0044278,0.999483,1867.432000,0.014289
GO:0072272,0.998986,5275.135000,0.189733
GO:0030221,0.998827,2609.022000,0.050259
GO:0070640,0.998707,3252.896000,0.062250
...,...,...,...
GO:0034089,0.867589,10532.870000,0.791510
GO:0090179,0.867588,7896.030000,0.469490
GO:0060160,0.867538,5189.463000,0.190709
GO:0062043,0.867518,8405.508000,0.443861


In [9]:
bulk_top_GO = filter_by_percentile(bulk_EGAD, by = 'auc', percentage=0.1, ascending=False)

In [10]:
bulk_top_GO

Unnamed: 0,auc,avg_node_degree,degree_null_auc
GO:0048627,0.998616,-3.331421,0.002077
GO:0055096,0.995155,59.076677,0.045574
GO:0001661,0.992350,13.711224,0.028743
GO:0035434,0.992058,18.947792,0.025082
GO:1905704,0.991913,66.507726,0.026284
...,...,...,...
GO:0007035,0.774930,694.754570,0.359726
GO:2001045,0.774681,2212.761897,0.729508
GO:0010739,0.774637,688.331265,0.262027
GO:0018057,0.774486,1580.598302,0.525552


In [11]:
avg_top_GO = filter_by_percentile(avg_EGAD, by = 'auc', percentage=0.1, ascending=False)

In [12]:
avg_top_GO

Unnamed: 0,auc,avg_node_degree,degree_null_auc
GO:0070476,0.997992,1.521736,0.009455
GO:0030573,0.996838,9.090693,0.044741
GO:0060621,0.996600,-4.832343,0.047605
GO:0008315,0.995466,-2.246772,0.050468
GO:0034370,0.995188,-2.141337,0.035495
...,...,...,...
GO:0072673,0.775550,-7.953564,0.471587
GO:0044806,0.775401,101.669619,0.466984
GO:0007494,0.775397,117.012913,0.440711
GO:1904708,0.775319,180.555845,0.499463


# Retieve GO Terms

In [13]:
sum_GO = np.array(sum_top_GO.index)

In [14]:
sum_GO

array(['GO:0050817', 'GO:0044278', 'GO:0072272', ..., 'GO:0060160',
       'GO:0062043', 'GO:0060350'], dtype=object)

In [15]:
bulk_GO = np.array(bulk_top_GO.index)

In [16]:
bulk_GO

array(['GO:0048627', 'GO:0055096', 'GO:0001661', ..., 'GO:0010739',
       'GO:0018057', 'GO:0060336'], dtype=object)

In [17]:
avg_GO = np.array(avg_top_GO.index)

In [18]:
avg_GO

array(['GO:0070476', 'GO:0030573', 'GO:0060621', ..., 'GO:0007494',
       'GO:1904708', 'GO:0051884'], dtype=object)

# Check for overlap


In [19]:
def get_overlap(array1, array2):
    """
    Returns the terms shared among array1 array2.
    
    Like an inner join for arrays
    """
    return list(set(array1).intersection(array2))

In [20]:
avg_sum_overlap = get_overlap(avg_GO, sum_GO)

In [21]:
sum_bulk_overlap = get_overlap(sum_GO, bulk_GO)

In [22]:
bulk_avg_overlap = get_overlap(bulk_GO, avg_GO)

In [23]:
print(len(avg_sum_overlap))
print(len(sum_bulk_overlap))
print(len(bulk_avg_overlap))


594
340
355


# Correlations


## put aucs in df

In [53]:
auc_df = pd.DataFrame(sum_EGAD.loc[:,'auc']).rename(columns={'auc':'sum_auc'})

In [54]:
auc_df

Unnamed: 0,sum_auc
GO:0002250,0.859971
GO:0045087,0.694435
GO:0071918,0.455685
GO:0006955,0.756960
GO:0050830,0.729795
...,...
GO:1902983,0.500000
GO:0033316,0.500000
GO:0031445,0.500000
GO:0031282,0.500000


In [58]:
auc_df = auc_df.join(pd.DataFrame(avg_EGAD.loc[:,'auc'])).rename(columns={'auc':'avg_auc'})

In [60]:
auc_df = auc_df.join(pd.DataFrame(bulk_EGAD.loc[:,'auc'])).rename(columns={'auc':'bulk_auc'})

## Remove NANs

In [35]:
def remove_nans(array:np.array):
    return array[~np.isnan(array)]

In [61]:
row = auc_df.iloc[0,:]

In [None]:
def remove_nan_rows(auc_df:pd.DataFrame):
    for i, row in enumerate(auc_df):
        print(i)
        print(row)

In [65]:
sum(row.isna()) >  0

False

In [36]:
len(sum_auc)

12535

In [38]:
sum_auc = sum_auc[~np.isnan(sum_auc)]
len(sum_auc)

12464

In [40]:
def remove_nans(array:np.array):
    return array[~np.isnan(array)]

In [41]:
test = remove_nans(sum_auc)

In [42]:
test

GO:0002250    0.859971
GO:0045087    0.694435
GO:0071918    0.455685
GO:0006955    0.756960
GO:0050830    0.729795
                ...   
GO:1902983    0.500000
GO:0033316    0.500000
GO:0031445    0.500000
GO:0031282    0.500000
GO:0009396    0.500000
Name: auc, Length: 12464, dtype: float64

In [25]:
np.corrcoef(sum_EGAD.loc[:,'auc'], avg_EGAD.loc[:,'auc'], )

array([[nan, nan],
       [nan, nan]])

In [27]:
np.corrcoef(sum_EGAD.loc[:,'auc']

nan