In [9]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Start (or reuse) a Spark session running locally on all available cores.
spark = SparkSession.builder.master('local[*]').getOrCreate()

# Directory holding the molecule dataset in parquet format.
drugfile = '/Users/dsuveges/project_data/drugs/molecule/'

# Load the dataset and preview the first two records, one field per line,
# without truncating long values.
drug_df = spark.read.parquet(drugfile)
drug_df.show(vertical=True, truncate=False, n=2)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [27]:
# Keep the identifying columns and lift the PubChem and drugbank entries of
# the crossReferences map into columns of their own.
parsed_molecules = drug_df.select(
    'id',
    'drugType',
    'synonyms',
    'crossReferences.PubChem',
    'crossReferences.drugbank',
)

# A molecule counts as linked when the corresponding map entry is not null.
print(f'Number of molecules: {parsed_molecules.count()}')
print(f'Number of molecules with PubChem link: {parsed_molecules.where(F.col("PubChem").isNotNull()).count()}')
print(f'Number of molecules with drugbank link: {parsed_molecules.where(F.col("drugbank").isNotNull()).count()}')

Number of molecules: 13076
Number of molecules with PubChem link: 3951
Number of molecules with drugbank link: 7618


In [28]:
# Print the schema of the molecule dataset (column names, types, nullability).
drug_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- canonicalSmiles: string (nullable = true)
 |-- inchiKey: string (nullable = true)
 |-- drugType: string (nullable = true)
 |-- name: string (nullable = true)
 |-- yearOfFirstApproval: long (nullable = true)
 |-- maximumClinicalTrialPhase: long (nullable = true)
 |-- parentId: string (nullable = true)
 |-- hasBeenWithdrawn: boolean (nullable = true)
 |-- isApproved: boolean (nullable = true)
 |-- withdrawnNotice: struct (nullable = true)
 |    |-- countries: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- classes: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- year: long (nullable = true)
 |-- tradeNames: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- synonyms: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- crossReferences: map (nullable = true)
 |    |-- key: string
 |    |-- value: array (valueContainsNul

In [50]:
# Flatten the crossReferences map in two steps so that every output row holds
# one (molecule, source, identifier) combination.
molecule_cross_refs = (
    drug_df

    # Step 1: one row per map entry -- the key becomes `xref_source`, the
    # value (an array of identifiers) becomes `xref_id`.
    .select(
        'id', 'drugType', 'synonyms',
        F.explode('crossReferences').alias('xref_source', 'xref_id'),
    )

    # Step 2: one row per element of the identifier array. The exploded column
    # is not aliased, so it carries Spark's default name `col`.
    .select(
        'id', 'drugType', 'synonyms', 'xref_source',
        F.explode('xref_id'),
    )
)

In [51]:
# Number of rows in the fully exploded cross-reference table.
molecule_cross_refs.count()

30808

In [52]:
# Number of distinct molecule ids that appear in the exploded cross-reference table.
molecule_cross_refs.select('id').distinct().count()

9567

In [54]:
# Number of distinct molecule ids in the unexploded molecule dataset, for
# comparison with the distinct-id count of the exploded table.
drug_df.select('id').distinct().count()

13076

In [56]:
import pandas as pd 
# Load the space-separated `9606.protein.links.full.v11.0` table; the gzip
# compression is inferred from the file extension.
string_full = pd.read_csv(
    '/Users/dsuveges/project_data/interactions/string_input/9606.protein.links.full.v11.0.txt.gz',
    sep=' ',
    compression='infer',
)
string_full.head()


Unnamed: 0,protein1,protein2,neighborhood,neighborhood_transferred,fusion,cooccurence,homology,coexpression,coexpression_transferred,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score
0,9606.ENSP00000000233,9606.ENSP00000272298,0,0,0,332,0,0,62,0,181,0,0,0,125,490
1,9606.ENSP00000000233,9606.ENSP00000253401,0,0,0,0,0,0,0,0,186,0,0,0,56,198
2,9606.ENSP00000000233,9606.ENSP00000401445,0,0,0,0,0,0,0,0,160,0,0,0,0,159
3,9606.ENSP00000000233,9606.ENSP00000418915,0,0,0,0,0,0,61,0,158,0,0,542,0,606
4,9606.ENSP00000000233,9606.ENSP00000327801,0,0,0,0,0,69,61,0,78,0,0,0,89,167


In [60]:
# Keep the 100 rows with the highest combined score, best first.
string_top = (
    string_full
    .sort_values(by='combined_score', ascending=False)
    .head(100)
)
string_top.head()

Unnamed: 0,protein1,protein2,neighborhood,neighborhood_transferred,fusion,cooccurence,homology,coexpression,coexpression_transferred,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score
438799,9606.ENSP00000221265,9606.ENSP00000355013,0,0,0,0,0,102,262,893,797,900,0,470,622,999
8984196,9606.ENSP00000381840,9606.ENSP00000316578,0,0,0,0,0,0,107,970,83,900,0,709,65,999
8413679,9606.ENSP00000375942,9606.ENSP00000480158,0,0,0,0,0,0,64,930,731,900,0,649,500,999
10452741,9606.ENSP00000429374,9606.ENSP00000449328,0,167,0,0,0,978,830,784,809,900,0,213,222,999
4345675,9606.ENSP00000317123,9606.ENSP00000392094,0,0,0,0,0,323,335,998,374,900,0,836,624,999


In [85]:
# Show the record of one specific protein pair as indented JSON.
print(
    string_full[
        string_full['protein1'].eq('9606.ENSP00000355013')
        & string_full['protein2'].eq('9606.ENSP00000221265')
    ].to_json(orient='records', lines=True, indent=2)
)


  {
    "protein1":"9606.ENSP00000355013",
    "protein2":"9606.ENSP00000221265",
    "neighborhood":0,
    "neighborhood_transferred":0,
    "fusion":0,
    "cooccurence":0,
    "homology":0,
    "coexpression":102,
    "coexpression_transferred":262,
    "experiments":893,
    "experiments_transferred":797,
    "database":900,
    "database_transferred":0,
    "textmining":470,
    "textmining_transferred":622,
    "combined_score":999
  }



In [86]:
# Reference note kept as a bare string literal; the notebook only echoes it.
# The lower block lists the per-channel values obtained by combining each
# direct score with its `_transferred` counterpart for the example pair.
# The upper block presumably holds the scores shown by the STRING web
# interface for the same pair -- TODO confirm the source.
'''
* coexpression: 0.308
* Experiments:  0.977
* Databases:    0.900
* Textmining:   0.791
* Combined score: 0.999
    
neighborhood => 0
coexpression => 308
experiments  => 977
database     => 900
textmining   => 791
'''

'\n* coexpression: 0.308\n* Experiments: 0.977\n* Databases: 0.900\n* Textmining: 0.791\n* Combined score: 0.999\n    \nneighborhood => 0\ncoexpression => 309\nexperiments => 977\ndatabase => 900\ntextmining => 791\n'

In [82]:
# Example pair of channel scores on the 0-1000 scale, passed to
# bayesian_addition at the end of this cell.
coexpression = 610
coexpression_transferred = 630

# Same value as the default `prior` of bayesian_addition; not referenced by
# name in any of the visible cells.
p = 0.041

# Combination of two scores x and y (as fractions) that share the prior p:
# c = (p+x*y -y - x)/(p-1)

# c


def bayesian_addition(prob1, prob2, prior=0.041):
    '''
    Combine two scores that both contain the same prior probability.

    The scores are expected on the 0-1000 scale (thousandths of a
    probability) and the result is returned on that scale as well. With p1
    and p2 the scores as fractions, the combined value is

        (prior + p1 * p2 - p1 - p2) / (prior - 1)

    which is algebraically the same as removing the prior from each score,
    combining the remainders as 1 - (1 - a) * (1 - b), and adding the prior
    back once.

    Args:
        prob1: first score, 0-1000.
        prob2: second score, 0-1000.
        prior: prior probability as a fraction (must differ from 1),
            contained in both scores.

    Returns:
        The other score unchanged when one score is 0; the larger score
        unchanged when either score is at or below the prior; otherwise the
        combined score truncated to an int.
    '''

    # No evidence on one side: the other score is the answer.
    if prob1 == 0:
        return prob2

    if prob2 == 0:
        return prob1

    # A non-zero score at or below the prior carries no evidence of its own.
    # Putting it through the formula would drag the result *below* the other
    # score, so the stronger input is kept as is.
    if min(prob1, prob2) <= prior * 1000:
        return max(prob1, prob2)

    # Switch from the 0-1000 scale to fractions for the actual combination.
    frac1 = prob1 / 1000
    frac2 = prob2 / 1000

    summarized_probability = (prior + frac1 * frac2 - frac2 - frac1) / (prior - 1)

    # Truncate (not round) when going back to the 0-1000 scale.
    return int(summarized_probability * 1000)

# Try the function on the two example coexpression scores defined at the top of this cell.
bayesian_addition(coexpression, coexpression_transferred)

849

In [87]:
# Direct and transferred channel scores; same values as the record printed
# earlier for 9606.ENSP00000355013 / 9606.ENSP00000221265.
scores_dict = {
    "neighborhood": 0, "neighborhood_transferred": 0,
    "coexpression": 102, "coexpression_transferred": 262,
    "experiments": 893, "experiments_transferred": 797,
    "database": 900, "database_transferred": 0,
    "textmining": 470, "textmining_transferred": 622,
}

# Merge every channel with its `_transferred` counterpart and report the result.
for channel in ('neighborhood', 'coexpression', 'experiments', 'database', 'textmining'):
    direct_score = scores_dict[channel]
    transferred_score = scores_dict[f'{channel}_transferred']
    print(f'{channel} => {bayesian_addition(direct_score, transferred_score)}')

neighborhood => 0
coexpression => 308
experiments => 977
database => 900
textmining => 791


In [1]:
# NOTE(review): stand-alone arithmetic; nothing in the visible cells uses the
# result -- consider removing this cell.
1000 / 60

16.666666666666668