<a href="https://colab.research.google.com/github/EdH66/CMAP-Analysis/blob/main/CMAP_Query_Ranking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.metrics import jaccard_score
import numpy as np


In [None]:
# Read the CSV file
url = '/content/drive/My Drive/Colab Notebooks/CMAP Drug Prediction/CMAP_Jaquard_test.csv'  # Replace with the URL of your CSV file
df = pd.read_csv(url)



In [9]:
print(df.columns)
# Print the first few rows of the DataFrame
print("First few rows of the DataFrame:")
print(df.head())

# Print the number of non-NA/null entries in each column
print("\nNumber of non-NA/null entries in each column:")
print(df.count())


Index(['8 weeks', 'NCS', '12 weeks', 'NCS.1', '16 weeks', 'NCS.2'], dtype='object')
First few rows of the DataFrame:
           8 weeks       NCS     12 weeks     NCS.1     16 weeks     NCS.2
0        alisertib -1.897671          IL1 -1.983262  BAY-87-2243 -1.942013
1          XMD-132 -1.826331   GALR2_GALP -1.919355    estradiol -1.931010
2  ketocholesterol -1.819495   entinostat -1.919287   aniracetam -1.879556
3      ARRY-334543 -1.804553  importazole -1.899100   endo-IWR-1 -1.877958
4       SA-1459830 -1.794416  clofarabine -1.893594      fenigam -1.864297

Number of non-NA/null entries in each column:
8 weeks      3866
NCS          3866
12 weeks    16192
NCS.1       16192
16 weeks      590
NCS.2         590
dtype: int64


In [30]:
import pandas as pd

# Assuming 'df' is your DataFrame
# Define the list of compounds
compounds_list = ['rosiglitazone', 'fluoxetine', 'alisertib', 'tomelukast', 'celastrol',
                  'pioglitazone', 'TWS-119', 'barasertib', 'clenbuterol', 'withaferin']

# Define the list columns
list_cols = ['8 weeks', '12 weeks', '16 weeks']

# Initialize an empty dictionary to store the data
compound_presence = {compound: {week: 0 for week in list_cols} for compound in compounds_list}

# Populate the dictionary with counts
for compound in compounds_list:
    for week in list_cols:
        compound_presence[compound][week] = df[week].tolist().count(compound)

# Convert the dictionary to a DataFrame for better visualization
presence_df = pd.DataFrame(compound_presence).T  # Transpose to get compounds as rows

# Display the table
print(presence_df)


               8 weeks  12 weeks  16 weeks
rosiglitazone        5         7         1
fluoxetine           1        10         1
alisertib            6        14         1
tomelukast           2         3         0
celastrol            3         8         0
pioglitazone         0         6         0
TWS-119              7         2         0
barasertib           1         8         0
clenbuterol          4         1         0
withaferin           0         0         0


Ranking methods for all three lists

In [28]:
def rank_compounds_two_lists(df, list_cols, score_cols):
    # Identify compounds present in at least two of the lists
    compounds_in_two_lists = set()
    for i, col1 in enumerate(list_cols):
        for col2 in list_cols[i+1:]:
            # Create a set for each pairwise intersection
            intersection = set(df[col1].dropna()).intersection(set(df[col2].dropna()))
            compounds_in_two_lists.update(intersection)

    # Calculate average scores for these compounds
    compound_scores = {compound: 0 for compound in compounds_in_two_lists}
    count_scores = {compound: 0 for compound in compounds_in_two_lists}  # to keep count of the scores
    for compound in compounds_in_two_lists:
        for col, score_col in zip(list_cols, score_cols):
            if compound in df[col].values:
                # Find the row index where the compound is located in the column
                row_index = df[df[col] == compound].index[0]
                # Add the score to the total score for the compound
                compound_scores[compound] += df.at[row_index, score_col]
                count_scores[compound] += 1

    # Calculate average by dividing total score by count for each compound
    for compound in compound_scores:
        if count_scores[compound] > 0:  # Avoid division by zero
            compound_scores[compound] /= count_scores[compound]

    sorted_compounds = sorted(compound_scores.items(), key=lambda x: x[1], reverse=False)[:50]

    return sorted_compounds

# Example usage
list_cols = ['8 weeks', '12 weeks', '16 weeks']
score_cols = ['NCS', 'NCS.1', 'NCS.2']
ranked_compounds = rank_compounds_two_lists(df, list_cols, score_cols)

# Display the number of entries
print(f"Number of entries in the ranked list: {len(ranked_compounds)}")

# Display the top compounds
print("Top ranked compounds present in at least two lists:")
for rank, (compound, avg_score) in enumerate(ranked_compounds, start=1):
    print(f"{rank}: {compound}, {avg_score}")



Number of entries in the ranked list: 100
Top ranked compounds present in at least two lists:
1: clofarabine, -1.740156531
2: AGK-2, -1.728138328
3: estradiol, -1.7230899333333332
4: phentermine, -1.7159402965
5: iniparib, -1.7138214115000001
6: enalapril, -1.70603323
7: importazole, -1.7032221950000002
8: cyanocobalamin, -1.6981638669999999
9: idarubicin, -1.691242814
10: withaferin-a, -1.688018918
11: flutamide, -1.6847783925000002
12: tacrolimus, -1.6843737365
13: IL1, -1.6791766285
14: arofylline, -1.677497685
15: mitoxantrone, -1.6701805594999999
16: parbendazole, -1.669451356
17: 4-NITROPYRENE, -1.6664857865
18: quizartinib, -1.666192651
19: PF-04217903, -1.6627842780000002
20: PD-0325901, -1.6626304783333332
21: YM-155, -1.6617856025
22: barasertib-HQPA, -1.6613382105
23: cimetidine, -1.6588407754999999
24: SN-38, -1.657617867
25: etodolac, -1.6574946045
26: SA-1459830, -1.6546087859999998
27: fdcyd, -1.6533697250000001
28: SB-218078, -1.65316844
29: mevastatin, -1.6529022455
30

In [29]:
def find_compound_ranks(df, list_cols, score_cols, compounds_to_find):
    # Identify compounds present in all lists
    common_compounds = set(df[list_cols[0]].dropna())
    for col in list_cols[1:]:
        common_compounds.intersection_update(set(df[col].dropna()))

    # Calculate average scores for these compounds
    compound_scores = {compound: 0 for compound in common_compounds}
    for compound in common_compounds:
        total_score = sum(df[df[col] == compound][score].iloc[0] for col, score in zip(list_cols, score_cols) if compound in df[col].values)
        compound_scores[compound] = total_score / len(list_cols)

    # Sort compounds by average score in ascending order
    sorted_compounds = sorted(compound_scores.items(), key=lambda x: x[1], reverse=False)

    # Find the ranks of specific compounds
    ranks = {}
    for compound in compounds_to_find:
        for rank, (comp, _) in enumerate(sorted_compounds, start=1):
            if comp == compound:
                ranks[compound] = rank
                break
        else:
            ranks[compound] = None  # Compound not found

    return sorted_compounds, ranks

# Example usage
list_cols = ['8 weeks', '12 weeks', '16 weeks']
score_cols = ['NCS', 'NCS.1', 'NCS.2']
compounds_to_find = ['rosiglitazone', 'fluoxetine', 'alisertib', 'tomelukast', 'celastrol', 'pioglitazone', 'TWS-119', 'barasertib', 'clenbuterol', 'withaferin']
ranked_compounds, ranks = find_compound_ranks(df, list_cols, score_cols, compounds_to_find)

# Display the ranks of specific compounds
print("Ranks of specific compounds:")
for compound, rank in ranks.items():
    print(f"{compound}: {rank}")


Ranks of specific compounds:
rosiglitazone: 60
fluoxetine: 73
alisertib: 14
tomelukast: None
celastrol: None
pioglitazone: None
TWS-119: None
barasertib: None
clenbuterol: None
withaferin: None


#Jacquard Analysis

In [None]:
# Function to calculate Jaccard similarity
def calculate_jaccard_similarity(series1, series2):
    set1 = set(series1.dropna())
    set2 = set(series2.dropna())
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    if not union:  # Prevent division by zero
        return np.nan
    return len(intersection) / len(union)



In [None]:


# Extract columns
col1 = df['8 weeks']
col3 = df['12 weeks']
col5 = df['16 weeks']

# Calculate Jaccard similarities
similarity_13 = calculate_jaccard_similarity(col1, col3)
similarity_15 = calculate_jaccard_similarity(col1, col5)
similarity_35 = calculate_jaccard_similarity(col3, col5)

# Print results
print(f"Jaccard Similarity between Week 8 and Week 12: {similarity_13}")
print(f"Jaccard Similarity between Week 8 and Week 16: {similarity_15}")
print(f"Jaccard Similarity between Week 12 and Week 16: {similarity_35}")

Jaccard Similarity between Week 8 and Week 12: 0.39485087942900843
Jaccard Similarity between Week 8 and Week 16: 0.16097809475292918
Jaccard Similarity between Week 12 and Week 16: 0.11102106969205834


Attempt at weighted similarity

In [None]:
def custom_similarity(df, col_drug1, col_ncs1, col_drug2, col_ncs2):
    # Ensure NCS columns are numeric
    df[col_ncs1] = pd.to_numeric(df[col_ncs1], errors='coerce')
    df[col_ncs2] = pd.to_numeric(df[col_ncs2], errors='coerce')

    # Drop rows where either drug name or NCS is NaN
    df = df.dropna(subset=[col_drug1, col_ncs1, col_drug2, col_ncs2])

    # Creating dictionaries to pair drugs with their NCS values
    drugs_at_time1 = dict(zip(df[col_drug1], df[col_ncs1]))
    drugs_at_time2 = dict(zip(df[col_drug2], df[col_ncs2]))

    # Identifying common and unique drugs
    common_drugs = set(drugs_at_time1.keys()).intersection(set(drugs_at_time2.keys()))
    all_drugs = set(drugs_at_time1.keys()).union(set(drugs_at_time2.keys()))

    # Calculating weighted intersection and union
    intersection = sum(min(drugs_at_time1.get(drug, 0), drugs_at_time2.get(drug, 0)) for drug in all_drugs)
    union = sum(max(drugs_at_time1.get(drug, 0), drugs_at_time2.get(drug, 0)) for drug in all_drugs)

    # Computing similarity
    return intersection / union if union != 0 else 0

# Example usage
similarity_8_12 = custom_similarity(df, '8 weeks', 'NCS', '12 weeks', 'NCS')
similarity_8_16 = custom_similarity(df, '8 weeks', 'NCS', '16 weeks', 'NCS')
similarity_12_16 = custom_similarity(df, '12 weeks', 'NCS', '16 weeks', 'NCS')

print(f"Similarity between Week 8 and Week 12: {similarity_8_12}")
print(f"Similarity between Week 8 and Week 16: {similarity_8_16}")
print(f"Similarity between Week 12 and Week 16: {similarity_12_16}")



Similarity between Week 8 and Week 12: 3.143496349250033
Similarity between Week 8 and Week 16: 7.7715833689922
Similarity between Week 12 and Week 16: 13.552498471211056


Unclear how to interpret these data and also expecting 0-1 range