In [1]:
import os
import json
import GEOparse
import pandas as pd
from openai import OpenAI

from prompts.column_description import system_message, dataset_columns_description_prompt, dataset_input
from utils import utils
from utils import openai_utils
import prompts
from importlib import reload

from dotenv import load_dotenv

load_dotenv()

True

In [2]:
reload(utils)
reload(openai_utils)
reload(prompts.column_description)

<module 'prompts.column_description' from '/home/bek/Desktop/iCog/semantic parsing/Bio_data_semantic_parsing/prompts/column_description.py'>

In [3]:
gse= GEOparse.get_GEO(geo="GSE12277", destdir="./data/GEO")

13-Mar-2025 11:28:42 DEBUG utils - Directory ./data/GEO already exists. Skipping.
13-Mar-2025 11:28:42 INFO GEOparse - File already exist: using local version.
13-Mar-2025 11:28:42 INFO GEOparse - Parsing ./data/GEO/GSE12277_family.soft.gz: 
13-Mar-2025 11:28:42 DEBUG GEOparse - DATABASE: GeoMiame
13-Mar-2025 11:28:42 DEBUG GEOparse - SERIES: GSE12277
13-Mar-2025 11:28:42 DEBUG GEOparse - PLATFORM: GPL570
  return read_csv(StringIO(data), index_col=None, sep="\t")
13-Mar-2025 11:28:47 DEBUG GEOparse - SAMPLE: GSM308237
13-Mar-2025 11:28:47 DEBUG GEOparse - SAMPLE: GSM308239
13-Mar-2025 11:28:47 DEBUG GEOparse - SAMPLE: GSM308241
13-Mar-2025 11:28:47 DEBUG GEOparse - SAMPLE: GSM308242
13-Mar-2025 11:28:48 DEBUG GEOparse - SAMPLE: GSM308244
13-Mar-2025 11:28:48 DEBUG GEOparse - SAMPLE: GSM308245
13-Mar-2025 11:28:48 DEBUG GEOparse - SAMPLE: GSM308246
13-Mar-2025 11:28:49 DEBUG GEOparse - SAMPLE: GSM308247
13-Mar-2025 11:28:49 DEBUG GEOparse - SAMPLE: GSM308254
13-Mar-2025 11:28:49 DEBUG 

### Column description generation for the table in GSMs

In [4]:
gsm= gse.gsms["GSM308246"]
data= gsm.table
data.head(-5)

Unnamed: 0,ID_REF,VALUE,ABS_CALL,DETECTION P-VALUE
0,AFFX-BioB-5_at,115.457000,P,0.000581
1,AFFX-BioB-M_at,119.770000,P,0.000044
2,AFFX-BioB-3_at,84.816300,P,0.000095
3,AFFX-BioC-5_at,296.351000,P,0.000052
4,AFFX-BioC-3_at,470.652000,P,0.000044
...,...,...,...,...
54665,1570631_at,12.766800,P,0.023926
54666,1570632_at,1.179860,A,0.780518
54667,1570633_at,0.159784,A,0.953857
54668,1570635_at,1.926470,A,0.753906


In [5]:
sample = data.sample(20)

In [6]:
df_string = sample.to_string()
print(df_string)


             ID_REF        VALUE ABS_CALL  DETECTION P-VALUE
28079     228212_at     2.061870        A           0.665527
38053     238191_at    40.369900        P           0.000244
33008     233142_at     8.085840        A           0.334473
50826    1561443_at     2.523220        A           0.533936
12302     212856_at   100.115000        P           0.014160
37788   237926_s_at     0.961517        A           0.432373
5811      206224_at     5.345680        A           0.334473
24360   224486_s_at    25.125700        P           0.037598
53985  1569454_a_at     1.260400        A           0.753906
5725    206138_s_at   187.482000        P           0.008057
14748   215313_x_at  2374.540000        P           0.000244
52712    1565689_at    86.047300        P           0.001221
31295     231428_at    11.025200        A           0.129639
32327     232460_at     3.171910        A           0.665527
3486      203898_at    57.353300        P           0.000244
49141    1558660_at     

In [7]:
dataset_description= "GEO is a public functional genomics data repository supporting MIAME-compliant data submissions. Array- and sequence-based data are accepted. Tools are provided to help users query and download experiments and curated gene expression profiles."

In [8]:
generation_params = {
    'temperature': 0.0,
    'max_tokens': 4096,
}

In [9]:
messages = [
    {'role': 'system', 'content': system_message},
    {'role': 'user', 'content': dataset_columns_description_prompt 
                                + dataset_input.format(dataset_description= dataset_description, dataset= df_string)}
]

In [10]:
response = openai_utils.openai_generate(messages=messages, **generation_params)
descriptions = utils.postprocess_response(response.choices[0].message.content)

In [11]:
descriptions

{'ID_REF': {'description': 'Unique identifier for each gene expression profile in the dataset.',
  'data_type': 'string'},
 'VALUE': {'description': 'Quantitative value representing the gene expression level.',
  'data_type': 'float'},
 'ABS_CALL': {'description': "Categorizes the gene expression as either 'Absent' (A) or 'Present' (P), indicating the detection of the gene in the sample.",
  'data_type': 'binary'},
 'DETECTION P-VALUE': {'description': 'Statistical measure that indicates the reliability of the gene detection. Lower values suggest higher reliability.',
  'data_type': 'float'}}

### FOL predicate generation for table entries

In [12]:
gsm_id = gsm.metadata.get("geo_accession", ["Unknown_GSE"])[0]
print(gsm_id)

GSM308246


Generate a unique ID for each row in a GSM

In [13]:

# Create a unique ID by combining row index and GSM ID
data.insert(0, "Unique_ID", data.index.astype(str) + "_" + gsm_id)

# Display the updated DataFrame
print(data.head())

     Unique_ID          ID_REF     VALUE ABS_CALL  DETECTION P-VALUE
0  0_GSM308246  AFFX-BioB-5_at  115.4570        P           0.000581
1  1_GSM308246  AFFX-BioB-M_at  119.7700        P           0.000044
2  2_GSM308246  AFFX-BioB-3_at   84.8163        P           0.000095
3  3_GSM308246  AFFX-BioC-5_at  296.3510        P           0.000052
4  4_GSM308246  AFFX-BioC-3_at  470.6520        P           0.000044


In [14]:
sample_data=data.sample(1000)
print(sample_data) 

             Unique_ID       ID_REF       VALUE ABS_CALL  DETECTION P-VALUE
53135  53135_GSM308246   1566624_at    0.050612        A           0.916260
45641  45641_GSM308246   1553441_at    0.197657        A           0.780518
16997  16997_GSM308246    217571_at    0.425002        A           0.932373
27130  27130_GSM308246    227262_at   23.011600        A           0.111572
51748  51748_GSM308246   1562972_at    0.667154        A           0.533936
...                ...          ...         ...      ...                ...
22215  22215_GSM308246     63009_at  137.339000        P           0.000219
20365  20365_GSM308246    220940_at  168.835000        P           0.000244
14367  14367_GSM308246  214931_s_at   14.485200        A           0.129639
26248  26248_GSM308246  226379_s_at   19.879400        A           0.095215
20769  20769_GSM308246    221345_at    1.733560        A           0.633789

[1000 rows x 5 columns]


In [15]:
columns = list(sample_data.columns)
print("Extracted Columns:", columns)


Extracted Columns: ['Unique_ID', 'ID_REF', 'VALUE', 'ABS_CALL', 'DETECTION P-VALUE']


In [16]:
prompt = f"""
    You are a data standardization expert. Convert the following dataset column names into standardized predicate names:
    {columns}
    Return a JSON dictionary where keys are original column names and values are standardized predicate names.
    """

In [17]:
messages =[
    {'role':'system', 'content':"You are an expert in data standardization."},
    {'role':'user', 'content':prompt}
]

In [18]:
response = openai_utils.openai_generate(messages=messages, **generation_params)
predicate_mapping = response.choices[0].message.content

In [19]:
print("Predicate Mapping:", predicate_mapping)

Predicate Mapping: {
    "Unique_ID": "uniqueId",
    "ID_REF": "idRef",
    "VALUE": "value",
    "ABS_CALL": "absCall",
    "DETECTION P-VALUE": "detectionPValue"
}


In [20]:
predicate_mapping = json.loads(predicate_mapping)

In [21]:
def generate_fol_predicates(row, predicate_mapping):
    """Convert a row of data into FOL predicates."""
    unique_id = row["Unique_ID"]  # Use Unique_ID as the main entity
    predicates = [f'GSM_Measurement({unique_id})']  # Main predicate

    # Convert each column into a predicate
    for col, predicate in predicate_mapping.items():
        value = row[col]
        formatted_value = value if pd.notna(value) else "null"

         # Exclude uniqueId(Unique_ID, Unique_ID)
        if predicate == "uniqueId" and formatted_value == unique_id:
            continue
        
        predicates.append(f'{predicate}({unique_id}, {formatted_value})')

    return ",\n    ".join(predicates) +","

In [22]:
# Generate predicates for each row
fol_predicates = [generate_fol_predicates(row, predicate_mapping) for _, row in sample_data.iterrows()]

# Print results
for pred in fol_predicates:
    print(pred)

GSM_Measurement(53135_GSM308246),
    idRef(53135_GSM308246, 1566624_at),
    value(53135_GSM308246, 0.0506116),
    absCall(53135_GSM308246, A),
    detectionPValue(53135_GSM308246, 0.91626),
GSM_Measurement(45641_GSM308246),
    idRef(45641_GSM308246, 1553441_at),
    value(45641_GSM308246, 0.197657),
    absCall(45641_GSM308246, A),
    detectionPValue(45641_GSM308246, 0.780518),
GSM_Measurement(16997_GSM308246),
    idRef(16997_GSM308246, 217571_at),
    value(16997_GSM308246, 0.425002),
    absCall(16997_GSM308246, A),
    detectionPValue(16997_GSM308246, 0.932373),
GSM_Measurement(27130_GSM308246),
    idRef(27130_GSM308246, 227262_at),
    value(27130_GSM308246, 23.0116),
    absCall(27130_GSM308246, A),
    detectionPValue(27130_GSM308246, 0.111572),
GSM_Measurement(51748_GSM308246),
    idRef(51748_GSM308246, 1562972_at),
    value(51748_GSM308246, 0.667154),
    absCall(51748_GSM308246, A),
    detectionPValue(51748_GSM308246, 0.533936),
GSM_Measurement(35263_GSM308246),
    

In [23]:
with open("./outputs/FOL_predicates.txt", "w") as file:
    file.write("\n".join(fol_predicates))

### Generating FOL predicates that represent the relationship between GSE and GSM

In [24]:
gse.metadata

{'title': ['Hematopoietic Progenitor Cells of Different Donor Age'],
 'geo_accession': ['GSE12277'],
 'status': ['Public on Jun 12 2009'],
 'submission_date': ['Jul 29 2008'],
 'last_update_date': ['Mar 25 2019'],
 'pubmed_id': ['19513108'],
 'summary': ['In this series we have analyzed the effect of donor age on the gene expression profile of human hematopoietic stem and progenitor cells (HPC). Cells were taken from umbilical cord blood (CB) or from G-CSF mobilized blood of healthy donors for allogeneic blood stem cell transplantation.'],
 'overall_design': ['Hematopoietic progenitor cells (HPC) were isolated within the CD34+ cell fraction from fresh human cord blood (CB) or from G-CSF mobilized peripheral blood (PB) as described in detail before (Wagner et al., Blood, 2004, 104:675-684; Wagner et al., Stem cells, 2005, 23:1180-1191; Wagner et al., 2007, 10:2638-2657). Differential gene expression was subsequently compared in the CD34+ samples of different donor age.'],
 'type': ['Exp

In [25]:
def generate_fol_relationship(metadata):
    """
    Generates First-Order Logic (FOL) representation of the GSE-GSM relationship.

    Args:
        metadata (dict): Dictionary containing GSE metadata.

    Returns:
        str: FOL statements representing GSE-GSM relationships.
    """
    gse_id = metadata.get("geo_accession", ["Unknown_GSE"])[0]  # Get GSE ID
    gsm_ids = metadata.get("sample_id", [])  # Get GSM IDs list

    fol_statements = [f"has_sample({gse_id}, {gsm});" for gsm in gsm_ids]  # Create FOL statements

    return "\n".join(fol_statements)

In [26]:
gse_metadata= gse.metadata

In [27]:
fol_output = generate_fol_relationship(gse_metadata)

In [28]:
print("Example FOL Output:")
print(fol_output)

Example FOL Output:
has_sample(GSE12277, GSM308237);
has_sample(GSE12277, GSM308239);
has_sample(GSE12277, GSM308241);
has_sample(GSE12277, GSM308242);
has_sample(GSE12277, GSM308244);
has_sample(GSE12277, GSM308245);
has_sample(GSE12277, GSM308246);
has_sample(GSE12277, GSM308247);
has_sample(GSE12277, GSM308254);
has_sample(GSE12277, GSM308255);
has_sample(GSE12277, GSM308317);
has_sample(GSE12277, GSM308461);
has_sample(GSE12277, GSM308462);
has_sample(GSE12277, GSM308463);
has_sample(GSE12277, GSM308465);
has_sample(GSE12277, GSM308467);
has_sample(GSE12277, GSM308469);
has_sample(GSE12277, GSM308470);
has_sample(GSE12277, GSM308473);


##### Generating Metta synaxed predicates that represent the relationship between GSE and GSM

In [41]:
def generate_fol_relationship_metta(metadata):
    declaration= f"""(= (has_sample $GSE $GSM)
    (add-atom &self ($GSE ((GSM $GSM))))
)\n"""
    
    gse_id = metadata.get("geo_accession", ["Unknown_GSE"])[0]  # Get GSE ID
    gsm_ids = metadata.get("sample_id", [])  # Get GSM IDs list

    fol_statements = [f"!( has_sample {gse_id} {gsm});" for gsm in gsm_ids]  # Create FOL statements

    return declaration + "\n".join(fol_statements)


In [43]:
metta_fol_output = generate_fol_relationship_metta(gse_metadata)

print("Example FOL Output:\n")
print(metta_fol_output)

Example FOL Output:

(= (has_sample $GSE $GSM)
    (add-atom &self ($GSE ((GSM $GSM))))
)
!( has_sample GSE12277 GSM308237);
!( has_sample GSE12277 GSM308239);
!( has_sample GSE12277 GSM308241);
!( has_sample GSE12277 GSM308242);
!( has_sample GSE12277 GSM308244);
!( has_sample GSE12277 GSM308245);
!( has_sample GSE12277 GSM308246);
!( has_sample GSE12277 GSM308247);
!( has_sample GSE12277 GSM308254);
!( has_sample GSE12277 GSM308255);
!( has_sample GSE12277 GSM308317);
!( has_sample GSE12277 GSM308461);
!( has_sample GSE12277 GSM308462);
!( has_sample GSE12277 GSM308463);
!( has_sample GSE12277 GSM308465);
!( has_sample GSE12277 GSM308467);
!( has_sample GSE12277 GSM308469);
!( has_sample GSE12277 GSM308470);
!( has_sample GSE12277 GSM308473);


### PLN formatted predicate generation

In [31]:
def declare_types(predicate_mapping):
    """Generate PLN type declarations based on predicate_mapping."""
    pln_types = [f"(: {pln_type} (-> Object Type))" for pln_type in predicate_mapping.values()]
    return "\n".join(pln_types)

In [32]:
def declare_instances(df, predicate_mapping):
    """Generate PLN instance declarations with relationships to uniqueId."""
    pln_instances = []
    
    for _, row in df.iterrows():
        unique_id = row["Unique_ID"]  # Unique identifier for this row
        
        # for col in df.columns:
        #     if col in predicate_mapping:  # Ensure only mapped columns are used
        #         mapped_col = predicate_mapping[col]
        #         instance = row[col]
                
        #         # Declare the instance type
        #         pln_instances.append(f"(: {instance} {mapped_col})")
        
        # pln_instances.append(f"\n") # Add newline for separation

        for col in df.columns:
            if col in predicate_mapping:  # Ensure only mapped columns are used
                mapped_col = predicate_mapping[col]
                instance = row[col]        
                
                if mapped_col == "uniqueId":
                    continue
                # Define relationship with uniqueId
                pln_instances.append(f"(: ({mapped_col} {unique_id} {instance}))")
        pln_instances.append(f"\n") # Add newline for separation
    
    return "\n".join(pln_instances)

In [33]:
type_declarations = declare_types(predicate_mapping)
print("Type Declarations:")
print(type_declarations)

Type Declarations:
(: uniqueId (-> Object Type))
(: idRef (-> Object Type))
(: value (-> Object Type))
(: absCall (-> Object Type))
(: detectionPValue (-> Object Type))


In [34]:
PLN_predicates = declare_instances(sample_data, predicate_mapping)
print(PLN_predicates)

(: (idRef 53135_GSM308246 1566624_at))
(: (value 53135_GSM308246 0.0506116))
(: (absCall 53135_GSM308246 A))
(: (detectionPValue 53135_GSM308246 0.91626))


(: (idRef 45641_GSM308246 1553441_at))
(: (value 45641_GSM308246 0.197657))
(: (absCall 45641_GSM308246 A))
(: (detectionPValue 45641_GSM308246 0.780518))


(: (idRef 16997_GSM308246 217571_at))
(: (value 16997_GSM308246 0.425002))
(: (absCall 16997_GSM308246 A))
(: (detectionPValue 16997_GSM308246 0.932373))


(: (idRef 27130_GSM308246 227262_at))
(: (value 27130_GSM308246 23.0116))
(: (absCall 27130_GSM308246 A))
(: (detectionPValue 27130_GSM308246 0.111572))


(: (idRef 51748_GSM308246 1562972_at))
(: (value 51748_GSM308246 0.667154))
(: (absCall 51748_GSM308246 A))
(: (detectionPValue 51748_GSM308246 0.533936))


(: (idRef 35263_GSM308246 235401_s_at))
(: (value 35263_GSM308246 6.09012))
(: (absCall 35263_GSM308246 A))
(: (detectionPValue 35263_GSM308246 0.165771))


(: (idRef 5574_GSM308246 205986_at))
(: (value 5574_GSM308246

In [35]:
with open("./outputs/PLN_predicates.txt", "w") as file:
    file.write(type_declarations + "\n\n" + PLN_predicates)

### Convert to metta atom

In [36]:
def declare_predicate_mapping(predicate_mapping):
    """Generate PLN row structure using predicate_mapping."""
    placeholders = " ".join(f"${value}" for key, value in predicate_mapping.items() if key != "Unique_ID")  # Exclude uniqueId
    mappings = " ".join(f"({value} ${value})" for key, value in predicate_mapping.items() if key != "Unique_ID")  # Exclude uniqueId

    return f"""(= (row $uniqueId {placeholders})
    (add-atom &self ($uniqueId ({mappings})))
)\n"""


In [37]:
def declare_instances(df, predicate_mapping):
    """Generate PLN instance declarations for each row."""
    pln_instances = []
    
    for _, row in df.iterrows():
        values = " ".join(str(row[col]) for col in predicate_mapping if col in df.columns)  # Extracting values
        pln_instances.append(f"!(row {values})")
    
    return "\n".join(pln_instances)

In [38]:
row_definition= declare_predicate_mapping(predicate_mapping)
print(row_definition)

(= (row $uniqueId $idRef $value $absCall $detectionPValue)
    (add-atom &self ($uniqueId ((idRef $idRef) (value $value) (absCall $absCall) (detectionPValue $detectionPValue))))
)



In [39]:
instances= declare_instances(sample_data, predicate_mapping)
print(instances)

!(row 53135_GSM308246 1566624_at 0.0506116 A 0.91626)
!(row 45641_GSM308246 1553441_at 0.197657 A 0.780518)
!(row 16997_GSM308246 217571_at 0.425002 A 0.932373)
!(row 27130_GSM308246 227262_at 23.0116 A 0.111572)
!(row 51748_GSM308246 1562972_at 0.667154 A 0.533936)
!(row 35263_GSM308246 235401_s_at 6.09012 A 0.165771)
!(row 5574_GSM308246 205986_at 66.5689 M 0.0461426)
!(row 16872_GSM308246 217446_x_at 72.2646 P 0.000244141)
!(row 588_GSM308246 200999_s_at 67.6351 P 0.00585938)
!(row 48622_GSM308246 1557688_at 18.4612 P 0.0239258)
!(row 6045_GSM308246 206458_s_at 2.04803 A 0.592773)
!(row 9170_GSM308246 209619_at 1951.85 P 0.000244141)
!(row 20628_GSM308246 221204_s_at 12.8425 A 0.0952148)
!(row 19480_GSM308246 220055_at 1.204 A 0.696289)
!(row 35403_GSM308246 235541_at 13.3916 A 0.067627)
!(row 40760_GSM308246 240898_at 0.77625 A 0.696289)
!(row 50359_GSM308246 1560769_at 0.873884 A 0.780518)
!(row 38589_GSM308246 238727_at 5.16244 A 0.27417)
!(row 32645_GSM308246 232779_at 13.6453 P

In [40]:
with open("./outputs/Metta.txt", "w") as file:
    file.write(row_definition + "\n\n" + instances)