In [2]:
import csv
import pandas as pd

In [2]:
# Column names passed to pd.read_csv(names=...) for the original predication
# and predication aux files, listed one per line in file order.
predication_headers = [
    'PREDICATION_ID',
    'SENTENCE_ID',
    'PMID',
    'PREDICATE',
    'SUBJECT_CUI',
    'SUBJECT_NAME',
    'SUBJECT_SEMTYPE',
    'SUBJECT_NOVELTY',
    'OBJECT_CUI',
    'OBJECT_NAME',
    'OBJECT_SEMTYPE',
    'OBJECT_NOVELTY',
    'FACT_VALUE_CHAR',
    'MOD_SCALE_CHAR',
    'MOD_VALUE_FLOAT',
]

predication_aux_headers = [
    'PREDICATION_AUX_ID',
    'PREDICATION_ID',
    'SUBJECT_TEXT',
    'SUBJECT_DIST',
    'SUBJECT_MAXDIST',
    'SUBJECT_START_INDEX',
    'SUBJECT_END_INDEX',
    'SUBJECT_SCORE',
    'INDICATOR_TYPE',
    'PREDICATE_START_INDEX',
    'PREDICATE_END_INDEX',
    'OBJECT_TEXT',
    'OBJECT_DIST',
    'OBJECT_MAXDIST',
    'OBJECT_START_INDEX',
    'OBJECT_END_INDEX',
    'OBJECT_SCORE',
    'CURR_TIMESTAMP',
]

# Creating Nodes
The following code creates two CSVs: one for concept (entity) nodes and one for predication nodes.

In [None]:
# Read the original CSV files.
# Both exports share the same parsing options: ISO-8859-1 bytes, malformed
# lines reported as warnings, and the literal \N marker treated as missing.
raw_csv_options = dict(encoding='ISO-8859-1', on_bad_lines='warn', na_values=['\\N'])

df = pd.read_csv('semmed_data/predication.csv', names=predication_headers, **raw_csv_options)

df_aux = pd.read_csv('semmed_data/predication_aux.csv', names=predication_aux_headers, **raw_csv_options)

In [None]:
df.head(5)

In [None]:
df_aux.head(5)

In [4]:
# Inner join on PREDICATION_ID: only predications with a matching aux row survive.
merged_df = df.merge(df_aux, how='inner', on='PREDICATION_ID')

In [None]:
# Create predication_df from both main and aux dataframes:
# the base predication columns plus the predicate indicator/offset columns.
predication_base_columns = ['PREDICATION_ID', 'SENTENCE_ID', 'PMID', 'PREDICATE',
                          'SUBJECT_CUI', 'OBJECT_CUI']
predication_aux_columns = ['PREDICATION_ID', 'INDICATOR_TYPE',
                         'PREDICATE_START_INDEX', 'PREDICATE_END_INDEX']

# Left join so predications without an aux row are kept (their aux fields are
# NaN), then drop rows that are identical across every column.
# The former post-merge rename of `aux_info` was removed: it ran after the
# merge and its result was never used.
predication_df = (
    df[predication_base_columns]
    .merge(df_aux[predication_aux_columns], on='PREDICATION_ID', how='left')
    .drop_duplicates()
)

In [None]:
predication_df

In [7]:
# Save predication_df to CSV with each field in quotes
# header=False / index=False: the file holds data rows only, so anything that
# reads it back has to supply the column names itself.
predication_df.to_csv('data/predication.csv', index=False, quoting=csv.QUOTE_ALL, header=False)

In [4]:
# Create concept_df from main and aux dataframes.
# Subject-side columns of the main frame; PREDICATION_ID is carried as the join key.
subject_base_columns = ['PREDICATION_ID'] + [
    f'SUBJECT_{suffix}' for suffix in ('CUI', 'NAME', 'SEMTYPE', 'NOVELTY')
]
subject_base = df.loc[:, subject_base_columns].copy()

# Subject-side columns of the aux frame, again keyed by PREDICATION_ID.
subject_aux_columns = ['PREDICATION_ID'] + [
    f'SUBJECT_{suffix}'
    for suffix in ('TEXT', 'DIST', 'MAXDIST', 'START_INDEX', 'END_INDEX', 'SCORE')
]
subject_aux = df_aux.loc[:, subject_aux_columns].copy()

In [5]:
# Object-side columns of the main predication frame (PREDICATION_ID is the join key).
object_base_columns = ['PREDICATION_ID'] + [
    f'OBJECT_{suffix}' for suffix in ('CUI', 'NAME', 'SEMTYPE', 'NOVELTY')
]
object_base = df.loc[:, object_base_columns].copy()

# Object-side columns of the aux frame.
object_aux_columns = ['PREDICATION_ID'] + [
    f'OBJECT_{suffix}'
    for suffix in ('TEXT', 'DIST', 'MAXDIST', 'START_INDEX', 'END_INDEX', 'SCORE')
]
object_aux = df_aux.loc[:, object_aux_columns].copy()

In [6]:
# Merge base and aux for subjects and objects
# merge() defaults to an inner join on PREDICATION_ID; rows that are exact
# duplicates across all columns are then dropped.
subject_entities = subject_base.merge(subject_aux, on='PREDICATION_ID').drop_duplicates()

In [7]:
# Same inner join + exact-duplicate removal for the object side.
object_entities = object_base.merge(object_aux, on='PREDICATION_ID').drop_duplicates()

In [None]:
# Rename columns to prepare for merging.
# Both frames still carry the PREDICATION_ID join key as their first column, so
# the new labels must cover 11 columns; assigning only the 10 concept names
# raised a length-mismatch ValueError. The key column is named 'CUI' (not
# 'CUI:ID') because the de-duplication step that follows uses subset=['CUI'].
concept_columns = ['CUI', 'NAME', 'SEMTYPE', 'NOVELTY', 'TEXT',
                  'DIST', 'MAXDIST', 'START_INDEX', 'END_INDEX', 'SCORE']
entity_columns = ['PREDICATION_ID'] + concept_columns
subject_entities.columns = entity_columns
object_entities.columns = entity_columns

In [None]:
# Stack subject and object rows, then keep the first row encountered per CUI.
all_entities = [subject_entities, object_entities]
concept_df = pd.concat(all_entities).drop_duplicates(subset=['CUI'])

In [None]:
# Remove the PREDICATION_ID column from the combined concept table.
concept_df = concept_df.drop(columns=['PREDICATION_ID'])

In [5]:
# Role-neutral concept column names; the SUBJECT_* source columns are the same
# names with a 'SUBJECT_' prefix, in the same order.
concept_columns = ['CUI', 'NAME', 'SEMTYPE', 'NOVELTY', 'TEXT',
                 'DIST', 'MAXDIST', 'START_INDEX', 'END_INDEX', 'SCORE']
subject_columns = [f'SUBJECT_{name}' for name in concept_columns]

# Extract the distinct subject rows, then relabel them so they can be stacked
# with the object rows.
subject_entities = merged_df.loc[:, subject_columns].drop_duplicates()
subject_entities.columns = concept_columns

In [6]:
# (Intentionally no code.) This cell was a verbatim copy of the previous one,
# which already builds `subject_columns`, `concept_columns` and
# `subject_entities` from merged_df; repeating it only re-ran the
# drop_duplicates() pass over the merged frame.

In [None]:
# Object-side counterpart of the subject extraction: same ten fields, OBJECT_ prefix.
object_columns = [
    'OBJECT_' + suffix
    for suffix in ('CUI', 'NAME', 'SEMTYPE', 'NOVELTY', 'TEXT', 'DIST',
                   'MAXDIST', 'START_INDEX', 'END_INDEX', 'SCORE')
]

# Distinct object rows, relabelled with the role-neutral concept names.
object_entities = (
    merged_df[object_columns]
    .drop_duplicates()
    .set_axis(concept_columns, axis=1)
)

# One row per CUI: subjects are stacked first, and the first row seen for a
# CUI is the one that is kept.
concept_df = pd.concat(
    [subject_entities, object_entities]
).drop_duplicates(subset=['CUI'])

In [9]:
# Write the concept table fully quoted, with no index column and no header row.
concept_df.to_csv('data/concept.csv', index=False, quoting=csv.QUOTE_ALL, header=False)

In [None]:
# Extract object entities using the same structure.
# The CUI column in merged_df is 'OBJECT_CUI'; the Neo4j-style label
# 'object_cui:STRING' does not exist there and raised a KeyError.
object_columns = ['OBJECT_CUI', 'OBJECT_NAME', 'OBJECT_SEMTYPE', 'OBJECT_NOVELTY',
                 'OBJECT_TEXT', 'OBJECT_DIST', 'OBJECT_MAXDIST',
                 'OBJECT_START_INDEX', 'OBJECT_END_INDEX', 'OBJECT_SCORE']

object_entities = merged_df[object_columns].drop_duplicates()
object_entities.columns = concept_columns

# Combine subject and object entities and remove duplicates based on CUI
concept_df = pd.concat([subject_entities, object_entities]).drop_duplicates(subset=['CUI'])

In [None]:
# Report (rows, columns) of the two node tables.
print(f"Predication dataframe shape: {predication_df.shape}")
print(f"Entity dataframe shape: {concept_df.shape}")

In [None]:
# NOTE(review): unlike data/predication.csv, this copy is written with the
# default header row and index column — confirm that is what the cells reading
# "predication.csv" expect.
predication_df.to_csv("predication.csv")

In [14]:
# NOTE(review): unlike data/concept.csv, this copy is written with the default
# header row and index column — confirm that is what the cells reading
# "concept.csv" expect.
concept_df.to_csv("concept.csv")

# Creating Relationships
The following code will create a CSV with all the connections between the concepts and predicates in a format that is easily digestible by Neo4j.

In [None]:
# Reload the header-less predication export, supplying the nine column names explicitly.
predication_df = pd.read_csv("data/predication.csv", names=['PREDICATION_ID','SENTENCE_ID','PMID','PREDICATE','SUBJECT_CUI','OBJECT_CUI','INDICATOR_TYPE','PREDICATE_START_INDEX','PREDICATE_END_INDEX'])

In [None]:
predication_df

In [14]:
# Create an empty dataframe for connections
# This only fixes the three-column layout (source, destination, label);
# the frame starts with no rows.
connections_columns = ['src_node', 'dest_node', 'label']
connections_df = pd.DataFrame(columns=connections_columns)

In [15]:
def _edge_frame(src_node, dest_node, label):
    """Assemble one group of edges in the shared src_node/dest_node/label layout."""
    return pd.DataFrame({'src_node': src_node, 'dest_node': dest_node, 'label': label})


# 1. Connections between predication instances and subjects (inst_subject)
inst_subject_connections = _edge_frame(
    predication_df['PREDICATION_ID'], predication_df['SUBJECT_CUI'], 'inst_subject'
)

# 2. Connections between predication instances and objects (inst_object)
inst_object_connections = _edge_frame(
    predication_df['PREDICATION_ID'], predication_df['OBJECT_CUI'], 'inst_object'
)

# 3. Connections between subjects and objects (using PREDICATE as the label)
subject_object_connections = _edge_frame(
    predication_df['SUBJECT_CUI'], predication_df['OBJECT_CUI'], predication_df['PREDICATE']
)

In [16]:
# Stack the three edge groups (instance->subject, instance->object,
# subject->object) into the final connections dataframe.
edge_groups = [inst_subject_connections, inst_object_connections, subject_object_connections]
connections_df = pd.concat(edge_groups)

In [None]:
# Reset the index for the final dataframe
# drop=True discards the old index values instead of keeping them as a column.
connections_df = connections_df.reset_index(drop=True)

# Display the result
print(f"Connections dataframe shape: {connections_df.shape}")
print(connections_df.head(10))

In [None]:
# Written fully quoted, without index or header row.
# NOTE(review): this writes to "connections.csv" in the working directory,
# while other cells read "data/connections.csv" — confirm the intended path.
connections_df.to_csv("connections.csv", index=False, quoting=csv.QUOTE_ALL, header=False)

# Node Structure
The Neo4j import requires the CSV header to declare each column's data type, so here I relabel the columns of the exported CSVs accordingly.

In [None]:
# For entity/concept file
# concept_df = pd.read_csv("data/concept.csv", index_col=0)
# concept_df.columns = [
#     "cui", 
#     "name:STRING", 
#     "semtype:LABEL", 
#     "novelty:FLOAT", 
#     "text:STRING", 
#     "dist", 
#     "maxdist", 
#     "start_index", 
#     "end_index", 
#     "score", 
#     "uuid"
# ]
# concept_df.to_csv("entity_neo4j.csv", index=False)

# For predication file - now with predicate as LABEL
# data/predication.csv is written with header=False, so it must be read with
# header=None; with the default header=0 the first data row was consumed as
# column names and silently lost when the columns were relabelled.
predication_df = pd.read_csv("data/predication.csv", header=None)
predication_df.columns = [
    "predication_id:ID",
    "sentence_id",
    "pmid:STRING",
    "predicate:LABEL",  # Changed from STRING to LABEL
    "subject_cui:STRING",
    "object_cui:STRING",
    "indicator_type:STRING",
    "predicate_start_index",
    "predicate_end_index"
]
predication_df.to_csv("predication_neo4j.csv", index=False)

# For connections/relationships file
# The connections export is likewise written with header=False.
connections_df = pd.read_csv("data/connections.csv", header=None)
connections_df.columns = [
    ":START_ID",
    ":END_ID",
    ":TYPE"
]
connections_df.to_csv("relationships_neo4j.csv", index=False)

# Verification and Validation

In [None]:
# Reload the three tables from the working directory, taking the first row of
# each file as the header.
# NOTE(review): the statistics cell below indexes ':TYPE', ':START_ID', ':END_ID'
# and 'predicate:LABEL' — confirm these files carry the Neo4j-style headers.
concept_df = pd.read_csv("concept.csv")
predication_df = pd.read_csv("predication.csv")
connections_df = pd.read_csv("connections.csv")

In [None]:
# Verification of nodes, relationships and properties
def _print_property_coverage(frame):
    """Print, per column, the non-null count and its share of all rows."""
    total_rows = len(frame)
    for col in frame.columns:
        prop_name = col.split(':')[0]  # strip the Neo4j ':TYPE' suffix
        non_null = frame[col].count()
        print(f"  - {prop_name}: {non_null} non-null values ({non_null/total_rows:.2%} coverage)")


def _print_degree_stats(counts):
    """Print mean / median / max of a per-node connection count series."""
    print(f"  - Mean connections per node: {counts.mean():.2f}")
    print(f"  - Median connections per node: {counts.median():.2f}")
    print(f"  - Max connections: {counts.max()}")


print("\n=== GRAPH STATISTICS ===")

# Count of nodes by type
print("\nNODE COUNTS:")
num_nodes = len(concept_df) + len(predication_df)
print(f"Concept/Entity nodes: {len(concept_df)}")
print(f"Predication nodes: {len(predication_df)}")
print(f"Total nodes: {num_nodes}")

# Count of relationships by type
print("\nRELATIONSHIP COUNTS:")
relationship_counts = connections_df[':TYPE'].value_counts()
num_edges = len(connections_df)
print("Top 10 relationship types:")
print(relationship_counts.head(10))
print(f"Total relationships: {num_edges}")

# Count of unique predicates
print("\nUNIQUE PREDICATES:")
predicate_column = predication_df['predicate:LABEL']
unique_predicates = predicate_column.nunique()
print(f"Number of unique predicates: {unique_predicates}")
print("Most common predicates:")
print(predicate_column.value_counts().head(10))

# Property statistics
print("\nPROPERTY STATISTICS:")
print("Entity properties:")
_print_property_coverage(concept_df)

print("\nPredication properties:")
_print_property_coverage(predication_df)

# Graph density analysis
print("\nGRAPH DENSITY ANALYSIS:")
max_possible_edges = num_nodes * (num_nodes - 1) / 2  # for undirected graph
graph_density = num_edges / max_possible_edges
print(f"Graph density: {graph_density:.8f}")

# Distribution of connections per node
print("\nCONNECTION DISTRIBUTION:")
src_connections = connections_df[':START_ID'].value_counts()
dest_connections = connections_df[':END_ID'].value_counts()

print("Source node connection statistics:")
_print_degree_stats(src_connections)

print("Destination node connection statistics:")
_print_degree_stats(dest_connections)

In [None]:
import pandas as pd
import numpy as np

def _pct(count, total):
    """Format count/total as a percentage; 'n/a' when total is zero."""
    if not total:
        return "n/a"
    return f"{count/total:.2%}"


def _load_validated_csv(path, required_columns):
    """Read a CSV and fail early, naming the file, if a key column is absent."""
    frame = pd.read_csv(path, low_memory=False)
    absent = [col for col in required_columns if col not in frame.columns]
    if absent:
        raise KeyError(f"{path} is missing required column(s): {absent}")
    return frame


def _dtype_is_compatible(series, expected_type):
    """Return True if the column already has, or can be cast to, expected_type."""
    actual_type = series.dtype
    if expected_type in (int, float) and pd.api.types.is_numeric_dtype(actual_type):
        return True
    if expected_type is str and pd.api.types.is_string_dtype(actual_type):
        return True
    # Mixed/other dtypes: valid only if a full cast succeeds.
    try:
        series.astype(expected_type)
    except (ValueError, TypeError, OverflowError):
        return False
    return True


def _report_column_types(frame, expected_dtypes):
    """Print one pass/fail line per expected column of a node table."""
    for col, expected_type in expected_dtypes.items():
        if col not in frame.columns:
            print(f"  - {col}: not present in file, skipped")
            continue
        series = frame[col]
        is_valid = _dtype_is_compatible(series, expected_type)
        print(f"  - {col}: Expected {expected_type.__name__}, Got {series.dtype} - {'✓' if is_valid else '✗'}")

        if not is_valid:
            # Show sample of problematic values
            print(f"    Sample values: {series.head(3).tolist()}")
            if expected_type == int:
                non_int_mask = ~series.astype(str).str.match(r'^-?\d+$', na=False)
                print(f"    Problematic non-integer values: {series.loc[non_int_mask].head(3).tolist()}")


def _report_connection_types(connections_df, expected_dtypes):
    """Print type checks, samples and a numeric/non-numeric split for edge columns."""
    id_columns = (':START_ID', ':END_ID')
    for col, expected_type in expected_dtypes.items():
        if col not in connections_df.columns:
            print(f"  - {col}: not present in file, skipped")
            continue
        series = connections_df[col]
        is_valid = _dtype_is_compatible(series, expected_type)
        if col in id_columns and is_valid and not pd.api.types.is_string_dtype(series.dtype):
            print(f"    Note: {col} will need conversion to string type")

        print(f"  - {col}: Expected {expected_type.__name__}, Got {series.dtype} - {'✓' if is_valid else '✗'}")
        print(f"    Sample values: {series.head(3).tolist()}")

        if col in id_columns:
            # Missing values are excluded first: NaN is a float and would
            # otherwise be counted as a "numeric" ID.
            present = series.dropna()
            num_values = len(present)
            num_numeric = int(present.apply(
                lambda x: isinstance(x, (int, float)) or (isinstance(x, str) and x.isdigit())
            ).sum())
            num_string = num_values - num_numeric
            print(f"    Value type breakdown: {num_numeric} numeric ({_pct(num_numeric, num_values)}), "
                  f"{num_string} non-numeric ({_pct(num_string, num_values)})")


def _report_dangling(label, referenced_ids, known_ids):
    """Print and return how many referenced IDs are not known node IDs."""
    dangling = referenced_ids - known_ids
    print(f"\nInvalid {label} nodes in relationships: {len(dangling)} ({_pct(len(dangling), len(referenced_ids))})")
    if dangling:
        # Always show a bounded, sorted sample (previously shown only when
        # there were at most 10 invalid nodes).
        print(f"  Sample of invalid {label} nodes: {sorted(dangling)[:10]}")
    return len(dangling)


def validate_semmed_data(concept_path="concept.csv",
                         predication_path="predication.csv",
                         connections_path="connections.csv"):
    """
    Validate the SemMedDB processed data for Neo4j import:
    1. Check data types for all columns
    2. Verify node uniqueness (no duplicates)
    3. Verify relationship integrity (start/end nodes exist)
    4. Check for missing values in key fields

    Args:
        concept_path: CSV of concept nodes; must contain a 'cui:ID' column.
        predication_path: CSV of predication nodes; must contain
            'predication_id:ID', 'subject_cui:STRING' and 'object_cui:STRING'.
        connections_path: CSV of relationships; must contain ':START_ID',
            ':END_ID' and ':TYPE'.

    Returns:
        None. The report is printed to stdout.

    Raises:
        KeyError: if one of the required key columns is missing from a file.
    """
    print("Starting SemMedDB data validation...\n")

    # Load the three dataframes with low_memory=False to avoid dtype warnings
    print("Loading data...")
    concept_df = _load_validated_csv(concept_path, ['cui:ID'])
    predication_df = _load_validated_csv(
        predication_path,
        ['predication_id:ID', 'subject_cui:STRING', 'object_cui:STRING'],
    )
    connections_df = _load_validated_csv(connections_path, [':START_ID', ':END_ID', ':TYPE'])

    # ===== 1. DATA TYPES VALIDATION =====
    print("\n===== DATA TYPES VALIDATION =====")

    # Expected data types for concept_df
    concept_dtypes = {
        'cui:ID': str,
        'name:STRING': str,
        'semtype:LABEL': str,
        'novelty:FLOAT': float,
        'text:STRING': str,
        'dist:INTEGER': int,
        'maxdist:INTEGER': int,
        'start_index:INTEGER': int,
        'end_index:INTEGER': int,
        'score:INTEGER': int
    }

    # Expected data types for predication_df
    predication_dtypes = {
        'predication_id:ID': int,
        'sentence_id:INTEGER': int,
        'pmid:STRING': str,
        'predicate:LABEL': str,
        'subject_cui:STRING': str,
        'object_cui:STRING': str,
        'indicator_type:STRING': str,
        'predicate_start_index:INTEGER': int,
        'predicate_end_index:INTEGER': int
    }

    # Expected data types for connections_df
    connections_dtypes = {
        ':START_ID': str,
        ':END_ID': str,
        ':TYPE': str
    }

    print("\nChecking concept_df data types:")
    _report_column_types(concept_df, concept_dtypes)

    print("\nChecking predication_df data types:")
    _report_column_types(predication_df, predication_dtypes)

    print("\nChecking connections_df data types:")
    _report_connection_types(connections_df, connections_dtypes)

    # ===== 2. MISSING VALUES VALIDATION =====
    print("\n===== MISSING VALUES VALIDATION =====")

    n_concepts = len(concept_df)
    n_predications = len(predication_df)
    n_connections = len(connections_df)

    print("\nConcept DataFrame Missing Values in Key Fields:")
    missing_concept = concept_df['cui:ID'].isnull().sum()
    print(f"  - Primary key 'cui:ID': {missing_concept} missing values ({_pct(missing_concept, n_concepts)})")

    print("\nPredication DataFrame Missing Values in Key Fields:")
    missing_pred_id = predication_df['predication_id:ID'].isnull().sum()
    missing_subject = predication_df['subject_cui:STRING'].isnull().sum()
    missing_object = predication_df['object_cui:STRING'].isnull().sum()
    print(f"  - Primary key 'predication_id:ID': {missing_pred_id} missing values ({_pct(missing_pred_id, n_predications)})")
    print(f"  - Foreign key 'subject_cui:STRING': {missing_subject} missing values ({_pct(missing_subject, n_predications)})")
    print(f"  - Foreign key 'object_cui:STRING': {missing_object} missing values ({_pct(missing_object, n_predications)})")

    print("\nConnections DataFrame Missing Values in Key Fields:")
    missing_src = connections_df[':START_ID'].isnull().sum()
    missing_dest = connections_df[':END_ID'].isnull().sum()
    missing_label = connections_df[':TYPE'].isnull().sum()
    print(f"  - ':START_ID': {missing_src} missing values ({_pct(missing_src, n_connections)})")
    print(f"  - ':END_ID': {missing_dest} missing values ({_pct(missing_dest, n_connections)})")
    print(f"  - ':TYPE': {missing_label} missing values ({_pct(missing_label, n_connections)})")

    # ===== 3. NODE UNIQUENESS VALIDATION =====
    print("\n===== NODE UNIQUENESS VALIDATION =====")

    duplicate_cuis = concept_df['cui:ID'].duplicated().sum()
    print(f"Duplicate CUIs in concept_df: {duplicate_cuis} ({_pct(duplicate_cuis, n_concepts)})")

    duplicate_preds = predication_df['predication_id:ID'].duplicated().sum()
    print(f"Duplicate PREDICATION_IDs in predication_df: {duplicate_preds} ({_pct(duplicate_preds, n_predications)})")

    # ===== 4. RELATIONSHIP INTEGRITY VALIDATION =====
    print("\n===== RELATIONSHIP INTEGRITY VALIDATION =====")

    # IDs are compared as strings. Missing values are dropped BEFORE the cast:
    # casting first turns NaN into the string "nan", which dropna() can no
    # longer remove and which would be reported as an invalid node.
    concept_ids = set(concept_df['cui:ID'].dropna().astype(str))
    predication_ids = set(predication_df['predication_id:ID'].dropna().astype(str))
    all_valid_nodes = concept_ids | predication_ids

    src_nodes = set(connections_df[':START_ID'].dropna().astype(str))
    dest_nodes = set(connections_df[':END_ID'].dropna().astype(str))

    invalid_src_count = _report_dangling("source", src_nodes, all_valid_nodes)
    invalid_dest_count = _report_dangling("destination", dest_nodes, all_valid_nodes)

    # ===== 5. SUMMARY =====
    print("\n===== VALIDATION SUMMARY =====")

    # Count total issues
    total_issues = (missing_concept + missing_pred_id + missing_subject + missing_object +
                    missing_src + missing_dest + missing_label +
                    duplicate_cuis + duplicate_preds +
                    invalid_src_count + invalid_dest_count)

    if total_issues == 0:
        print("✅ All validations passed! Data appears clean and ready for Neo4j import.")
        return

    print(f"❌ Found {total_issues} total issues that may affect your Neo4j import.")

    # Recommend fixes based on issues found
    print("\nRecommended fixes:")

    if missing_concept + missing_pred_id > 0:
        print("  - Remove rows with missing primary keys (cui:ID or predication_id:ID)")

    if missing_subject + missing_object > 0:
        print("  - Fix or remove predications with missing subject_cui:STRING or object_cui:STRING")

    if missing_src + missing_dest + missing_label > 0:
        print("  - Remove relationships with missing :START_ID, :END_ID, or :TYPE")

    if duplicate_cuis > 0:
        print("  - Remove duplicate CUIs or merge their properties")

    if duplicate_preds > 0:
        print("  - Remove duplicate predication_id:IDs")

    if invalid_src_count + invalid_dest_count > 0:
        print("  - Remove relationships with invalid source or destination nodes")

    # Check if START_ID and END_ID need type conversion
    if not pd.api.types.is_string_dtype(connections_df[':START_ID'].dtype) or \
       not pd.api.types.is_string_dtype(connections_df[':END_ID'].dtype):
        print("  - Convert ':START_ID' and ':END_ID' columns to string type (see fix_connections_dtypes function)")

def fix_connections_dtypes(file_path="connections.csv", output_path="connections_fixed.csv"):
    """
    Fix data types in the connections dataframe:
    - Load :START_ID and :END_ID as strings
    - Save the fixed version

    The ID columns are parsed as text directly instead of being cast with
    astype(str) after loading: the cast turned missing IDs into the literal
    string "nan" and rendered IDs from a float-inferred column as e.g. "1.0".
    Missing IDs now stay missing and are written as empty fields.

    Args:
        file_path: connections CSV with ':START_ID' and ':END_ID' columns.
        output_path: where the fixed CSV is written.

    Returns:
        The loaded dataframe with string-typed ID columns.
    """
    # Load the connections CSV, reading both ID columns as text
    connections_df = pd.read_csv(
        file_path,
        dtype={':START_ID': str, ':END_ID': str},
        low_memory=False,
    )

    # Save the fixed version
    connections_df.to_csv(output_path, index=False)
    print(f"Fixed connections data saved to '{output_path}'")
    return connections_df

# Run the validation report against the CSVs in the working directory.
validate_semmed_data()

In [None]:
# changing the :ID col to uuid instead of cui since some of them are compound cuis
import pandas as pd
import uuid

# Function to generate UUID
def generate_uuid():
    """Return a freshly generated random (version 4) UUID as a string."""
    fresh_id = uuid.uuid4()
    return str(fresh_id)

# Read the files
concept_df = pd.read_csv("data/concept.csv")
connections_df = pd.read_csv("data/connections.csv")

# Add UUID column to concept_df and rename CUI column
concept_df[':ID'] = [generate_uuid() for _ in range(len(concept_df))]
concept_df = concept_df.rename(columns={'cui:ID': 'cui:STRING'})

# Create a mapping dictionary from CUI to UUID
cui_to_uuid = dict(zip(concept_df['cui:STRING'], concept_df[':ID']))

# Replace CUIs (including compound CUIs) by their UUID in BOTH endpoint
# columns. Subject->object relationships carry a CUI in :START_ID as well, so
# remapping only :END_ID left those edges pointing at IDs that no longer exist.
for id_column in (':START_ID', ':END_ID'):
    # astype(str) only for building the mask, so non-string IDs cannot break .str
    cui_mask = connections_df[id_column].astype(str).str.contains(r'C\d+', regex=True)
    remapped = connections_df.loc[cui_mask, id_column].map(cui_to_uuid)
    unmapped = int(remapped.isna().sum())
    if unmapped:
        print(f"Warning: {unmapped} CUI value(s) in {id_column} have no concept row and were set to missing")
    connections_df.loc[cui_mask, id_column] = remapped

# Save the modified files
concept_df.to_csv("concept_updated.csv", index=False)
connections_df.to_csv("connections_updated.csv", index=False)

# Display sample of the changes
print("\nUpdated concept.csv sample:")
print(concept_df.head(3).to_string())
print("\nUpdated connections.csv sample:")
print(connections_df.head(3).to_string())

In [None]:
# Check for predication rows where subject_cui or object_cui doesn't match the C******* pattern
import pandas as pd
import re

# Read the predication data
# NOTE(review): read with the default header handling, and the checks below
# index 'subject_cui:STRING' / 'object_cui:STRING' — confirm this file carries
# the Neo4j-style header row.
predication_df = pd.read_csv("data/predication.csv")

# Define a function to check if a CUI follows the standard format
def is_not_standard_cui(cui_str):
    """Return True when a CUI value is missing or not in the standard format.

    A value is standard when it is a single 'C' followed by digits, or a
    '|'-separated compound in which every part is either such a CUI or purely
    numeric. A standalone purely numeric value is flagged as non-standard.
    """
    # Missing values (None / NaN) are always flagged
    if pd.isna(cui_str):
        return True

    text = str(cui_str)
    plain_cui = re.compile(r'^C\d+$')

    if plain_cui.match(text):
        return False

    # Not a plain CUI and not a compound one either
    if '|' not in text:
        return True

    # Compound CUI: flag it as soon as one part is neither C####### nor numeric
    for piece in text.split('|'):
        if plain_cui.match(piece) or piece.isdigit():
            continue
        return True
    return False

# Flag each row once per role, then select the flagged rows
subject_is_flagged = predication_df['subject_cui:STRING'].apply(is_not_standard_cui)
object_is_flagged = predication_df['object_cui:STRING'].apply(is_not_standard_cui)

# Rows where subject_cui doesn't match the pattern
non_standard_subject = predication_df[subject_is_flagged]

# Rows where object_cui doesn't match the pattern
non_standard_object = predication_df[object_is_flagged]

# Display the results: the offending rows, or a note that there are none
print("Rows where subject_cui doesn't match the C******* pattern:")
print(non_standard_subject if len(non_standard_subject) > 0
      else "No rows found with non-standard subject_cui format.")

print("\nRows where object_cui doesn't match the C******* pattern:")
print(non_standard_object if len(non_standard_object) > 0
      else "No rows found with non-standard object_cui format.")

# Count of non-standard CUIs
subject_total = len(non_standard_subject)
object_total = len(non_standard_object)
print(f"\nTotal rows with non-standard subject_cui: {subject_total}")
print(f"Total rows with non-standard object_cui: {object_total}")

In [None]:
import pandas as pd
import re

# Read the raw predication data (this rebinds `df`)
df = pd.read_csv("semmed_data/predication.csv", 
                 names=predication_headers,
                 encoding='ISO-8859-1',  # decode the export as ISO-8859-1 (Latin-1)
                 on_bad_lines='warn',    # report malformed lines as warnings instead of raising
                 na_values=['\\N'])      # treat the literal \N marker as a missing value

def is_non_standard_cui(cui):
    """Return True when a CUI value is missing or has a non-standard part.

    The value is split on '|'; every part must be either a 'C' followed by
    digits or purely numeric (like "3075" in compound CUIs). A single purely
    numeric value is therefore accepted.
    """
    # Handle NaN/None values
    if pd.isna(cui):
        return True

    return any(
        re.match(r'^C\d+$', token) is None and not token.isdigit()
        for token in str(cui).split('|')
    )

# Find non-standard CUIs: flag each row per role, then select the flagged rows
subject_is_flagged = df['SUBJECT_CUI'].apply(is_non_standard_cui)
object_is_flagged = df['OBJECT_CUI'].apply(is_non_standard_cui)
non_standard_subjects = df[subject_is_flagged]
non_standard_objects = df[object_is_flagged]

# Print summary
print("=== Non-standard CUI Analysis ===")
print(f"\nTotal rows in dataset: {len(df)}")
print(f"Rows with non-standard subject CUIs: {len(non_standard_subjects)}")
print(f"Rows with non-standard object CUIs: {len(non_standard_objects)}")

flagged_by_role = {'SUBJECT': non_standard_subjects, 'OBJECT': non_standard_objects}

# Show sample of non-standard entries (first five rows per role)
for role, frame in flagged_by_role.items():
    print(f"\nSample of non-standard {role.lower()} CUIs:")
    print(frame[['PREDICATION_ID', f'{role}_CUI', f'{role}_NAME']].head())

# Get unique patterns of non-standard CUIs (first ten distinct values per role)
for role, frame in flagged_by_role.items():
    print(f"\nUnique patterns of non-standard {role.lower()} CUIs:")
    print(frame[f'{role}_CUI'].unique()[:10])

In [None]:
import csv

# Output all non-standard CUIs to a CSV file
output_file = "non_standard_cuis.csv"

# Get all unique non-standard CUIs
subject_cuis = non_standard_subjects['SUBJECT_CUI'].dropna().unique().tolist()
object_cuis = non_standard_objects['OBJECT_CUI'].dropna().unique().tolist()

# Combine all unique non-standard CUIs. Sorted so the file is identical from
# run to run (plain set iteration order is not stable); key=str keeps the sort
# well-defined even if a value was parsed as a number.
all_non_standard_cuis = sorted(set(subject_cuis + object_cuis), key=str)

# Write to CSV file (one CUI per line), with an explicit encoding
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['cui'])  # Header
    writer.writerows([cui] for cui in all_non_standard_cuis)

print(f"All {len(all_non_standard_cuis)} unique non-standard CUIs have been written to {output_file}")


In [None]:
# Search for a specific CUI in the predication.csv file
import pandas as pd

import re

# Define the CUI to search for
search_cui = "7523"

# Read the predication.csv file
# Note: Adjust the file path if needed
pred_df = pd.read_csv("semmed_data/predication.csv",
                names=predication_headers,
                encoding='ISO-8859-1',  # decode the export as ISO-8859-1 (Latin-1)
                on_bad_lines='warn',    # report malformed lines as warnings instead of raising
                na_values=['\\N'])      # treat the literal \N marker as a missing value

# Match the CUI only as a whole '|'-separated component of the subject or
# object column. A plain substring search would also hit unrelated IDs that
# merely contain the digits (e.g. "7523" inside "C0075231"), and an unescaped
# search term would be interpreted as a regular expression.
cui_pattern = rf'(?:^|\|){re.escape(search_cui)}(?:\||$)'
matching_rows = pred_df[
    pred_df['SUBJECT_CUI'].astype(str).str.contains(cui_pattern, regex=True)
    | pred_df['OBJECT_CUI'].astype(str).str.contains(cui_pattern, regex=True)
]

if len(matching_rows) > 0:
    print(f"Found {len(matching_rows)} rows containing CUI '{search_cui}':")
    print(matching_rows)
else:
    print(f"No rows found containing CUI '{search_cui}'")

In [None]:
import csv
import pandas as pd
from pathlib import Path

def process_predication_csv(input_path='data/predication.csv',
                            output_path='data/predication_prefixed.csv',
                            header_path='data/predication_header.csv',
                            chunk_size=100000,
                            cui_positions=(4, 5)):
    """Copy the predication CSV, prefixing every non-'C' CUI with 'K'.

    The header row is taken from `header_path` and written first; the
    header-less `input_path` is then streamed in chunks. In the columns at
    `cui_positions` (subject and object CUI by default) every non-empty value
    that does not start with 'C' gets a 'K' prefix.

    All fields are read as text with NA detection disabled, so that
    - purely numeric CUIs are prefixed as well (when parsed as numbers they
      failed the string check and were skipped),
    - empty fields stay empty instead of being written as the literal "nan",
    - the remaining columns are copied through without being re-formatted.

    Args:
        input_path: header-less predication CSV to read.
        output_path: destination CSV (header row + processed rows).
        header_path: CSV whose first row holds the column names.
        chunk_size: number of rows processed per chunk.
        cui_positions: zero-based positions of the CUI columns to prefix.
    """
    # Read the predication header to get column names
    with open(header_path, 'r', newline='', encoding='utf-8') as header_file:
        headers = next(csv.reader(header_file))

    # Process the predication file
    print("Processing predication.csv to add K prefix to non-C CUIs...")

    chunks_processed = 0
    examples_shown = 0
    max_examples = 5

    with open(output_path, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(headers)  # Write header row

        # Read the file in chunks to handle large files efficiently
        chunks = pd.read_csv(input_path, names=headers, chunksize=chunk_size,
                             dtype=str, keep_default_na=False)
        for chunk in chunks:
            modified_cuis = 0
            row_changed = pd.Series(False, index=chunk.index)

            for position in cui_positions:
                column_name = chunk.columns[position]
                values = chunk[column_name]
                # Non-empty and not already a 'C' CUI
                needs_prefix = (values != '') & ~values.str.startswith('C')
                chunk[column_name] = values.mask(needs_prefix, 'K' + values)
                row_changed |= needs_prefix
                modified_cuis += int(needs_prefix.sum())

            writer.writerows(chunk.itertuples(index=False, name=None))

            # Print a few examples of modified rows
            if examples_shown < max_examples:
                examples = chunk[row_changed].head(max_examples - examples_shown)
                for row in examples.itertuples(index=False, name=None):
                    print(f"Modified row: {list(row)}")
                    examples_shown += 1

            chunks_processed += 1
            print(f"Processed chunk {chunks_processed}, modified {modified_cuis} CUIs")

    print(f"Processing complete. Check {output_path}")

# Run the prefixing step when this code is executed as the main module.
if __name__ == "__main__":
    process_predication_csv()

In [9]:
# Load the prefixed concept table; the first row of the file is taken as the header.
concept_df = pd.read_csv("data/concept_prefixed.csv")

In [None]:
concept_df

In [11]:
# Write the concept table to the working directory with a header row, no index
# column, and every field quoted.
concept_df.to_csv("concept.csv", index=False, quoting=csv.QUOTE_ALL)

In [11]:
# Utility to add a column
import csv

def add_concept_column(input_file, output_file, label='Predication'):
    """Copy a CSV, appending one constant field to the end of every row.

    Every row of `input_file` (including a header row, if the file has one)
    gets `label` appended and is written to `output_file` with all fields
    quoted. Despite the function's name, the appended value defaults to
    'Predication'; pass `label` to append something else.

    I/O, CSV-parsing and decoding errors are reported on stdout rather than
    raised, so a failure may leave a partially written output file.

    Args:
        input_file: path of the CSV to read.
        output_file: path of the CSV to write (must differ from input_file).
        label: value appended as the last field of each row.
    """
    try:
        with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
             open(output_file, 'w', newline='', encoding='utf-8') as outfile:

            reader = csv.reader(infile)
            writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)

            # Stream row by row so large files are never held in memory
            writer.writerows(row + [label] for row in reader)

    except (OSError, csv.Error, UnicodeError) as e:
        print(f"An error occurred: {e}")

# Usage
# Reads data/predication.csv and writes the extended copy to predication.csv
# in the working directory.
add_concept_column('data/predication.csv', 'predication.csv')

In [10]:
# Utility script to restore quotes on a CSV
import csv

import pandas as pd

# Read every field as text with NA detection off, so values are written back
# exactly as they appear in the input (no int->float or empty->NaN changes).
# A dedicated name is used so the notebook-wide `df` is not overwritten.
requoted_df = pd.read_csv('data/concept.csv', dtype=str, keep_default_na=False)
requoted_df.to_csv('output.csv', quoting=csv.QUOTE_ALL, index=False)