In [2]:
import pandas as pd

In [2]:
# Column names applied to predication.csv and predication_aux.csv when they
# are read (passed to `read_csv` through `names=`), listed in file order.
predication_headers = """
    PREDICATION_ID SENTENCE_ID PMID PREDICATE
    SUBJECT_CUI SUBJECT_NAME SUBJECT_SEMTYPE SUBJECT_NOVELTY
    OBJECT_CUI OBJECT_NAME OBJECT_SEMTYPE OBJECT_NOVELTY
    FACT_VALUE_CHAR MOD_SCALE_CHAR MOD_VALUE_FLOAT
""".split()

predication_aux_headers = """
    PREDICATION_AUX_ID PREDICATION_ID
    SUBJECT_TEXT SUBJECT_DIST SUBJECT_MAXDIST
    SUBJECT_START_INDEX SUBJECT_END_INDEX SUBJECT_SCORE
    INDICATOR_TYPE PREDICATE_START_INDEX PREDICATE_END_INDEX
    OBJECT_TEXT OBJECT_DIST OBJECT_MAXDIST
    OBJECT_START_INDEX OBJECT_END_INDEX OBJECT_SCORE
    CURR_TIMESTAMP
""".split()

# predication_dtype = {
#     'PREDICATION_ID': 'int32',
#     'SENTENCE_ID': 'int32',
#     'PMID': 'str',
#     'PREDICATE': 'str',
#     'SUBJECT_CUI': 'str',
#     'SUBJECT_NAME': 'str',
#     'SUBJECT_SEMTYPE': 'str',
#     'SUBJECT_NOVELTY': 'int8',
#     'OBJECT_CUI': 'str',
#     'OBJECT_NAME': 'str',
#     'OBJECT_SEMTYPE': 'str',
#     'OBJECT_NOVELTY': 'int8',
#     'FACT_VALUE_CHAR': 'str',
#     'MOD_SCALE_CHAR': 'str',
#     'MOD_VALUE_FLOAT': 'float32'
# }

# predication_aux_dtype = {
#     'PREDICATION_AUX_ID': 'int32',
#     'PREDICATION_ID': 'int32',
#     'SUBJECT_TEXT': 'str',
#     'SUBJECT_DIST': 'int32',
#     'SUBJECT_MAXDIST': 'int32',
#     'SUBJECT_START_INDEX': 'int32',
#     'SUBJECT_END_INDEX': 'int32',
#     'SUBJECT_SCORE': 'int32',
#     'INDICATOR_TYPE': 'str',
#     'PREDICATE_START_INDEX': 'int32',
#     'PREDICATE_END_INDEX': 'int32',
#     'OBJECT_TEXT': 'str',
#     'OBJECT_DIST': 'int32',
#     'OBJECT_MAXDIST': 'int32',
#     'OBJECT_START_INDEX': 'int32',
#     'OBJECT_END_INDEX': 'int32',
#     'OBJECT_SCORE': 'int32',
#     'CURR_TIMESTAMP': 'string'  # Assuming timestamp is read as string
# }

# Creating Nodes
The following code creates two CSVs: one for predication nodes (`predication.csv`) and one for concept (entity) nodes (`concept.csv`).

In [None]:
# Identifier and text columns are pinned to `str` instead of being left to type
# inference. pandas parses large files in chunks and may infer a different type
# for the same column in different chunks (the situation DtypeWarning reports).
# CUI values are not all "C…"-style (e.g. `C0056207|3075`), so numeric-looking
# IDs could be parsed as numbers and then fail to match their string form when
# the concepts are later de-duplicated on CUI.
predication_dtype = {
    'PREDICATE': str,
    'SUBJECT_CUI': str, 'SUBJECT_NAME': str, 'SUBJECT_SEMTYPE': str,
    'OBJECT_CUI': str, 'OBJECT_NAME': str, 'OBJECT_SEMTYPE': str,
}
predication_aux_dtype = {
    'SUBJECT_TEXT': str, 'INDICATOR_TYPE': str, 'OBJECT_TEXT': str,
}

# Read predication.csv with pandas. Column names come from `names`, the literal
# `\N` is treated as a missing value, and malformed lines are skipped with a
# warning rather than aborting the load.
df = pd.read_csv(
    'semmed_data/predication.csv',
    names=predication_headers,
    dtype=predication_dtype,
    encoding='ISO-8859-1',
    on_bad_lines='warn',
    na_values=['\\N'],
)

# Read predication_aux.csv with pandas, using the same parsing options.
df_aux = pd.read_csv(
    'semmed_data/predication_aux.csv',
    names=predication_aux_headers,
    dtype=predication_aux_dtype,
    encoding='ISO-8859-1',
    on_bad_lines='warn',
    na_values=['\\N'],
)

In [None]:
# Preview the first five rows loaded from predication.csv.
df.head()

In [None]:
# Preview the first five rows loaded from predication_aux.csv.
df_aux.head()

In [None]:
# # Export both DataFrames to a single .pkl file
# dataframes = {'df': df, 'df_aux': df_aux}
# dd.to_pickle(dataframes, 'semmed_data/dataframes.pkl')

In [18]:
# # Merge with Dask
# merged_df = dd.merge(df, df_aux, on='PREDICATION_ID', how='inner', indicator=True)

In [None]:
# df.compute()

In [4]:
# Inner-join the predication rows with their auxiliary rows on PREDICATION_ID;
# rows without a match on the other side are dropped.
merged_df = df.merge(df_aux, how='inner', on='PREDICATION_ID')

In [None]:
# Preview the first five rows of the joined table.
merged_df.head()

In [None]:
# List the column names of the joined table as a NumPy array.
merged_df.columns.to_numpy()

In [7]:
# Columns that describe a predication node.
predication_columns = [
    'PREDICATION_ID',
    'SENTENCE_ID',
    'PMID',
    'PREDICATE',
    'SUBJECT_CUI',
    'OBJECT_CUI',
    'INDICATOR_TYPE',
    'PREDICATE_START_INDEX',
    'PREDICATE_END_INDEX',
]

# Keep only those columns and drop rows that are identical across all of them.
predication_df = merged_df.loc[:, predication_columns].drop_duplicates()

In [None]:
# Shared, prefix-free column names used for both subject and object entities.
concept_columns = [
    'CUI', 'NAME', 'SEMTYPE', 'NOVELTY', 'TEXT',
    'DIST', 'MAXDIST', 'START_INDEX', 'END_INDEX', 'SCORE',
]

# The subject columns are the same names carrying a SUBJECT_ prefix.
subject_columns = ['SUBJECT_' + suffix for suffix in concept_columns]

# Pull the distinct subject rows, then strip the prefix so they can be
# stacked with the object entities.
subject_entities = merged_df.loc[:, subject_columns].drop_duplicates()
subject_entities.columns = concept_columns

In [11]:
# The object columns mirror the subject ones, carrying an OBJECT_ prefix.
object_columns = ['OBJECT_' + suffix for suffix in concept_columns]

# Pull the distinct object rows and give them the shared column names.
object_entities = merged_df.loc[:, object_columns].drop_duplicates()
object_entities.columns = concept_columns

# Stack subjects on top of objects and keep the first row seen for each CUI.
concept_df = (
    pd.concat([subject_entities, object_entities])
    .drop_duplicates(subset=['CUI'])
)

In [None]:
# Report (rows, columns) for the two node tables.
predication_shape = predication_df.shape
concept_shape = concept_df.shape
print(f"Predication dataframe shape: {predication_shape}")
print(f"Entity dataframe shape: {concept_shape}")

In [13]:
# Write the predication nodes to disk. The row index is written as the first
# column; the later reload reads it back with `index_col=0`.
predication_df.to_csv(path_or_buf="predication.csv")

In [14]:
# Write the concept nodes to disk; the row index is written as the first column.
concept_df.to_csv(path_or_buf="concept.csv")

# Creating Relationships
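The following code creates `connections.csv`, which lists every edge between predication instances and their subject/object concepts, plus direct subject-to-object edges labelled with the predicate, in a format that is easily digestible by Neo4j.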

In [3]:
# Reload the predication nodes from predication.csv; column 0 holds the saved
# row index. The label/identifier columns used to build edges are pinned to
# `str` so pandas' chunked type inference cannot leave a column with mixed
# types (the situation DtypeWarning reports). CUI values are not all
# "C…"-style, e.g. `C0056207|3075`.
predication_df = pd.read_csv(
    "predication.csv",
    index_col=0,
    dtype={'PREDICATE': str, 'SUBJECT_CUI': str, 'OBJECT_CUI': str},
)

  predication_df = pd.read_csv("predication.csv", index_col=0)


In [4]:
# Edge-table layout: source node, destination node, relationship label.
connections_columns = 'src_node dest_node label'.split()

# Empty frame with that layout.
connections_df = pd.DataFrame(data=None, columns=connections_columns)

In [5]:
def _edges(src_col, dest_col, label):
    """Build an edge table from two columns of `predication_df`.

    `src_col` and `dest_col` name the columns used as `src_node` and
    `dest_node`. `label` fills the `label` column: a plain string is repeated
    on every row, while a Series is used row by row.
    """
    return pd.DataFrame({
        'src_node': predication_df[src_col],
        'dest_node': predication_df[dest_col],
        'label': label,
    })


# 1. Predication instance -> its subject concept (inst_subject)
inst_subject_connections = _edges('PREDICATION_ID', 'SUBJECT_CUI', 'inst_subject')

# 2. Predication instance -> its object concept (inst_object)
inst_object_connections = _edges('PREDICATION_ID', 'OBJECT_CUI', 'inst_object')

# 3. Subject concept -> object concept, labelled with the row's PREDICATE
subject_object_connections = _edges(
    'SUBJECT_CUI', 'OBJECT_CUI', predication_df['PREDICATE']
)

In [6]:
# Stack the three edge tables into one; each keeps its original row index here.
edge_tables = [
    inst_subject_connections,
    inst_object_connections,
    subject_object_connections,
]
connections_df = pd.concat(edge_tables)

In [7]:
# Replace the repeated per-table indices with one fresh 0..n-1 index.
connections_df = connections_df.reset_index(drop=True)

# Show the overall size and the first ten edges.
connections_shape = connections_df.shape
connections_preview = connections_df.head(10)
print(f"Connections dataframe shape: {connections_shape}")
print(connections_preview)

Connections dataframe shape: (391440519, 3)
   src_node      dest_node         label
0  10592604       C0003725  inst_subject
1  10592697       C0039258  inst_subject
2  10592728       C0318627  inst_subject
3  10592759       C0446169  inst_subject
4  10592832       C0012634  inst_subject
5  10592873       C0042776  inst_subject
6  10593057       C0999630  inst_subject
7  10593208       C0242210  inst_subject
8  10593243  C0056207|3075  inst_subject
9  10593287       C0242210  inst_subject


In [8]:
# Write the edge list to disk; the row index is written as the first column.
connections_df.to_csv(path_or_buf="connections.csv")