In [1]:
import pandas as pd

In [2]:
# Column names applied to the two predication tables when they are loaded,
# listed in file order.
predication_headers = [
    # identifiers
    'PREDICATION_ID',
    'SENTENCE_ID',
    'PMID',
    'PREDICATE',
    # subject concept
    'SUBJECT_CUI',
    'SUBJECT_NAME',
    'SUBJECT_SEMTYPE',
    'SUBJECT_NOVELTY',
    # object concept
    'OBJECT_CUI',
    'OBJECT_NAME',
    'OBJECT_SEMTYPE',
    'OBJECT_NOVELTY',
    # remaining columns
    'FACT_VALUE_CHAR',
    'MOD_SCALE_CHAR',
    'MOD_VALUE_FLOAT',
]

predication_aux_headers = [
    # identifiers (PREDICATION_ID is the key shared with predication_headers)
    'PREDICATION_AUX_ID',
    'PREDICATION_ID',
    # subject mention
    'SUBJECT_TEXT',
    'SUBJECT_DIST',
    'SUBJECT_MAXDIST',
    'SUBJECT_START_INDEX',
    'SUBJECT_END_INDEX',
    'SUBJECT_SCORE',
    # predicate mention
    'INDICATOR_TYPE',
    'PREDICATE_START_INDEX',
    'PREDICATE_END_INDEX',
    # object mention
    'OBJECT_TEXT',
    'OBJECT_DIST',
    'OBJECT_MAXDIST',
    'OBJECT_START_INDEX',
    'OBJECT_END_INDEX',
    'OBJECT_SCORE',
    # remaining columns
    'CURR_TIMESTAMP',
]

# predication_dtype = {
#     'PREDICATION_ID': 'int32',
#     'SENTENCE_ID': 'int32',
#     'PMID': 'str',
#     'PREDICATE': 'str',
#     'SUBJECT_CUI': 'str',
#     'SUBJECT_NAME': 'str',
#     'SUBJECT_SEMTYPE': 'str',
#     'SUBJECT_NOVELTY': 'int8',
#     'OBJECT_CUI': 'str',
#     'OBJECT_NAME': 'str',
#     'OBJECT_SEMTYPE': 'str',
#     'OBJECT_NOVELTY': 'int8',
#     'FACT_VALUE_CHAR': 'str',
#     'MOD_SCALE_CHAR': 'str',
#     'MOD_VALUE_FLOAT': 'float32'
# }

# predication_aux_dtype = {
#     'PREDICATION_AUX_ID': 'int32',
#     'PREDICATION_ID': 'int32',
#     'SUBJECT_TEXT': 'str',
#     'SUBJECT_DIST': 'int32',
#     'SUBJECT_MAXDIST': 'int32',
#     'SUBJECT_START_INDEX': 'int32',
#     'SUBJECT_END_INDEX': 'int32',
#     'SUBJECT_SCORE': 'int32',
#     'INDICATOR_TYPE': 'str',
#     'PREDICATE_START_INDEX': 'int32',
#     'PREDICATE_END_INDEX': 'int32',
#     'OBJECT_TEXT': 'str',
#     'OBJECT_DIST': 'int32',
#     'OBJECT_MAXDIST': 'int32',
#     'OBJECT_START_INDEX': 'int32',
#     'OBJECT_END_INDEX': 'int32',
#     'OBJECT_SCORE': 'int32',
#     'CURR_TIMESTAMP': 'string'  # Assuming timestamp is read as string
# }

# Creating Nodes
The following code will create two CSVs for entity and predication nodes. 

In [None]:
# Both tables are loaded with pandas using the same parser settings; only the
# path and the list of column names differ. '\N' cells are read as missing
# values and malformed rows produce a warning.
csv_options = dict(encoding='ISO-8859-1', on_bad_lines='warn', na_values=['\\N'])

# Predication table
df = pd.read_csv('semmed_data/predication.csv', names=predication_headers, **csv_options)

# Auxiliary predication table
df_aux = pd.read_csv('semmed_data/predication_aux.csv', names=predication_aux_headers, **csv_options)

In [None]:
# Preview the first five rows (head() defaults to n=5)
df.head()

In [None]:
# Preview the first five rows (head() defaults to n=5)
df_aux.head()

In [None]:
# # Export both DataFrames to a single .pkl file
# dataframes = {'df': df, 'df_aux': df_aux}
# dd.to_pickle(dataframes, 'semmed_data/dataframes.pkl')

In [18]:
# # Merge with Dask
# merged_df = dd.merge(df, df_aux, on='PREDICATION_ID', how='inner', indicator=True)

In [None]:
# df.compute()

In [4]:
# Inner join: keep only predications that have a matching auxiliary row
merged_df = df.merge(df_aux, how='inner', on='PREDICATION_ID')

In [None]:
# Preview the first five rows of the joined table (head() defaults to n=5)
merged_df.head()

In [None]:
# List the column names of the joined table as an array
merged_df.columns.to_numpy()

In [7]:
# Columns that describe a predication instance; the concept-level attributes
# are split off into a separate frame.
predication_columns = [
    'PREDICATION_ID',
    'SENTENCE_ID',
    'PMID',
    'PREDICATE',
    'SUBJECT_CUI',
    'OBJECT_CUI',
    'INDICATOR_TYPE',
    'PREDICATE_START_INDEX',
    'PREDICATE_END_INDEX',
]

# Keep one row per distinct combination of the selected columns
predication_df = merged_df.loc[:, predication_columns].drop_duplicates()

In [None]:
# Role-neutral column names shared by subject and object entities
concept_columns = ['CUI', 'NAME', 'SEMTYPE', 'NOVELTY', 'TEXT',
                   'DIST', 'MAXDIST', 'START_INDEX', 'END_INDEX', 'SCORE']

# Subject-side columns, listed in the same order as concept_columns
subject_columns = ['SUBJECT_CUI', 'SUBJECT_NAME', 'SUBJECT_SEMTYPE', 'SUBJECT_NOVELTY',
                   'SUBJECT_TEXT', 'SUBJECT_DIST', 'SUBJECT_MAXDIST',
                   'SUBJECT_START_INDEX', 'SUBJECT_END_INDEX', 'SUBJECT_SCORE']

# Distinct subject rows, relabelled position-by-position to the neutral names
subject_entities = (
    merged_df[subject_columns]
    .drop_duplicates()
    .rename(columns=dict(zip(subject_columns, concept_columns)))
)

In [11]:
# Extract object entities using the same structure as the subject entities.
# The first column must be 'OBJECT_CUI' (the name the merged frame actually
# carries); the Neo4j-style header 'object_cui:STRING' only exists in the
# exported CSV and selecting it here raises a KeyError.
object_columns = ['OBJECT_CUI', 'OBJECT_NAME', 'OBJECT_SEMTYPE', 'OBJECT_NOVELTY',
                 'OBJECT_TEXT', 'OBJECT_DIST', 'OBJECT_MAXDIST', 
                 'OBJECT_START_INDEX', 'OBJECT_END_INDEX', 'OBJECT_SCORE']

object_entities = merged_df[object_columns].drop_duplicates()
# Relabel to the role-neutral names so both frames can be stacked
object_entities.columns = concept_columns

# Combine subject and object entities and keep one row per CUI.
# drop_duplicates keeps the first occurrence, so when a CUI appears in both
# roles the subject-side row is the one retained.
concept_df = pd.concat([subject_entities, object_entities]).drop_duplicates(subset=['CUI'])

In [None]:
# Report (rows, columns) for both node tables
for frame_label, frame in (("Predication", predication_df), ("Entity", concept_df)):
    print(f"{frame_label} dataframe shape: {frame.shape}")

In [13]:
# The row index is written as the first column; later cells read this file
# back with index_col=0.
predication_df.to_csv("predication.csv", index=True)

In [14]:
# The row index is written as the first column; later cells read this file
# back with index_col=0.
concept_df.to_csv("concept.csv", index=True)

# Creating Relationships
The following code will create a CSV with all the connections between the concepts and predications (plus the direct subject-to-object links labelled by predicate) in a format that is easily digestible by Neo4j.

In [None]:
# Reload the predication table from disk; index_col=0 consumes the index
# column that to_csv wrote as the first field.
predication_df = pd.read_csv("predication.csv", index_col=0)

In [4]:
# Schema of the relationship table: source node, destination node, edge label
connections_columns = [
    'src_node',
    'dest_node',
    'label',
]

# Empty placeholder with that schema
connections_df = pd.DataFrame(data=None, columns=connections_columns)

In [5]:
# Each relationship table below has the columns src_node, dest_node, label and
# keeps the row index of predication_df.

# 1. predication instance -> subject concept, labelled 'inst_subject'
inst_subject_connections = (
    predication_df[['PREDICATION_ID', 'SUBJECT_CUI']]
    .rename(columns={'PREDICATION_ID': 'src_node', 'SUBJECT_CUI': 'dest_node'})
    .assign(label='inst_subject')
)

# 2. predication instance -> object concept, labelled 'inst_object'
inst_object_connections = (
    predication_df[['PREDICATION_ID', 'OBJECT_CUI']]
    .rename(columns={'PREDICATION_ID': 'src_node', 'OBJECT_CUI': 'dest_node'})
    .assign(label='inst_object')
)

# 3. subject concept -> object concept, labelled with the row's PREDICATE
subject_object_connections = (
    predication_df[['SUBJECT_CUI', 'OBJECT_CUI', 'PREDICATE']]
    .rename(columns={'SUBJECT_CUI': 'src_node',
                     'OBJECT_CUI': 'dest_node',
                     'PREDICATE': 'label'})
)

In [6]:
# Stack the three relationship tables (row indices are not renumbered here)
connection_parts = [
    inst_subject_connections,
    inst_object_connections,
    subject_object_connections,
]
connections_df = pd.concat(connection_parts)

In [None]:
# Replace the repeated per-table indices with a fresh 0..n-1 range
connections_df = connections_df.reset_index(drop=True)

# Show the size and the first ten relationships
connections_shape = connections_df.shape
print(f"Connections dataframe shape: {connections_shape}")
connections_preview = connections_df.head(10)
print(connections_preview)

In [8]:
# The row index is written as the first column; later cells read this file
# back with index_col=0.
connections_df.to_csv("connections.csv", index=True)

# Node Structure
The Neo4j import requires the data types to be declared in the CSV header, so here I am re-labeling the CSV columns.

In [None]:
# Each intermediate CSV is read back (index_col=0 consumes the saved index
# column), its columns are relabelled position-by-position with Neo4j
# "name:TYPE" headers, and the result is written without an index column.

# Entity/concept nodes
concept_df = pd.read_csv("concept.csv", index_col=0).set_axis(
    [
        "cui:ID",
        "name:STRING",
        "semtype:LABEL",
        "novelty:FLOAT",
        "text:STRING",
        "dist:INTEGER",
        "maxdist:INTEGER",
        "start_index:INTEGER",
        "end_index:INTEGER",
        "score:INTEGER",
    ],
    axis="columns",
)
concept_df.to_csv("entity_neo4j.csv", index=False)

# Predication nodes - the predicate column is exported as a LABEL
predication_df = pd.read_csv("predication.csv", index_col=0).set_axis(
    [
        "predication_id:ID",
        "sentence_id:INTEGER",
        "pmid:STRING",
        "predicate:LABEL",  # LABEL rather than STRING
        "subject_cui:STRING",
        "object_cui:STRING",
        "indicator_type:STRING",
        "predicate_start_index:INTEGER",
        "predicate_end_index:INTEGER",
    ],
    axis="columns",
)
predication_df.to_csv("predication_neo4j.csv", index=False)

# Relationships
connections_df = pd.read_csv("connections.csv", index_col=0).set_axis(
    [":START_ID", ":END_ID", ":TYPE"],
    axis="columns",
)
connections_df.to_csv("relationships_neo4j.csv", index=False)

# Verification and Validation

In [None]:
# Load the Neo4j-formatted CSVs written in the "Node Structure" step. The
# statistics below address columns by their Neo4j header names (':TYPE',
# ':START_ID', 'predicate:LABEL', ...), which exist only in these files; the
# intermediate concept.csv / predication.csv / connections.csv still carry the
# original column names plus a saved index column. These files were written
# with index=False, so there is no index column to skip.
concept_df = pd.read_csv("entity_neo4j.csv")
predication_df = pd.read_csv("predication_neo4j.csv")
connections_df = pd.read_csv("relationships_neo4j.csv")

In [None]:
# Summary of the generated graph: node and relationship counts, predicate
# frequencies, per-column coverage, density, and per-node connection counts.
print("\n=== GRAPH STATISTICS ===")

# --- Nodes -----------------------------------------------------------------
n_concepts = len(concept_df)
n_predications = len(predication_df)
print("\nNODE COUNTS:")
print(f"Concept/Entity nodes: {n_concepts}")
print(f"Predication nodes: {n_predications}")
print(f"Total nodes: {n_concepts + n_predications}")

# --- Relationships -----------------------------------------------------------
print("\nRELATIONSHIP COUNTS:")
relationship_counts = connections_df[':TYPE'].value_counts()
print("Top 10 relationship types:")
print(relationship_counts.head(10))
print(f"Total relationships: {len(connections_df)}")

# --- Predicates --------------------------------------------------------------
predicate_labels = predication_df['predicate:LABEL']
print("\nUNIQUE PREDICATES:")
unique_predicates = predicate_labels.nunique()
print(f"Number of unique predicates: {unique_predicates}")
print("Most common predicates:")
print(predicate_labels.value_counts().head(10))

# --- Column coverage ---------------------------------------------------------
# The property name is the header text before the ':' type suffix.
print("\nPROPERTY STATISTICS:")
coverage_sections = (
    ("Entity properties:", concept_df),
    ("\nPredication properties:", predication_df),
)
for heading, frame in coverage_sections:
    print(heading)
    row_total = len(frame)
    for col in frame.columns:
        prop_name = col.split(':')[0]
        non_null = frame[col].count()
        print(f"  - {prop_name}: {non_null} non-null values ({non_null/row_total:.2%} coverage)")

# --- Density -----------------------------------------------------------------
print("\nGRAPH DENSITY ANALYSIS:")
num_nodes = n_concepts + n_predications
num_edges = len(connections_df)
# Denominator is the number of unordered node pairs (undirected graph)
max_possible_edges = num_nodes * (num_nodes - 1) / 2
graph_density = num_edges / max_possible_edges
print(f"Graph density: {graph_density:.8f}")

# --- Connections per node ----------------------------------------------------
print("\nCONNECTION DISTRIBUTION:")
src_connections = connections_df[':START_ID'].value_counts()
dest_connections = connections_df[':END_ID'].value_counts()

for side, per_node in (("Source", src_connections), ("Destination", dest_connections)):
    print(f"{side} node connection statistics:")
    print(f"  - Mean connections per node: {per_node.mean():.2f}")
    print(f"  - Median connections per node: {per_node.median():.2f}")
    print(f"  - Max connections: {per_node.max()}")

In [1]:
import pandas as pd
import numpy as np

def _ratio(part, whole):
    """Return part / whole, or 0.0 when whole is zero (empty input)."""
    return part / whole if whole else 0.0


def _require_columns(frame, required, source):
    """Raise KeyError naming the file when any required column is absent."""
    missing = [col for col in required if col not in frame.columns]
    if missing:
        raise KeyError(
            f"{source} is missing required column(s) {missing}; the validation "
            "expects the Neo4j-style headers written by the 'Node Structure' step"
        )


def _column_matches_type(series, expected_type):
    """Return True when the column's dtype fits expected_type or can be cast to it."""
    actual_type = series.dtype
    if pd.api.types.is_numeric_dtype(actual_type) and expected_type in (int, float):
        return True
    if pd.api.types.is_string_dtype(actual_type) and expected_type is str:
        return True
    # Any other dtype is accepted only if a cast to the expected type succeeds.
    # Only conversion errors are caught, so unrelated failures still surface.
    try:
        series.astype(expected_type)
    except (ValueError, TypeError, OverflowError):
        return False
    return True


def _report_column_types(frame, expected_dtypes):
    """Print one pass/fail line per expected column that is present in frame."""
    for col, expected_type in expected_dtypes.items():
        if col not in frame.columns:
            continue
        column = frame[col]
        is_valid = _column_matches_type(column, expected_type)
        print(f"  - {col}: Expected {expected_type.__name__}, Got {column.dtype} - {'✓' if is_valid else '✗'}")

        if not is_valid:
            # Show sample of problematic values
            print(f"    Sample values: {column.head(3).tolist()}")

            # For integer columns, point at the values that are not plain integers
            if expected_type == int:
                non_int_mask = ~column.astype(str).str.match(r'^-?\d+$', na=False)
                print(f"    Problematic non-integer values: {frame.loc[non_int_mask, col].head(3).tolist()}")


def validate_semmed_data(concept_path="entity_neo4j.csv",
                         predication_path="predication_neo4j.csv",
                         connections_path="relationships_neo4j.csv"):
    """
    Validate the SemMedDB processed data for Neo4j import:
    1. Check data types for all columns
    2. Verify node uniqueness (no duplicates)
    3. Verify relationship integrity (start/end nodes exist)
    4. Check for missing values in key fields

    The checks address columns by their Neo4j header names ('cui:ID',
    'predication_id:ID', ':START_ID', ...), so the defaults point at the
    relabelled files written by the "Node Structure" step rather than the
    intermediate concept.csv / predication.csv / connections.csv.

    Args:
        concept_path: CSV of concept nodes; must contain 'cui:ID'.
        predication_path: CSV of predication nodes; must contain
            'predication_id:ID', 'subject_cui:STRING' and 'object_cui:STRING'.
        connections_path: CSV of relationships; must contain ':START_ID',
            ':END_ID' and ':TYPE'.

    Returns:
        None. The report is printed to stdout.

    Raises:
        KeyError: if a file lacks one of the required columns listed above.
    """
    print("Starting SemMedDB data validation...\n")

    # Load the three dataframes with low_memory=False to avoid dtype warnings
    print("Loading data...")
    concept_df = pd.read_csv(concept_path, low_memory=False)
    predication_df = pd.read_csv(predication_path, low_memory=False)
    connections_df = pd.read_csv(connections_path, low_memory=False)

    # Fail early with a clear message instead of a bare KeyError deep in the report
    _require_columns(concept_df, ['cui:ID'], concept_path)
    _require_columns(predication_df,
                     ['predication_id:ID', 'subject_cui:STRING', 'object_cui:STRING'],
                     predication_path)
    _require_columns(connections_df, [':START_ID', ':END_ID', ':TYPE'], connections_path)

    # ===== 1. DATA TYPES VALIDATION =====
    print("\n===== DATA TYPES VALIDATION =====")

    # Expected data types for concept_df
    concept_dtypes = {
        'cui:ID': str,
        'name:STRING': str,
        'semtype:LABEL': str,
        'novelty:FLOAT': float,
        'text:STRING': str,
        'dist:INTEGER': int,
        'maxdist:INTEGER': int,
        'start_index:INTEGER': int,
        'end_index:INTEGER': int,
        'score:INTEGER': int
    }

    # Expected data types for predication_df
    predication_dtypes = {
        'predication_id:ID': int,
        'sentence_id:INTEGER': int,
        'pmid:STRING': str,
        'predicate:LABEL': str,
        'subject_cui:STRING': str,
        'object_cui:STRING': str,
        'indicator_type:STRING': str,
        'predicate_start_index:INTEGER': int,
        'predicate_end_index:INTEGER': int
    }

    # Expected data types for connections_df
    connections_dtypes = {
        ':START_ID': str,
        ':END_ID': str,
        ':TYPE': str
    }

    print("\nChecking concept_df data types:")
    _report_column_types(concept_df, concept_dtypes)

    print("\nChecking predication_df data types:")
    _report_column_types(predication_df, predication_dtypes)

    print("\nChecking connections_df data types:")
    id_columns = (':START_ID', ':END_ID')
    for col, expected_type in connections_dtypes.items():
        if col not in connections_df.columns:
            continue
        column = connections_df[col]
        actual_type = column.dtype
        is_id_column = col in id_columns

        if is_id_column and not pd.api.types.is_string_dtype(actual_type):
            # Non-string ID columns are acceptable if they can be cast to string
            is_valid = _column_matches_type(column, str)
            if is_valid:
                print(f"    Note: {col} will need conversion to string type")
        else:
            is_valid = _column_matches_type(column, expected_type)

        print(f"  - {col}: Expected {expected_type.__name__}, Got {actual_type} - {'✓' if is_valid else '✗'}")

        # Show sample of values to verify
        print(f"    Sample values: {column.head(3).tolist()}")

        # For ID columns, analyze value types
        if is_id_column:
            # Missing values are excluded first: NaN is a float and would
            # otherwise be counted as a numeric ID.
            present = column.dropna()
            num_values = len(present)
            num_numeric = int(present.apply(
                lambda x: isinstance(x, (int, float)) or (isinstance(x, str) and x.isdigit())
            ).sum())
            num_string = num_values - num_numeric

            print(f"    Value type breakdown: {num_numeric} numeric ({_ratio(num_numeric, num_values):.2%}), "
                  f"{num_string} non-numeric ({_ratio(num_string, num_values):.2%})")

    # ===== 2. MISSING VALUES VALIDATION =====
    print("\n===== MISSING VALUES VALIDATION =====")

    n_concepts = len(concept_df)
    n_predications = len(predication_df)
    n_connections = len(connections_df)

    # Check for missing values in key fields
    print("\nConcept DataFrame Missing Values in Key Fields:")
    missing_concept = concept_df['cui:ID'].isnull().sum()
    print(f"  - Primary key 'cui:ID': {missing_concept} missing values ({_ratio(missing_concept, n_concepts):.2%})")

    print("\nPredication DataFrame Missing Values in Key Fields:")
    missing_pred_id = predication_df['predication_id:ID'].isnull().sum()
    missing_subject = predication_df['subject_cui:STRING'].isnull().sum()
    missing_object = predication_df['object_cui:STRING'].isnull().sum()
    print(f"  - Primary key 'predication_id:ID': {missing_pred_id} missing values ({_ratio(missing_pred_id, n_predications):.2%})")
    print(f"  - Foreign key 'subject_cui:STRING': {missing_subject} missing values ({_ratio(missing_subject, n_predications):.2%})")
    print(f"  - Foreign key 'object_cui:STRING': {missing_object} missing values ({_ratio(missing_object, n_predications):.2%})")

    print("\nConnections DataFrame Missing Values in Key Fields:")
    missing_src = connections_df[':START_ID'].isnull().sum()
    missing_dest = connections_df[':END_ID'].isnull().sum()
    missing_label = connections_df[':TYPE'].isnull().sum()
    print(f"  - ':START_ID': {missing_src} missing values ({_ratio(missing_src, n_connections):.2%})")
    print(f"  - ':END_ID': {missing_dest} missing values ({_ratio(missing_dest, n_connections):.2%})")
    print(f"  - ':TYPE': {missing_label} missing values ({_ratio(missing_label, n_connections):.2%})")

    # ===== 3. NODE UNIQUENESS VALIDATION =====
    print("\n===== NODE UNIQUENESS VALIDATION =====")

    # Check for duplicate CUIs in concept_df
    duplicate_cuis = concept_df['cui:ID'].duplicated().sum()
    print(f"Duplicate CUIs in concept_df: {duplicate_cuis} ({_ratio(duplicate_cuis, n_concepts):.2%})")

    # Check for duplicate PREDICATION_IDs in predication_df
    duplicate_preds = predication_df['predication_id:ID'].duplicated().sum()
    print(f"Duplicate PREDICATION_IDs in predication_df: {duplicate_preds} ({_ratio(duplicate_preds, n_predications):.2%})")

    # ===== 4. RELATIONSHIP INTEGRITY VALIDATION =====
    print("\n===== RELATIONSHIP INTEGRITY VALIDATION =====")

    # Get distinct node IDs from the respective dataframes
    concept_ids = set(concept_df['cui:ID'].dropna().astype(str))
    predication_ids = set(predication_df['predication_id:ID'].dropna().astype(str))

    # All valid node IDs (combined)
    all_valid_nodes = concept_ids.union(predication_ids)

    # Compare IDs as strings. Missing values are dropped BEFORE the cast:
    # astype(str) turns NaN into the literal 'nan', which would then be
    # reported as an invalid node although it is already counted as missing.
    src_nodes = set(connections_df[':START_ID'].dropna().astype(str))
    invalid_src_nodes = src_nodes - all_valid_nodes
    invalid_src_count = len(invalid_src_nodes)

    print(f"\nInvalid source nodes in relationships: {invalid_src_count} ({_ratio(invalid_src_count, len(src_nodes)):.2%})")
    if invalid_src_count > 0:
        # Show at most ten examples, sorted so the output is reproducible
        print(f"  Sample of invalid source nodes: {sorted(invalid_src_nodes)[:10]}")

    dest_nodes = set(connections_df[':END_ID'].dropna().astype(str))
    invalid_dest_nodes = dest_nodes - all_valid_nodes
    invalid_dest_count = len(invalid_dest_nodes)

    print(f"\nInvalid destination nodes in relationships: {invalid_dest_count} ({_ratio(invalid_dest_count, len(dest_nodes)):.2%})")
    if invalid_dest_count > 0:
        print(f"  Sample of invalid destination nodes: {sorted(invalid_dest_nodes)[:10]}")

    # ===== 5. SUMMARY =====
    print("\n===== VALIDATION SUMMARY =====")

    # Count total issues
    total_issues = (missing_concept + missing_pred_id + missing_subject + missing_object +
                    missing_src + missing_dest + missing_label +
                    duplicate_cuis + duplicate_preds +
                    invalid_src_count + invalid_dest_count)

    if total_issues == 0:
        print("✅ All validations passed! Data appears clean and ready for Neo4j import.")
    else:
        print(f"❌ Found {total_issues} total issues that may affect your Neo4j import.")

        # Recommend fixes based on issues found
        print("\nRecommended fixes:")

        if missing_concept + missing_pred_id > 0:
            print("  - Remove rows with missing primary keys (cui:ID or predication_id:ID)")

        if missing_subject + missing_object > 0:
            print("  - Fix or remove predications with missing subject_cui:STRING or object_cui:STRING")

        if missing_src + missing_dest + missing_label > 0:
            print("  - Remove relationships with missing :START_ID, :END_ID, or :TYPE")

        if duplicate_cuis > 0:
            print("  - Remove duplicate CUIs or merge their properties")

        if duplicate_preds > 0:
            print("  - Remove duplicate predication_id:IDs")

        if invalid_src_count + invalid_dest_count > 0:
            print("  - Remove relationships with invalid source or destination nodes")

        # Check if START_ID and END_ID need type conversion
        if not pd.api.types.is_string_dtype(connections_df[':START_ID'].dtype) or \
           not pd.api.types.is_string_dtype(connections_df[':END_ID'].dtype):
            print("  - Convert ':START_ID' and ':END_ID' columns to string type (see fix_connections_dtypes function)")

def fix_connections_dtypes(file_path="connections.csv", output_path="connections_fixed.csv"):
    """
    Fix data types in the connections dataframe:
    - Convert :START_ID and :END_ID to string type
    - Save the fixed version

    Missing IDs stay missing: a plain astype(str) would turn them into the
    literal text 'nan', which would then be written to the output file as if
    it were a node ID.

    Args:
        file_path: CSV to read. It must contain ':START_ID' and ':END_ID'
            columns, i.e. the Neo4j-style relationship headers.
        output_path: Where the corrected CSV is written (no index column).

    Returns:
        The corrected DataFrame.

    Raises:
        KeyError: if ':START_ID' or ':END_ID' is not a column of the file.
    """
    # Load the connections CSV
    connections_df = pd.read_csv(file_path, low_memory=False)

    # Cast the ID columns to string, masking the cast result back to NaN
    # wherever the original value was missing
    for id_column in (':START_ID', ':END_ID'):
        ids = connections_df[id_column]
        connections_df[id_column] = ids.astype(str).where(ids.notna())

    # Save the fixed version
    connections_df.to_csv(output_path, index=False)
    print(f"Fixed connections data saved to '{output_path}'")
    return connections_df

# Run the validation; the report is printed to stdout.
validate_semmed_data()

Starting SemMedDB data validation...

Loading data...

===== DATA TYPES VALIDATION =====

Checking concept_df data types:
  - cui:ID: Expected str, Got object - ✓
  - name:STRING: Expected str, Got object - ✓
  - semtype:LABEL: Expected str, Got object - ✓
  - novelty:FLOAT: Expected float, Got float64 - ✓
  - text:STRING: Expected str, Got object - ✓
  - dist:INTEGER: Expected int, Got int64 - ✓
  - maxdist:INTEGER: Expected int, Got int64 - ✓
  - start_index:INTEGER: Expected int, Got int64 - ✓
  - end_index:INTEGER: Expected int, Got int64 - ✓
  - score:INTEGER: Expected int, Got int64 - ✓

Checking predication_df data types:
  - predication_id:ID: Expected int, Got int64 - ✓
  - sentence_id:INTEGER: Expected int, Got int64 - ✓
  - pmid:STRING: Expected str, Got int64 - ✓
  - predicate:LABEL: Expected str, Got object - ✓
  - subject_cui:STRING: Expected str, Got object - ✓
  - object_cui:STRING: Expected str, Got object - ✓
  - indicator_type:STRING: Expected str, Got object - ✓
  -