# Total number of people with disturbs

In [None]:
SELECT  (COUNT (DISTINCT ?person) as ?peopleCount) 
WHERE { 
  ?person wdt:P31 wd:Q5. 
  ?person wdt:P1050 ?disorder.
  ?disorder wdt:P279+ wd:Q12135.
}  

Result is 2,387

# Total number of people with disturbs for which an occupation is specified

In [None]:
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT  (COUNT (DISTINCT ?person) as ?peopleCount) 
WHERE { 
  ?person wdt:P31 wd:Q5. 
  ?person wdt:P106 ?occupation.
  ?person wdt:P1050 ?disorder.
  ?disorder wdt:P279+ wd:Q12135.
}  

Result is 2,268

# Creatives

In [None]:
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?creativePerson ?occupationLabel ?disorderLabel 
WHERE {
  ?creativePerson wdt:P31 wd:Q5.
  ?creativePerson wdt:P106 ?occupation.
  ?occupation rdfs:label ?occupationLabel.
  {VALUES ?occupation { wd:Q205375 wd:Q3492227 wd:Q482980 wd:Q483501  }} 
  UNION {?occupation wdt:P279+ wd:Q205375}  
  UNION {?occupation wdt:P279+ wd:Q3492227} 
  UNION {?occupation wdt:P279+ wd:Q482980} 
  UNION {?occupation wdt:P279+ wd:Q483501}
  
  MINUS {?occupation wdt:P279+ wd:Q15980158} # non-fiction writer
  MINUS {?occupation wdt:P279+ wd:Q108289407} # authors, journalists, linguists
  MINUS {?occupation wdt:P279+ wd:Q1607826} # editor
  MINUS {?occupation wdt:P279+ wd:Q2675537} # music teacher
  MINUS {?occupation wdt:P279+ wd:Q4610556} # model
  MINUS {?occupation wdt:P279+ wd:Q109459317} # content creator
  MINUS {?occupation wdt:P279+ wd:Q138858} # entertainer
  MINUS {?occupation wdt:P279+ wd:Q13474373} # professional wrestler
  MINUS {?occupation wdt:P279+ wd:Q2066131} # athlete
  MINUS {?occupation wdt:P279+ wd:Q947873} # television presenter
  MINUS {?occupation wdt:P279+ wd:Q2405480} # voice actor
  MINUS {?occupation wdt:P279+ wd:Q852857} # sex worker
  MINUS {?occupation wdt:P279+ wd:Q55187} # hairdresser
  MINUS {?occupation wdt:P279+ wd:Q1294787} # artisan
  MINUS {?occupation wdt:P279+ wd:Q13235160} # manufacturer
  MINUS {?occupation wdt:P279+ wd:Q755070} # narrator
  MINUS {?occupation wdt:P279+ wd:Q115460164} # internet user
  MINUS {?occupation wdt:P279+ wd:Q1907198} # taxonomist
  MINUS {?occupation wdt:P279+ wd:Q1734662} # cartographer
  MINUS {?occupation wdt:P279+ wd:Q183888} # software developer
  MINUS {?occupation wdt:P279+ wd:Q111263847} # digital creator
  MINUS {?occupation wdt:P279+ wd:Q1790480} # translator-interpreter
  MINUS {?occupation wdt:P279+ wd:Q47541952} # producer
  ?creativePerson wdt:P1050 ?disorder.
   ?disorder wdt:P279+ wd:Q12135.
   ?disorder rdfs:label ?disorderLabel.
   FILTER (lang(?disorderLabel) = "en")
   FILTER (LANG(?occupationLabel) = "en") .
}

Results in 'creativiID.tsv'

In [3]:
from collections import Counter
import csv

def analyze_creative_persons(tsv_file_path):
    # Initialize Counter
    creative_person_counts = Counter()
    total_rows = 0
    
    # Read TSV file
    with open(tsv_file_path, 'r', encoding='utf-8') as file:
        # Create TSV reader
        tsv_reader = csv.DictReader(file, delimiter='\t')
        
        # Count each creative person occurrence and total rows
        for row in tsv_reader:
            creative_person = row['?creativePerson']
            creative_person_counts[creative_person] += 1
            total_rows += 1
    
    # Get total unique IDs
    total_unique = len(creative_person_counts)
    
    # Count how many IDs appear multiple times
    repeated_ids = sum(1 for count in creative_person_counts.values() if count > 1)
    
    return total_rows, total_unique, repeated_ids

# Example usage
if __name__ == "__main__":
    file_path = "creativiID.tsv"  
    total_rows, total_unique, repeated = analyze_creative_persons(file_path)
    
    print(f"Total number of rows: {total_rows}")
    print(f"Total number of unique creative persons: {total_unique}")
    print(f"Number of creative persons that appear multiple times: {repeated}")

Total number of rows: 4873
Total number of unique creative persons: 1443
Number of creative persons that appear multiple times: 1003


# Non creativi

Prima query: esclude le classi e sottoclassi di creativi

In [None]:
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT distinct ?nonCreativePerson ?occupationLabel ?disorderLabel ?provenance ?gender ?yearOfBirth
WHERE {
  ?nonCreativePerson wdt:P31 wd:Q5.
  ?nonCreativePerson wdt:P106 ?occupation.
  ?occupation rdfs:label ?occupationLabel.
  # Exclude creative professions and their subclasses
  MINUS {
    VALUES ?occupation { 
      wd:Q205375  # inventor
      wd:Q3492227 # innovator
      wd:Q482980  # author
      wd:Q483501  # artist
    }
  }
  
  MINUS {
    ?occupation wdt:P279+ ?excludedOccupation .
    VALUES ?excludedOccupation { 
      wd:Q205375  # inventor
      wd:Q3492227 # innovator
      wd:Q482980  # author
      wd:Q483501  # artist
    }
  }
   ?nonCreativePerson wdt:P1050 ?disorder.
   ?disorder wdt:P279+ wd:Q12135.
   ?disorder rdfs:label ?disorderLabel.
   FILTER (lang(?disorderLabel) = "en")
   FILTER (LANG(?occupationLabel) = "en") .
}

Risultato: nonCreativiID.tsv

## Seconda query: include le sottoclassi di mestieri creativi escluse nelle ricerche sui creativi

In [None]:
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT distinct ?nonCreativePerson ?occupationLabel ?disorderLabel ?provenance ?gender ?yearOfBirth
WHERE {
  ?nonCreativePerson wdt:P31 wd:Q5.
  ?nonCreativePerson wdt:P106 ?occupation.
  ?occupation rdfs:label ?occupationLabel.

  {
    VALUES ?occupation {
      wd:Q15980158 wd:Q108289407 wd:Q1607826 wd:Q2675537 
      wd:Q4610556 wd:Q109459317 wd:Q138858 wd:Q13474373
      wd:Q2066131 wd:Q947873 wd:Q2405480 wd:Q852857
      wd:Q55187 wd:Q1294787 wd:Q13235160 wd:Q755070
      wd:Q115460164 wd:Q1907198 wd:Q1734662 wd:Q183888
      wd:Q111263847 wd:Q1790480 wd:Q47541952
    }
  } UNION {
    ?occupation wdt:P279+ ?includedOccupation .
    VALUES ?includedOccupation {
      wd:Q15980158 wd:Q108289407 wd:Q1607826 wd:Q2675537 
      wd:Q4610556 wd:Q109459317 wd:Q138858 wd:Q13474373
      wd:Q2066131 wd:Q947873 wd:Q2405480 wd:Q852857
      wd:Q55187 wd:Q1294787 wd:Q13235160 wd:Q755070
      wd:Q115460164 wd:Q1907198 wd:Q1734662 wd:Q183888
      wd:Q111263847 wd:Q1790480 wd:Q47541952
    }
  }
  ?nonCreativePerson wdt:P1050 ?disorder.
   ?disorder wdt:P279+ wd:Q12135.
   ?disorder rdfs:label ?disorderLabel.
   FILTER (lang(?disorderLabel) = "en")
   FILTER (LANG(?occupationLabel) = "en") .
}

In [None]:
Risultato: nonCreativiID2.tsv

## A code to merge the two non-creatives files and delete people identified as creative

The code: 
- First merges both non-creative files into one
- Then checks for any IDs that appear in both:
        - the merged non-creative file
        - the creative file
- Removes any rows from the merged non-creative file where the ID appears in the creative file
- Save the result to a new file

In [5]:
import pandas as pd
import logging

def merge_and_clean_files(creative_file: str, noncreative_file1: str, noncreative_file2: str, 
                         output_file: str = 'merged_cleaned_noncreative.tsv') -> None:
    """
    Merge non-creative files and remove duplicates that appear in creative file.
    
    Args:
        creative_file: Path to creativiID.tsv
        noncreative_file1: Path to first nonCreativiID.tsv
        noncreative_file2: Path to second nonCreativiID2.tsv
        output_file: Path for the merged and cleaned output file
    """
    try:
        # Read all TSV files
        creative_df = pd.read_csv(creative_file, sep='\t')
        noncreative_df1 = pd.read_csv(noncreative_file1, sep='\t')
        noncreative_df2 = pd.read_csv(noncreative_file2, sep='\t')
        
        # Step 1: Merge non-creative files
        merged_df = pd.concat([noncreative_df1, noncreative_df2], axis=0)
        print(f"\nInitial merge:")
        print(f"- Rows from first non-creative file: {len(noncreative_df1)}")
        print(f"- Rows from second non-creative file: {len(noncreative_df2)}")
        print(f"- Total rows after merge: {len(merged_df)}")
        
        # Get unique IDs from creative file
        creative_ids = set(creative_df['?creativePerson'])
        
        # Find duplicate IDs
        duplicate_ids = set(merged_df['?nonCreativePerson']).intersection(creative_ids)
        
        # Remove rows with duplicate IDs from merged file
        cleaned_df = merged_df[~merged_df['?nonCreativePerson'].isin(duplicate_ids)]
        
        # Save the cleaned merged file
        cleaned_df.to_csv(output_file, sep='\t', index=False)
        
        # Print summary
        print(f"\nDuplicate removal:")
        print(f"- Found {len(duplicate_ids)} IDs that appear in both creative and non-creative files")
        if duplicate_ids:
            print("\nDuplicate IDs removed:")
            for id in sorted(duplicate_ids):
                print(f"- {id}")
        
        print(f"\nFinal results:")
        print(f"- Original total rows: {len(merged_df)}")
        print(f"- Rows after removing duplicates: {len(cleaned_df)}")
        print(f"- Rows removed: {len(merged_df) - len(cleaned_df)}")
        print(f"- Output saved to: {output_file}")
        
    except Exception as e:
        logging.error(f"Error processing files: {str(e)}")
        raise

if __name__ == "__main__":
    try:
        # File paths
        creative_file = 'creativiID.tsv'
        noncreative_file1 = 'nonCreativiID.tsv'
        noncreative_file2 = 'nonCreativiID2.tsv'
        output_file = 'merged_cleaned_noncreative.tsv'
        
        merge_and_clean_files(creative_file, noncreative_file1, noncreative_file2, output_file)
        
    except Exception as e:
        print(f"Error: {str(e)}")


Initial merge:
- Rows from first non-creative file: 2169
- Rows from second non-creative file: 2388
- Total rows after merge: 4557

Duplicate removal:
- Found 758 IDs that appear in both creative and non-creative files

Duplicate IDs removed:
- <http://www.wikidata.org/entity/Q100292045>
- <http://www.wikidata.org/entity/Q100583464>
- <http://www.wikidata.org/entity/Q100595699>
- <http://www.wikidata.org/entity/Q1006152>
- <http://www.wikidata.org/entity/Q101424804>
- <http://www.wikidata.org/entity/Q102046737>
- <http://www.wikidata.org/entity/Q102374>
- <http://www.wikidata.org/entity/Q102423299>
- <http://www.wikidata.org/entity/Q102870>
- <http://www.wikidata.org/entity/Q10314466>
- <http://www.wikidata.org/entity/Q10321758>
- <http://www.wikidata.org/entity/Q1035>
- <http://www.wikidata.org/entity/Q104358>
- <http://www.wikidata.org/entity/Q104524450>
- <http://www.wikidata.org/entity/Q104604510>
- <http://www.wikidata.org/entity/Q104678350>
- <http://www.wikidata.org/entity/Q104

In [9]:
from collections import Counter
import csv

def analyze_creative_persons(tsv_file_path):
    # Initialize Counter
    creative_person_counts = Counter()
    total_rows = 0
    
    # Read TSV file
    with open(tsv_file_path, 'r', encoding='utf-8') as file:
        # Create TSV reader
        tsv_reader = csv.DictReader(file, delimiter='\t')
        
        # Count each creative person occurrence and total rows
        for row in tsv_reader:
            creative_person = row['?nonCreativePerson']
            creative_person_counts[creative_person] += 1
            total_rows += 1
    
    # Get total unique IDs
    total_unique = len(creative_person_counts)
    
    # Count how many IDs appear multiple times
    repeated_ids = sum(1 for count in creative_person_counts.values() if count > 1)
    
    return total_rows, total_unique, repeated_ids

# Example usage
if __name__ == "__main__":
    file_path = "merged_cleaned_noncreative.tsv" 
    total_rows, total_unique, repeated = analyze_creative_persons(file_path)
    
    print(f"Total number of rows: {total_rows}")
    print(f"Total number of unique non-creative persons: {total_unique}")
    print(f"Number of non-creative persons that appear multiple times: {repeated}")

Total number of rows: 2088
Total number of unique non-creative persons: 823
Number of non-creative persons that appear multiple times: 566


Total of creatives with disturbs (1443) and non-creatives with disturbs (823) sums up to 2266.The missing people do not have an occupation specified. 

# Add gender, provenance, year of birth via a remote query to creatives

In [14]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
from tqdm import tqdm
import time
import re

def extract_wikidata_id(uri):
    """Extract Q-identifier from full Wikidata URI"""
    match = re.search(r'Q\d+', uri)
    return match.group(0) if match else None

def chunk_list(lst, chunk_size):
    """Split list into chunks of specified size"""
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]

def create_query(wikidata_ids):
    """Create SPARQL query for multiple entities with optional properties"""
    # Create VALUES clause with wd: prefix for each Q-ID
    entities = ' '.join(f'(wd:{id_})' for id_ in wikidata_ids)
    
    query = f"""
    SELECT DISTINCT ?entity ?genderLabel ?provenanceLabel ?yearOfBirth
    WHERE {{
        VALUES (?entity) {{ {entities} }}
        
        OPTIONAL {{
            ?entity wdt:P21 ?gender.
            ?gender rdfs:label ?genderLabel.
            FILTER(LANG(?genderLabel) = "en")
        }}
        
        OPTIONAL {{
            ?entity wdt:P27 ?provenance.
            ?provenance rdfs:label ?provenanceLabel.
            FILTER(LANG(?provenanceLabel) = "en")
        }}
        
        OPTIONAL {{
            ?entity wdt:P569 ?dateOfBirth.
            BIND(YEAR(?dateOfBirth) AS ?yearOfBirth)
        }}
    }}
    """
    return query

def query_wikidata(sparql, ids):
    """Query Wikidata and return results"""
    query = create_query(ids)
    sparql.setQuery(query)
    
    try:
        results = sparql.query().convert()
        return results['results']['bindings']
    except Exception as e:
        print(f"Error querying Wikidata: {e}")
        return []

def process_results(results):
    """Process SPARQL results into a list of dictionaries"""
    processed = []
    for result in results:
        processed.append({
            'entity': result['entity']['value'],
            'gender': result.get('genderLabel', {}).get('value', None),
            'provenance': result.get('provenanceLabel', {}).get('value', None),
            'year_of_birth': result.get('yearOfBirth', {}).get('value', None)
        })
    return processed

def main():
    # Initialize SPARQL endpoint
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(JSON)
    sparql.addCustomHttpHeader('User-Agent', 'MyWikidataBot/1.0')
    
    # Read TSV file - replace 'your_file.tsv' with actual filename
    # and 'wikidata_column' with the actual column name containing Wikidata IDs
    df = pd.read_csv('creativiID.tsv', sep='\t')
    
    # Extract unique Wikidata IDs and clean them
    wikidata_uris = df['?creativePerson'].unique()
    wikidata_ids = [extract_wikidata_id(uri) for uri in wikidata_uris if pd.notna(uri)]
    wikidata_ids = [id_ for id_ in wikidata_ids if id_ is not None]
    
    print(f"Found {len(wikidata_ids)} unique Wikidata IDs")
    
    # Process in chunks of 50 entities
    CHUNK_SIZE = 50
    chunks = chunk_list(wikidata_ids, CHUNK_SIZE)
    
    all_results = []
    
    # Process each chunk with progress bar
    for chunk in tqdm(chunks, desc="Processing Wikidata entities"):
        results = query_wikidata(sparql, chunk)
        processed_results = process_results(results)
        all_results.extend(processed_results)
        
        # Add delay to avoid hitting rate limits
        time.sleep(1)
    
    # Convert results to DataFrame and save
    results_df = pd.DataFrame(all_results)
    results_df.to_csv('wikidata_results.csv', index=False)
    print(f"Processed {len(all_results)} entities. Results saved to wikidata_results.csv")

if __name__ == "__main__":
    main()

Found 1443 unique Wikidata IDs


Processing Wikidata entities: 100%|████████████████████████████████████████████████████| 29/29 [01:02<00:00,  2.16s/it]

Processed 1634 entities. Results saved to wikidata_results.csv





# Add gender, provenance, year of birth via a remote query to non-creatives

In [15]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
from tqdm import tqdm
import time
import re

def extract_wikidata_id(uri):
    """Extract Q-identifier from full Wikidata URI"""
    match = re.search(r'Q\d+', uri)
    return match.group(0) if match else None

def chunk_list(lst, chunk_size):
    """Split list into chunks of specified size"""
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]

def create_query(wikidata_ids):
    """Create SPARQL query for multiple entities with optional properties"""
    # Create VALUES clause with wd: prefix for each Q-ID
    entities = ' '.join(f'(wd:{id_})' for id_ in wikidata_ids)
    
    query = f"""
    SELECT DISTINCT ?entity ?genderLabel ?provenanceLabel ?yearOfBirth
    WHERE {{
        VALUES (?entity) {{ {entities} }}
        
        OPTIONAL {{
            ?entity wdt:P21 ?gender.
            ?gender rdfs:label ?genderLabel.
            FILTER(LANG(?genderLabel) = "en")
        }}
        
        OPTIONAL {{
            ?entity wdt:P27 ?provenance.
            ?provenance rdfs:label ?provenanceLabel.
            FILTER(LANG(?provenanceLabel) = "en")
        }}
        
        OPTIONAL {{
            ?entity wdt:P569 ?dateOfBirth.
            BIND(YEAR(?dateOfBirth) AS ?yearOfBirth)
        }}
    }}
    """
    return query

def query_wikidata(sparql, ids):
    """Query Wikidata and return results"""
    query = create_query(ids)
    sparql.setQuery(query)
    
    try:
        results = sparql.query().convert()
        return results['results']['bindings']
    except Exception as e:
        print(f"Error querying Wikidata: {e}")
        return []

def process_results(results):
    """Process SPARQL results into a list of dictionaries"""
    processed = []
    for result in results:
        processed.append({
            'entity': result['entity']['value'],
            'gender': result.get('genderLabel', {}).get('value', None),
            'provenance': result.get('provenanceLabel', {}).get('value', None),
            'year_of_birth': result.get('yearOfBirth', {}).get('value', None)
        })
    return processed

def main():
    # Initialize SPARQL endpoint
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(JSON)
    sparql.addCustomHttpHeader('User-Agent', 'MyWikidataBot/1.0')
    
    # Read TSV file - replace 'your_file.tsv' with actual filename
    # and 'wikidata_column' with the actual column name containing Wikidata IDs
    df = pd.read_csv('merged_cleaned_noncreative.tsv', sep='\t')
    
    # Extract unique Wikidata IDs and clean them
    wikidata_uris = df['?nonCreativePerson'].unique()
    wikidata_ids = [extract_wikidata_id(uri) for uri in wikidata_uris if pd.notna(uri)]
    wikidata_ids = [id_ for id_ in wikidata_ids if id_ is not None]
    
    print(f"Found {len(wikidata_ids)} unique Wikidata IDs")
    
    # Process in chunks of 50 entities
    CHUNK_SIZE = 50
    chunks = chunk_list(wikidata_ids, CHUNK_SIZE)
    
    all_results = []
    
    # Process each chunk with progress bar
    for chunk in tqdm(chunks, desc="Processing Wikidata entities"):
        results = query_wikidata(sparql, chunk)
        processed_results = process_results(results)
        all_results.extend(processed_results)
        
        # Add delay to avoid hitting rate limits
        time.sleep(1)
    
    # Convert results to DataFrame and save
    results_df = pd.DataFrame(all_results)
    results_df.to_csv('wikidata_results_non_creatives.csv', index=False)
    print(f"Processed {len(all_results)} entities. Results saved to wikidata_results.csv")

if __name__ == "__main__":
    main()

Found 823 unique Wikidata IDs


Processing Wikidata entities: 100%|████████████████████████████████████████████████████| 17/17 [00:33<00:00,  1.99s/it]

Processed 917 entities. Results saved to wikidata_results.csv





# Merge new info in creatives

In [17]:
import pandas as pd

# Read the files
creative_df = pd.read_csv('creativiID.tsv', sep='\t')
wikidata_df = pd.read_csv('wikidata_results.csv')

# Remove angle brackets from the creativePerson column
creative_df['?creativePerson'] = creative_df['?creativePerson'].str.strip('<>')

# Merge the dataframes
merged_df = creative_df.merge(
    wikidata_df,
    left_on='?creativePerson',
    right_on='entity',
    how='left'
)

# Drop the 'entity' column as it's duplicate of '?creativePerson'
merged_df = merged_df.drop('entity', axis=1)

# Save to new TSV file
merged_df.to_csv('merged_results.tsv', sep='\t', index=False)

# Print some stats
total_rows = len(creative_df)
matched_rows = merged_df[merged_df['gender'].notna()].shape[0]
print(f"Total rows in original file: {total_rows}")
print(f"Rows with matched Wikidata information: {matched_rows}")
print(f"Percentage matched: {(matched_rows/total_rows)*100:.2f}%")

Total rows in original file: 4873
Rows with matched Wikidata information: 5567
Percentage matched: 114.24%


In [23]:
import pandas as pd

# Read the files
tsv_data = pd.read_csv('creativiID.tsv', sep='\t')
csv_data = pd.read_csv('wikidata_results.csv')

# Strip <> from creativePerson in TSV and entity in CSV
tsv_data['?creativePerson'] = tsv_data['?creativePerson'].str.replace('<', '').str.replace('>', '')
csv_data['entity'] = csv_data['entity'].str.replace('<', '').str.replace('>', '')

# Merge files based on creativePerson and entity
merged_data = pd.merge(
    tsv_data, 
    csv_data,
    left_on='?creativePerson',
    right_on='entity',
    how='left'
)

# Save as TSV
merged_data.to_csv('merged_results.tsv', sep='\t', index=False)

print("Files merged successfully!")
print(f"Found {len(merged_data)} rows in merged file")

Files merged successfully!
Found 5579 rows in merged file


# Merge new info in non-creatives

In [24]:
import pandas as pd

# Read the files
tsv_data = pd.read_csv('merged_cleaned_noncreative.tsv', sep='\t')
csv_data = pd.read_csv('wikidata_results_non_creatives.csv')

# Strip <> from creativePerson in TSV and entity in CSV
tsv_data['?nonCreativePerson'] = tsv_data['?nonCreativePerson'].str.replace('<', '').str.replace('>', '')
csv_data['entity'] = csv_data['entity'].str.replace('<', '').str.replace('>', '')

# Merge files based on creativePerson and entity
merged_data = pd.merge(
    tsv_data, 
    csv_data,
    left_on='?nonCreativePerson',
    right_on='entity',
    how='left'
)

# Save as TSV
merged_data.to_csv('merged_results_non_creatives.tsv', sep='\t', index=False)

print("Files merged successfully!")
print(f"Found {len(merged_data)} rows in merged file")

Files merged successfully!
Found 2316 rows in merged file
