In [29]:
import pandas as pd

df = pd.read_csv('../data/finalized_df.tsv', sep='\t')
df.columns

Index(['city', 'country', 'description', 'location', 'state', 'state_abbrev',
       'longitude', 'latitude', 'city_longitude', 'city_latitude',
       'clean_description', 'clean_spelling', 'Audio Evidence',
       'Audio Reasoning', 'time_of_day', 'Witness Count', 'Witness Reasoning',
       'tokenized', 'pos_tokenized', 'filtered_tokenized',
       'lemma_filtered_tokenized', 'lemma_pos_tokenized', 'Event',
       'apparition_types', 'apparition_descriptors', 'apparition_gender',
       'apparition_age', 'apparition_types_str', 'apparition_descriptors_str',
       'apparition_gender_str', 'apparition_age_str',
       'unique_apparition_mentions', 'adjectives',
       'unique_app_descriptor_mentions', 'FBI.Population.Covered',
       'Murder per capita', 'Violent Crime per capita',
       'Property Crime per capita', 'Undergrad_Grad_Rate', 'HS_Grad_Rate',
       'STEM_Grad_Percentage', 'Visual Evidence', 'Visual Reasoning',
       'death_rate_Alzheimer's disease', 'death_rate_Cancer'

## Match and Extract Keyword

In [30]:
def extract_keywords(text, keywords):
    """Extract keywords from the text."""
    text = text.lower()  # Convert to lowercase
    keywords = [str(word) for word in keywords if pd.notna(word)]
    matches = [word for word in keywords if re.search(r'\b' + word + r'(s)?\b', text)]
    return matches

## Keyword Bank File Download

In [27]:
#Reading the Keyword bank
keyword_bank_df = pd.read_csv('../data/keywords_dictionary_v2.csv', encoding = 'latin1')

### Apparition Descriptors

In [16]:
print(df['adjectives'].head())

0    [('misty', 'figure'), ('blue', 'figure')]
1                                           []
2                                           []
3                                           []
4                                           []
Name: adjectives, dtype: object


This was an attempt to  take the adjectives associated to apparitions and extract out the unique keywords that could be used in the keyword bank "Apparition_Adjectives". 

df['adjectives'] represents the adj-apparition pairs where the adj are +/- 1to2 words from a matched apparition. 

In [76]:
# Check the type of each value in the 'adjectives' column
# downloading the TSV file converted the data type to strings -> later on the column is converted back to tuples in lists
df['adjectives'].apply(type)


0        <class 'list'>
1        <class 'list'>
2        <class 'list'>
3        <class 'list'>
4        <class 'list'>
              ...      
10975    <class 'list'>
10976    <class 'list'>
10977    <class 'list'>
10978    <class 'list'>
10979    <class 'list'>
Name: adjectives, Length: 10980, dtype: object

In [66]:
import ast

# Assuming 'df['adjectives']' contains string representations of lists of tuples
df['adjectives'] = df['adjectives'].apply(lambda x: ast.literal_eval(x))

# Now you can extract the adjectives as you did before
adjectives = [pair[0] for sublist in df['adjectives'] for pair in sublist if pair]

# Get the unique set of adjectives
unique_adjectives = set(adjectives)

# Display the unique adjectives
print(unique_adjectives)


{'popular', 't', 'transparent', 'running', 'last', 'next', 'weird', 'abandoned', 'different', 'recent', 'manly', 'heavy', 'big', 'rumored', 'wonderful', 'black', 'supernatural', 'unknown', 'certain', 'nail', 'low', 'inexperienced', 'greenish', 'malignant', 'active', 'bride', 'bad', 'menacing', 'wild', 'distinct', 'shaowy', 'Third', 'negative', 'Wiccademous', 'dark', 'main', 'phantom', 'green', 'smoky', 'mischievous', 'bodily', 'enormous', 'attic', 'Male', 'good', 'nude', 'invisible', 'classic', 'same', 'boy', 'hostile', 'ill', 'dead', 'nameless', 'start', 'whit', 'cold', 'orbs', 'public', 'portal', 'whole', 'playful', 'metaphysical', 'benign', 'upper', 'hooded', 'early', 'several', 'unhappy', 'blue', 'careful', 'lonely', 'male', 'lingering', 'furry', 'rushes', 'freaky', 'dramatic', 'vengeful', 'full', 'responsible', 'dense', 'unexplained', 'unseen', 'random', 'charred', 'few', 'multiple', 'common', 'stronger', 'hazy', 'murdered', 'tan', 'demonic', 'rare', 'happy', 'horrible', 'foggy', 

In [71]:
#Search to see if certain ADJ made it in the set
if "physical" in unique_adjectives:
    print("True")
else:
    print("False")

True


In [67]:
print(adjectives)

['misty', 'blue', 'strange', 'ghostly', 'unpleasant', 'cold', 'large', 'greenish', 'extra', 'little', 'odd', 'real', 'boy', 'tall', 'other', 'nail', 'male', 'male', 'eerie', 'dormitory', 'strange', 'blue', 'evil', 'shadowy', 'additional', 'dark', 'shadowy', 'large', 'eerie', 'white', 'disembodied', 'strange', 'black', 'attic', 'heavy', 'manly', 'such', 'red', 'strong', 'active', 'active', 'ghostly', 'red', 'numerous', 'free', 'many', 'ghostly', 'black', 'black', 'white', 'large', 'black', 'strange', 'black', 'blue', 'tall', 'dark', 'strange', 'friendly', 'red', 'black', 'torso', 'female', 'strange', 'shadowy', 'famous', 'short', 'large', 'white', 'likable', 'young', 'personal', 'faint', 'white', 'vengeful', 'strong', 'shadowy', 'Indian', 'nun', 'unknown', 'tall', 'second', 'last', 'white', 'bad', 'black', 'ghostly', 'tall', 'white', 'ghostly', 'only', 'mysterious', 'several', 'covered', 'orbs', 'helpful', 'full', 'Indian', 'Other', 'ghostly', 'protective', 'many', 'dust', 'prominent', 

In [72]:
# Save the unique adjectives to a text file to be stored in keyword bank
with open('unique_adjectives.txt', 'w') as file:
    for adjective in unique_adjectives:
        file.write(f"{adjective}\n")  # Write each adjective on a new line


The below is an attempt to look at the unique_adjectives and filter for any words that are ADJ as an individual word using the small English spaCy model.

Since I will keyword match specific word, I only wanted word tokens that were ADJ by themselves. Some of the results weren't 100% accurate like false positive "Wiccademous" which is a proper noun and a false negative "misty" which is an adj which should be included

In [69]:
import spacy

# Load spaCy's small English model
nlp = spacy.load("en_core_web_sm")

# Process each word separately and filter adjectives
adjective_words = [word for word in unique_adjectives if nlp(word)[0].pos_ == 'ADJ']

print(adjective_words)  # Print the list of adjectives


['popular', 'transparent', 'last', 'next', 'weird', 'different', 'recent', 'heavy', 'big', 'wonderful', 'black', 'unknown', 'certain', 'low', 'inexperienced', 'malignant', 'active', 'bad', 'menacing', 'wild', 'Third', 'negative', 'Wiccademous', 'dark', 'main', 'smoky', 'mischievous', 'enormous', 'attic', 'good', 'nude', 'invisible', 'same', 'hostile', 'dead', 'cold', 'whole', 'playful', 'metaphysical', 'upper', 'several', 'unhappy', 'blue', 'careful', 'lonely', 'dramatic', 'vengeful', 'full', 'responsible', 'dense', 'unexplained', 'unseen', 'random', 'few', 'common', 'stronger', 'hazy', 'demonic', 'rare', 'happy', 'horrible', 'foggy', 'elemental', 'sincere', 'nervous', 'silent', 'Strong', 'Strange', 'local', 'many', 'aggressive', 'short', 'empty', 'bluish', 'red', 'large', 'restless', 'extra', 'infamous', 'Other', 'occassional', 'professional', 'bright', 'separate', 'mad', 'strange', 'thick', 'helpful', 'open', 'constant', 'visible', 'physical', 'strong', 'positive', 'general', 'final'

In [36]:
# Save the unique adjectives to a text file
with open('unique_adjectives_ADJ.txt', 'w') as file:
    for adjective in adjective_words:
        file.write(f"{adjective.lower()}\n")  # Write each adjective on a new line

In [87]:
kw_df = pd.read_csv('keywords_dictionary_v2.csv')

with open('unique_adjectives_ADJ.txt', "r", encoding="utf-8") as file:
    words = file.read().splitlines()

kw_df["Apparition_Adjectives"] = words + [""] * (len(kw_df) - len(words))  # Fill missing rows

kw_df.to_csv('../data/keywords_dictionary_v2.csv', index = False)
print(kw_df.head())

             Audio_Evidence Image/Video/Visual Evidence  Haunted Places Date  \
0           Audio Forensics                     video                    NaN   
1          Chain of Custody                  metadata                    NaN   
2  Expert Witness Testimony            authentication                    NaN   
3                  Metadata                  artifact                    NaN   
4           Noise Reduction                     frame                    NaN   

   Time of day Apparition_Types Apparition_Descriptors Apparition_Gender  \
0          NaN            ghost               faceless               man   
1          NaN           spirit         affectionately               men   
2          NaN       apparition               residing               guy   
3          NaN           shadow               heavyset               boy   
4          NaN          phantom               wrestler         gentlemen   

  Apparition_Age Apparition_Adjectives Event: Murder       Eve

In [88]:
kw_df.columns


Index(['Audio_Evidence', 'Image/Video/Visual Evidence', 'Haunted Places Date',
       'Time of day', 'Apparition_Types', 'Apparition_Descriptors',
       'Apparition_Gender', 'Apparition_Age', 'Apparition_Adjectives',
       'Event: Murder', 'Event: Death', 'Event: Supernatural',
       'Event: Natural Disaster', 'Event: Suicide'],
      dtype='object')

In [102]:
#Spot check if a keyword is in the Adj keyword bank
if "wiccademous" in kw_df["Apparition_Adjectives"].values:
    print("Value exists in the column.")
else:
    print("Value not found.")

print(kw_df['Apparition_Adjectives'])

Value not found.
0          popular
1      transparent
2             last
3             next
4            weird
          ...     
191               
192               
193               
194               
195               
Name: Apparition_Adjectives, Length: 196, dtype: object


In [94]:
#Clean up the keywords that are false positive or false negatives
delete_word = "wiccademous"
kw_df["Apparition_Adjectives"].replace(delete_word, "")

if delete_word in kw_df["Apparition_Adjectives"].values:
    print("Not complete")
else:
    print(f"deleted {delete_word}")

deleted wiccademous


In [104]:
empty_row_index = kw_df[kw_df["Apparition_Adjectives"] == ""].index.min()
print(empty_row_index)

184


In [103]:
#At the identified empty_row_index, insert a word that was a false negative and wasn't included on first go around.
kw_df.at[empty_row_index, 'Apparition_Adjectives'] = 'unidentified'

In [105]:
kw_df.to_csv('../data/keywords_dictionary_v2.csv', index=False)

This was to get return the adjective-apparition pair only if the adjective existed in the filtered down list of adjectives using the spaCy model.

This is to only select adj-apparition pairs if the adj is in the keyword bank. Might be too selective if you want edge case references for GenAI to go off of. GenAI might prefer higher recall.

Note that Apparition_Descriptors is different and is archived because the descriptors can from generating descriptors from scratch while Adjectives looks at adj nearby apparitions

In [106]:
adj_app_kw = [keyword for keyword in keyword_bank_df['Apparition_Adjectives'] if keyword != []]
print(adj_app_kw)

df['adj_apparition_kw'] = df['adjectives'].apply(
    lambda tuples_list: ", ".join([f"{adj} {apparition}" for adj, apparition in tuples_list if adj in adj_app_kw])
)
print(df['adj_apparition_kw'])



['physical', 'violent', 'nasty', 'past', 'single', 'strong', 'insane', 'malignant', 'popular', 'numerous', 'old', 'positive', 'likable', 'unpleasant', 'greatest', 'malicious', 'strange', 'inexperienced', 'occassional', 'notorious', 'high', 'nice', 'elemental', nan, 'ominous', 'interested', 'former', 'big', 'thick', 'unexplainable', 'unseen', nan, 'dark', 'spiritual', 'cold', 'recent', 'directional', 'mad', 'large', 'same', 'certain', 'foggy', 'neutral', nan, nan, 'dangerous', nan, 'low', 'third', 'local', 'protective', 'menacing', 'enormous', 'negative', 'personal', 'hawaiian', 'professional', 'vengeful', 'suspicious', 'lonely', 'short', nan, 'playful', 'real', 'faint', 'electrical', 'various', 'visible', 'black', 'last', 'unwanting', 'demonic', 'common', 'particular', 'aggressive', 'random', 'careful', 'open', 'actual', 'third', 'invisible', 'sincere', 'heavy', 'happy', 'few', 'full', 'horrible', 'thin', 'stronger', 'hateful', 'many', 'wild', 'good', 'restless', 'nervous', 'much', 'fi

Note for  'adj_apparition_kw': This feature wasn't added into the dataset since it wasn't deemed necessary to filter down. The recall of what the word embedding adj-apparition pair picked up was more important than accuracy

This just does a straight conversion of the adjective-apparition pair into a string.

This preserves all the keywords matched one for a more liberal input for GenAI Images

In [107]:
df['apparition_adj_str'] = df['adjectives'].apply(
    lambda tuples_list: ", ".join([f"{adj} {apparition}" for adj, apparition in tuples_list if tuples_list != []])
)
print(df['apparition_adj_str'])


0        misty figure, blue figure
1                                 
2                                 
3                                 
4                                 
                   ...            
10975                             
10976                             
10977              ear Poltergeist
10978                    few ghost
10979                             
Name: apparition_adj_str, Length: 10980, dtype: object


In [108]:
print(df.columns)

Index(['city', 'country', 'description', 'location', 'state', 'state_abbrev',
       'longitude', 'latitude', 'city_longitude', 'city_latitude',
       'clean_description', 'clean_spelling', 'Audio Evidence',
       'Audio Reasoning', 'Witness Count', 'Witness Reasoning', 'tokenized',
       'pos_tokenized', 'filtered_tokenized', 'lemma_filtered_tokenized',
       'lemma_pos_tokenized', 'Event', 'apparition_types',
       'apparition_descriptors', 'apparition_gender', 'apparition_age',
       'apparition_types_str', 'apparition_descriptors_str',
       'apparition_gender_str', 'apparition_age_str',
       'unique_apparition_mentions', 'adjectives', 'apparition_adj_str',
       'unique_app_descriptor_mentions', 'FBI.Population.Covered',
       'Murder per capita', 'Violent Crime per capita',
       'Property Crime per capita', 'Undergrad_Grad_Rate', 'HS_Grad_Rate',
       'STEM_Grad_Percentage', 'Visual Evidence', 'Visual Reasoning',
       'death_rate_Alzheimer's disease', 'death_rate_

In [110]:
#Update of the keyword match for apparition_descriptors - Removed the clearly wrong "presence" as a descriptor which 
#This keyword match is archived because the adjective-apparition descriptor pair is a better approach to finding apparition descriptions
import re

keyword_bank_df = pd.read_csv('../data/keywords_dictionary_v2.csv')
df['apparition_descriptors'] = df['lemma_pos_tokenized'].apply(lambda x: extract_keywords(str(x), keyword_bank_df['Apparition_Descriptors']))

print(df['apparition_descriptors'].head(10))


0    [ghostly, misty]
1          [restless]
2                  []
3                  []
4                  []
5                  []
6                  []
7                  []
8                  []
9           [ghostly]
Name: apparition_descriptors, dtype: object


In [59]:
df.columns

Index(['city', 'country', 'description', 'location', 'state', 'state_abbrev',
       'longitude', 'latitude', 'city_longitude', 'city_latitude',
       'clean_description', 'clean_spelling', 'Audio Evidence',
       'Audio Reasoning', 'time_of_day', 'Witness Count', 'Witness Reasoning',
       'tokenized', 'pos_tokenized', 'filtered_tokenized',
       'lemma_filtered_tokenized', 'lemma_pos_tokenized', 'Event',
       'apparition_types', 'apparition_descriptors', 'apparition_gender',
       'apparition_age', 'apparition_types_str', 'apparition_descriptors_str',
       'apparition_gender_str', 'apparition_age_str',
       'unique_apparition_mentions', 'adjectives',
       'unique_app_descriptor_mentions', 'FBI.Population.Covered',
       'Murder per capita', 'Violent Crime per capita',
       'Property Crime per capita', 'Undergrad_Grad_Rate', 'HS_Grad_Rate',
       'STEM_Grad_Percentage', 'Visual Evidence', 'Visual Reasoning',
       'death_rate_Alzheimer's disease', 'death_rate_Cancer'

In [111]:
desired_order = ['city', 'country', 'description', 'location', 'state', 'state_abbrev',
       'longitude', 'latitude', 'city_longitude', 'city_latitude',
       'clean_description', 'clean_spelling', 'Audio Evidence',
       'Audio Reasoning', 'Witness Count', 'Witness Reasoning',
       'tokenized', 'pos_tokenized', 'filtered_tokenized',
       'lemma_filtered_tokenized', 'lemma_pos_tokenized', 'Event',
       'apparition_types', 'apparition_descriptors', 'apparition_gender',
       'apparition_age', 'apparition_types_str', 'apparition_descriptors_str',
       'apparition_gender_str', 'apparition_age_str',
       'unique_apparition_mentions','unique_app_descriptor_mentions','adjectives', 'apparition_adj_str', 'FBI.Population.Covered',
       'Murder per capita', 'Violent Crime per capita',
       'Property Crime per capita', 'Undergrad_Grad_Rate', 'HS_Grad_Rate',
       'STEM_Grad_Percentage', 'Visual Evidence', 'Visual Reasoning',
       'death_rate_Alzheimer\'s disease', 'death_rate_Cancer',
       'death_rate_Heart disease', 'death_rate_Unintentional injuries',
       'death_rate_All causes', 'death_rate_Influenza and pneumonia',
       'death_rate_Suicide', 'death_rate_Kidney disease', 'death_rate_CLRD',
       'death_rate_Diabetes', 'Haunted Places Date', 'Alcohol Deaths',
       'Alcohol Deaths Under 21', 'State', 'time_of_day', 'Daylight Data USNO Navy',
       'Daylight Data TimeandDate']

df = df[desired_order]

In [112]:
df.columns

Index(['city', 'country', 'description', 'location', 'state', 'state_abbrev',
       'longitude', 'latitude', 'city_longitude', 'city_latitude',
       'clean_description', 'clean_spelling', 'Audio Evidence',
       'Audio Reasoning', 'Witness Count', 'Witness Reasoning', 'tokenized',
       'pos_tokenized', 'filtered_tokenized', 'lemma_filtered_tokenized',
       'lemma_pos_tokenized', 'Event', 'apparition_types',
       'apparition_descriptors', 'apparition_gender', 'apparition_age',
       'apparition_types_str', 'apparition_descriptors_str',
       'apparition_gender_str', 'apparition_age_str',
       'unique_apparition_mentions', 'unique_app_descriptor_mentions',
       'adjectives', 'apparition_adj_str', 'FBI.Population.Covered',
       'Murder per capita', 'Violent Crime per capita',
       'Property Crime per capita', 'Undergrad_Grad_Rate', 'HS_Grad_Rate',
       'STEM_Grad_Percentage', 'Visual Evidence', 'Visual Reasoning',
       'death_rate_Alzheimer's disease', 'death_rate_

In [113]:
df.to_csv('../data/v2.csv')

In [119]:
df.to_csv('../data/v2.tsv', sep="\t") 

In [114]:
df.head(20)

Unnamed: 0,city,country,description,location,state,state_abbrev,longitude,latitude,city_longitude,city_latitude,...,death_rate_Kidney disease,death_rate_CLRD,death_rate_Diabetes,Haunted Places Date,Alcohol Deaths,Alcohol Deaths Under 21,State,time_of_day,Daylight Data USNO Navy,Daylight Data TimeandDate
0,Ada,United States,Ada witch - Sometimes you can see a misty blue...,Ada Cemetery,Michigan,MI,-85.504893,42.962106,-85.49548,42.960727,...,15.136842,44.742105,25.236842,1/14/2013,2208.0,3.9%,Michigan,Dusk,09:05,No data found
1,Addison,United States,A little girl was killed suddenly while waitin...,North Adams Rd.,Michigan,MI,-84.381843,41.971425,-84.347168,41.986434,...,15.136842,44.742105,25.236842,1/1/2025,2208.0,3.9%,Michigan,Dusk,11:22,No data found
2,Adrian,United States,If you take Gorman Rd. west towards Sand Creek...,Ghost Trestle,Michigan,MI,-84.035656,41.904538,-84.037166,41.897547,...,15.136842,44.742105,25.236842,9/22/2006,2208.0,3.9%,Michigan,Evening,11:16,No data found
3,Adrian,United States,"In the 1970's, one room, room 211, in the old ...",Siena Heights University,Michigan,MI,-84.017565,41.905712,-84.037166,41.897547,...,15.136842,44.742105,25.236842,3/1/1900,2208.0,3.9%,Michigan,Unknown,11:45,No data found
4,Albion,United States,Kappa Delta Sorority - The Kappa Delta Sororit...,Albion College,Michigan,MI,-84.745177,42.244006,-84.75303,42.243097,...,15.136842,44.742105,25.236842,1/1/2025,2208.0,3.9%,Michigan,Evening,09:10,No data found
5,Albion,United States,A mysterious lady in white has been spotted in...,Riverside Cemetery,Michigan,MI,-84.753056,42.236814,-84.75303,42.243097,...,15.136842,44.742105,25.236842,1/1/2025,2208.0,3.9%,Michigan,Unknown,09:10,No data found
6,Algoma Township,United States,On a winding dirt road next to the Rogue River...,Hell's Bridge,Michigan,MI,,,-85.62293,43.149293,...,15.136842,44.742105,25.236842,1/1/2025,2208.0,3.9%,Michigan,Dusk,09:04,No data found
7,Algonac,United States,Morrow Road is a Haunted road in Algonac Michi...,Morrow Road,Michigan,MI,-82.57629,42.652997,-82.531018,42.618367,...,15.136842,44.742105,25.236842,3/1/2001,2208.0,3.9%,Michigan,Unknown,09:07,No data found
8,Allegan,United States,"People report hearing footsteps, and doors sla...",Elks Lodge,Michigan,MI,-85.841599,42.520552,-85.855303,42.529199,...,15.136842,44.742105,25.236842,6/1/2008,2208.0,3.9%,Michigan,Unknown,09:08,No data found
9,Allegan,United States,Various ghostly activities. News coverage abou...,The Grill House and the Rock Bottom Bar,Michigan,MI,-85.857564,42.497762,-85.855303,42.529199,...,15.136842,44.742105,25.236842,1/1/2025,2208.0,3.9%,Michigan,Dusk,09:08,No data found


In [120]:
df.shape

(10980, 60)

In [117]:
#Inspection of difference between entries between csv and tsv 
# --> this is resolved and part of an Assignment 1 "inner" merge
df_csv = pd.read_csv('../data/v1.tsv', sep='\t')
df_csv.shape

(10992, 43)

In [118]:
df_csv.columns

Index(['city', 'country', 'description', 'location', 'state', 'state_abbrev',
       'longitude', 'latitude', 'city_longitude', 'city_latitude',
       'clean_description', 'clean_spelling', 'Audio Evidence',
       'Audio Reasoning', 'time_of_day', 'Witness Count', 'Witness Reasoning',
       'tokenized', 'pos_tokenized', 'filtered_tokenized',
       'lemma_filtered_tokenized', 'lemma_pos_tokenized', 'Event',
       'apparition_types', 'apparition_descriptors', 'apparition_gender',
       'apparition_age', 'apparition_types_str', 'apparition_descriptors_str',
       'apparition_gender_str', 'apparition_age_str',
       'unique_apparition_mentions', 'adjectives',
       'unique_app_descriptor_mentions', 'FBI.Population.Covered',
       'Murder per capita', 'Violent Crime per capita',
       'Property Crime per capita', 'Undergrad_Grad_Rate', 'HS_Grad_Rate',
       'STEM_Grad_Percentage', 'Visual Evidence', 'Visual Reasoning'],
      dtype='object')