In [2]:
import pandas as pd

# Merging Process
In order to merge 'v2_with_geo_count.tsv' with 'spacy.tsv' we needed to take the unordered index results from 'v2_with_geo_count.tsv' and set it as the intended index. Then 'v2_with_geo_count.tsv' had it's index sorted and was then merged into 'spacy.tsv'

---
Prior attempts to merge the dataframe were unsuccessful: 
We tried inner and left merge on the key "description" but this yielded surplus and unexpected merge results. Some probably reasons why were that descriptions may have been duplicated and thus the merging key was not fully unique and reliable as a key.
- Those unsuccessful codes have been removed to declutter the ipynb

In [3]:
df_spacy = pd.read_csv("../Data/spacy.csv")
print(df_spacy.columns)
print(df_spacy.head())

Index(['spacy_entities', 'description'], dtype='object')
                                      spacy_entities  \
0  Ada witch -, PERSON\n3-mile, QUANTITY\nthe Ada...   
1              month later, DATE\nthis day, DATE\n\n   
2  Gorman Rd, PERSON\nSand Creek, FAC\nA mile, QU...   
3  1970, DATE\none, CARDINAL\n211, CARDINAL\ntoda...   
4  Kappa Delta Sorority - The Kappa Delta Sororit...   

                                         description  
0  Ada witch - Sometimes you can see a misty blue...  
1  A little girl was killed suddenly while waitin...  
2  If you take Gorman Rd. west towards Sand Creek...  
3  In the 1970's, one room, room 211, in the old ...  
4  Kappa Delta Sorority - The Kappa Delta Sororit...  


In [4]:
df_geo_orig = pd.read_csv("../Data/v2_with_geo_count.tsv", delimiter="\t")
print(df_geo_orig.head())

   Unnamed: 0  Alcohol Deaths Alcohol Deaths Under 21 Audio Evidence  \
0           3          2208.0                    3.9%             No   
1           1          2208.0                    3.9%            Yes   
2           0          2208.0                    3.9%            Yes   
3           2          2208.0                    3.9%             No   
4           4          2208.0                    3.9%            Yes   

                                     Audio Reasoning  \
0  The description details various phenomena such...   
1  The description includes instances of knocking...   
2  The description includes reports of hearing th...   
3  The description does not mention any specific ...   
4  The description includes reports of giggling a...   

  Daylight Data TimeandDate Daylight Data USNO Navy         Event  \
0             No data found                   11:45  Supernatural   
1             No data found                   11:22        Murder   
2             No data f

  df_geo_orig = pd.read_csv("../Data/v2_with_geo_count.tsv", delimiter="\t")


In [5]:
df_geo_sorted = df_geo_orig.set_index("Unnamed: 0")
#df_geo_sorted = df_geo_sorted.reset_index()

print(df_geo_sorted.head())

            Alcohol Deaths Alcohol Deaths Under 21 Audio Evidence  \
Unnamed: 0                                                          
3                   2208.0                    3.9%             No   
1                   2208.0                    3.9%            Yes   
0                   2208.0                    3.9%            Yes   
2                   2208.0                    3.9%             No   
4                   2208.0                    3.9%            Yes   

                                              Audio Reasoning  \
Unnamed: 0                                                      
3           The description details various phenomena such...   
1           The description includes instances of knocking...   
0           The description includes reports of hearing th...   
2           The description does not mention any specific ...   
4           The description includes reports of giggling a...   

           Daylight Data TimeandDate Daylight Data USNO Navy

In [6]:
df_geo_sorted = df_geo_sorted.sort_index(ascending=True)
print(df_geo_sorted.head())

            Alcohol Deaths Alcohol Deaths Under 21 Audio Evidence  \
Unnamed: 0                                                          
0                   2208.0                    3.9%            Yes   
1                   2208.0                    3.9%            Yes   
2                   2208.0                    3.9%             No   
3                   2208.0                    3.9%             No   
4                   2208.0                    3.9%            Yes   

                                              Audio Reasoning  \
Unnamed: 0                                                      
0           The description includes reports of hearing th...   
1           The description includes instances of knocking...   
2           The description does not mention any specific ...   
3           The description details various phenomena such...   
4           The description includes reports of giggling a...   

           Daylight Data TimeandDate Daylight Data USNO Navy

In [7]:
print(df_geo_sorted['description'])

Unnamed: 0
0        Ada witch - Sometimes you can see a misty blue...
1        A little girl was killed suddenly while waitin...
2        If you take Gorman Rd. west towards Sand Creek...
3        In the 1970's, one room, room 211, in the old ...
4        Kappa Delta Sorority - The Kappa Delta Sororit...
                               ...                        
10975    at 12 midnight you can see a lady with two lit...
10976    Is haunted by the victims of a murder that hap...
10977    The institution was for kids 18 years old and ...
10978    Gymnasium -  their have been reports of a litt...
10979    Cadets from the Air Force Academy participatin...
Name: description, Length: 10980, dtype: object


In [8]:
print(df_geo_sorted.columns)

Index(['Alcohol Deaths', 'Alcohol Deaths Under 21', 'Audio Evidence',
       'Audio Reasoning', 'Daylight Data TimeandDate',
       'Daylight Data USNO Navy', 'Event', 'FBI.Population.Covered',
       'GeoName_Count', 'Geographic_LATITUDE', 'Geographic_LONGITUDE',
       'Geographic_NAME', 'HS_Grad_Rate', 'Haunted Places Date',
       'Murder per capita', 'Optional_LATITUDE1', 'Optional_LATITUDE2',
       'Optional_LATITUDE3', 'Optional_LATITUDE4', 'Optional_LONGITUDE1',
       'Optional_LONGITUDE2', 'Optional_LONGITUDE3', 'Optional_LONGITUDE4',
       'Optional_NAME1', 'Optional_NAME2', 'Optional_NAME3', 'Optional_NAME4',
       'Property Crime per capita', 'STEM_Grad_Percentage', 'State',
       'Undergrad_Grad_Rate', 'Violent Crime per capita', 'Visual Evidence',
       'Visual Reasoning', 'Witness Count', 'Witness Reasoning', 'adjectives',
       'apparition_adj_str', 'apparition_age', 'apparition_age_str',
       'apparition_descriptors', 'apparition_descriptors_str',
       'appa

In [14]:
df_geo_sorted_filtered = df_geo_sorted[['description','GeoName_Count', 'Geographic_LATITUDE', 'Geographic_LONGITUDE',
       'Geographic_NAME','Optional_LATITUDE1', 'Optional_LATITUDE2',
       'Optional_LATITUDE3', 'Optional_LATITUDE4', 'Optional_LONGITUDE1',
       'Optional_LONGITUDE2', 'Optional_LONGITUDE3', 'Optional_LONGITUDE4',
       'Optional_NAME1', 'Optional_NAME2', 'Optional_NAME3', 'Optional_NAME4']]
df_geo_sorted_filtered.to_csv('../Data/v2_with_geo_count_sorted_filtered.csv')
print("'v2_with_geo_count_sorted_filtered.csv' saved")

'v2_with_geo_count_sorted_filtered.csv' saved


In [10]:
df_merge_index = df_spacy.merge(df_geo_sorted, left_index= True, right_index= True, how="left")
print(df_merge_index.head())

                                      spacy_entities  \
0  Ada witch -, PERSON\n3-mile, QUANTITY\nthe Ada...   
1              month later, DATE\nthis day, DATE\n\n   
2  Gorman Rd, PERSON\nSand Creek, FAC\nA mile, QU...   
3  1970, DATE\none, CARDINAL\n211, CARDINAL\ntoda...   
4  Kappa Delta Sorority - The Kappa Delta Sororit...   

                                       description_x  Alcohol Deaths  \
0  Ada witch - Sometimes you can see a misty blue...          2208.0   
1  A little girl was killed suddenly while waitin...          2208.0   
2  If you take Gorman Rd. west towards Sand Creek...          2208.0   
3  In the 1970's, one room, room 211, in the old ...          2208.0   
4  Kappa Delta Sorority - The Kappa Delta Sororit...          2208.0   

  Alcohol Deaths Under 21 Audio Evidence  \
0                    3.9%            Yes   
1                    3.9%            Yes   
2                    3.9%             No   
3                    3.9%             No   
4         

In [11]:
df_merge_index = df_merge_index.rename(columns = {'description_x': 'description'})
print(df_merge_index.columns)

Index(['spacy_entities', 'description', 'Alcohol Deaths',
       'Alcohol Deaths Under 21', 'Audio Evidence', 'Audio Reasoning',
       'Daylight Data TimeandDate', 'Daylight Data USNO Navy', 'Event',
       'FBI.Population.Covered', 'GeoName_Count', 'Geographic_LATITUDE',
       'Geographic_LONGITUDE', 'Geographic_NAME', 'HS_Grad_Rate',
       'Haunted Places Date', 'Murder per capita', 'Optional_LATITUDE1',
       'Optional_LATITUDE2', 'Optional_LATITUDE3', 'Optional_LATITUDE4',
       'Optional_LONGITUDE1', 'Optional_LONGITUDE2', 'Optional_LONGITUDE3',
       'Optional_LONGITUDE4', 'Optional_NAME1', 'Optional_NAME2',
       'Optional_NAME3', 'Optional_NAME4', 'Property Crime per capita',
       'STEM_Grad_Percentage', 'State', 'Undergrad_Grad_Rate',
       'Violent Crime per capita', 'Visual Evidence', 'Visual Reasoning',
       'Witness Count', 'Witness Reasoning', 'adjectives',
       'apparition_adj_str', 'apparition_age', 'apparition_age_str',
       'apparition_descriptors', 'a

In [12]:
df_sorted_filterd = df_merge_index[['description', 'spacy_entities', 'GeoName_Count','Geographic_LATITUDE',
       'Geographic_LONGITUDE', 'Geographic_NAME','Optional_LATITUDE1',
       'Optional_LATITUDE2', 'Optional_LATITUDE3', 'Optional_LATITUDE4',
       'Optional_LONGITUDE1', 'Optional_LONGITUDE2', 'Optional_LONGITUDE3',
       'Optional_LONGITUDE4', 'Optional_NAME1', 'Optional_NAME2',
       'Optional_NAME3', 'Optional_NAME4']]

print(df_sorted_filterd.head())

                                         description  \
0  Ada witch - Sometimes you can see a misty blue...   
1  A little girl was killed suddenly while waitin...   
2  If you take Gorman Rd. west towards Sand Creek...   
3  In the 1970's, one room, room 211, in the old ...   
4  Kappa Delta Sorority - The Kappa Delta Sororit...   

                                      spacy_entities  GeoName_Count  \
0  Ada witch -, PERSON\n3-mile, QUANTITY\nthe Ada...              2   
1              month later, DATE\nthis day, DATE\n\n              0   
2  Gorman Rd, PERSON\nSand Creek, FAC\nA mile, QU...              0   
3  1970, DATE\none, CARDINAL\n211, CARDINAL\ntoda...              0   
4  Kappa Delta Sorority - The Kappa Delta Sororit...              0   

   Geographic_LATITUDE  Geographic_LONGITUDE Geographic_NAME  \
0                  NaN                   NaN             NaN   
1                  NaN                   NaN             NaN   
2                  NaN                   NaN

Iterim merged dataset that has relevant columns for Spacy and GeoParser. Will later be mergeed with GenAI image columns


In [13]:
df_sorted_filterd.to_csv('../Data/merged_spacy_and_geo_v2.csv')
print("'merged_spacy_and_geo_v2.csv' saved")

'merged_spacy_and_geo_v2.csv' saved
