### Convert files to utf-8 to be used later for comparing with news websites

In [1]:
from encodings.aliases import aliases

alias_list=set([v for k, v in aliases.items()])
# alias_list = ['cp1252','utf-9']
alias_list = ['ascii',
              'base64_codec', 'cp037',
              'cp1026',
              'cp1125',
              'cp1140',
              'cp1250',
              'cp1251',
              'cp1252',
              'cp1253',
              'cp1254',
              'cp1255',
              'cp1256',
              'cp1257',
              'cp1258', 'hex_codec', 'iso8859_10',
              'iso8859_11',
              'iso8859_13',
              'iso8859_14',
              'iso8859_15',
              'iso8859_16',
              'iso8859_2',
              'iso8859_3',
              'iso8859_4',
              'iso8859_5',
              'iso8859_6',
              'iso8859_7',
              'iso8859_8',
              'iso8859_9', 'utf_16',
              'utf_16_be',
              'utf_16_le',
              'utf_32',
              'utf_32_be',
              'utf_32_le',
              'utf_7',
              'utf_8', 
              'unicode_escaped']


In [2]:
#NOTE: ONLY RUN THIS WHEN YOU ARE SATISFIED WITH RAW FILES' CONTENTS
import os
import time
import concurrent.futures

def encode_file_given_alias(alias,text):
   try:
      text = text.decode(alias).encode('utf-8')
   except Exception as e:
      pass
   return text

raw_file_names = os.listdir('../res/raw/')

start = time.time()

for file_name in raw_file_names:
   if file_name.endswith('.csv'):
      continue
   with open(f'../res/raw/{file_name}', 'rb') as f:
      text = f.read()
   with concurrent.futures.ThreadPoolExecutor() as executor:
      args = ((alias, text) for alias in alias_list)
      for result in executor.map(lambda p:  encode_file_given_alias(*p), args):
         text = result
   with open(f'../res/raw/{file_name}', 'wb') as f:
      f.write(text)
end = time.time()

print(end - start)

PermissionError: [Errno 13] Permission denied: '../res/raw/.ipynb_checkpoints'

In [None]:
# FILE WRITER
from pathlib import Path
# NOTE: To use the method, the argument must have the same name as the
# method that it is calling. Check all the methods to know what arguments to pass
# Replace will overwrite existing file with newly scraped data


def create_entity_file(output_file_name, replace=False, encoding='utf-8'):
   path = Path(f'../res/raw/{output_file_name}.txt')
   if not path.is_file() and not replace:
      entities_list = globals()[f'get_{output_file_name}_list']()
      file = open(f'../res/raw/{output_file_name}.txt', "x",encoding=encoding)
      for x in entities_list:
         file.write(f'{x}\n')

   elif path.is_file and replace:
      entities_list = globals()[f'get_{output_file_name}_list']()
      file = open(f'../res/raw/{output_file_name}.txt', "w", encoding=encoding)
      for x in entities_list:
         file.write(f'{x}\n')

   print(f'DONE CREATING {output_file_name} FILE')


### Location Preprocessor

In [None]:
def get_provinces_list():
   with open('../res/raw/tourist_dests.txt', encoding='utf-8') as province_file:
      return [province
            .lower()
            .replace('\n','') 
            .strip()
            for province in province_file.readlines()]

In [None]:

def get_tourist_dest_list():
   with open('../res/raw/tourist_dests.txt') as tourist_dest_file:
      return [tourist_dest
              .replace('\n', '')
              for tourist_dest in tourist_dest_file.readlines()]


In [None]:
import pandas as pd
def get_cities_list():
   raw_cities_df = pd.read_csv(
       '../res/raw/worldcities.csv', index_col=False, encoding='utf-8')
   ph_cities_df = raw_cities_df.loc[raw_cities_df['iso3'] == 'PHL']
   ph_cities_df = ph_cities_df[['city']]
   ph_cities_df = ph_cities_df['city']
   return ph_cities_df.to_list()

create_entity_file('cities',replace=True)


In [None]:
print('READING PROVINCE LIST')
provinces_list = get_provinces_list()
print('READING CITIES LIST')
cities_list = get_cities_list()
print('READING TOURIST DESTINATION LIST')
tourist_dest_list = get_tourist_dest_list()

print('COMPLETED READING FILES')

all_locations = provinces_list+cities_list+tourist_dest_list 
all_locations = [loc.lower() for loc in all_locations]


In [None]:
# Check if a location exists
'vigan' in ''.join(all_locations)


### Final Dataset

In [None]:
import pandas as pd
from IPython.display import display

final_dataset_df = pd.DataFrame(columns=['entity','tag'])

for file_name in os.listdir('../res/raw/'):
   entity_data_df = pd.DataFrame(columns=['entity', 'tag'])
   if file_name.endswith(".txt") and not file_name.startswith("final"):
      with open(f'../res/raw/{file_name}', errors='ignore') as file:
         file_name = file_name.split('.txt')[0]
         if file_name=='cities' or file_name=='provinces':
            tag = 'GPE'
         elif file_name=='events' or file_name =='holidays':
            tag = 'EVT'
         elif 'name' in file_name:
            tag = 'PER'
         elif file_name == 'tourist_dests':
            tag = 'LOC'
         elif file_name =='organizations' or \
              file_name =='local_companies' or \
              file_name =='gov_agencies' or \
              file_name =='gov_acronym':
               tag ='ORG'
         entities_list = []
         
         for line in file.readlines():
            line = ''.join(line.strip().lower().split('\n'))
            if ',' in line or '"' in line or len(line) < 3:
               print(line)
               continue
            else:
               entities_list.append(line)

         entity_data_df['entity'] = entities_list
         entity_data_df['tag'] = len(entity_data_df)*[tag]
         # display(entity_data_df)
         final_dataset_df =  final_dataset_df.append(entity_data_df, ignore_index=True)


final_dataset_df.to_csv('../res/preprocessed/final_dataset.csv', index=False)


In [None]:
matched_ents = ['Benjamin','Magalong','Sinulog Festival', 'Quiapo', 'Mani']
sample_text_orig = 'Mayor Benjamin Magalong spearheaded the celebration of Sinulog Festival in Quiapo, Manila'

sample_text = ' '.join(sample_text_orig.split(','))
for ent in matched_ents:
  sample_text = sample_text_orig
  sample_text =  sample_text.split(' ')
  ent = ent.split(' ')
  check = all(item in sample_text for item in ent)

  if check:
    if len(ent) ==1:
      ent =ent[0]
    else:
      ent = ' '.join(ent)
    print(ent)
    start_idx = ' '.join(sample_text).find(ent)
    if start_idx==-1:
      ent_words = ent.split(' ')
      # If more than two words yung entity
      if len(ent_words) > 1 and ent in sample_text:
        ent_first_word = ent_words[0]
        ent_last_word = ent_words[len(ent_words)-1]
        start_idx = sample_text.find(ent_first_word)
        end_idx = sample_text.find(ent_last_word)
        end_idx = end_idx + len(ent_last_word)-1
      else:
        continue
    else:
      end_idx = start_idx+len(ent)
    print(f'start: {start_idx} | end: {end_idx}')

In [None]:
sentt = 'dog'
len(sentt.split(' '))