# Labeling For All Content Types

Use this notebook to load the final index file and label Alignment Sheets and Figures based on their titles. This is the scenario where the tables have already been labeled. This notebook will be removed during the code review and cleaning process.

# Imports

In [None]:
import pickle
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.util import ngrams
from pathlib import Path
# Widen the pandas display limits so wide frames and long titles are shown in cell outputs.
pd.set_option("display.max_columns", 100)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', None)  # None removes the row limit for displayed frames
import warnings
# NOTE(review): this silences every warning for the whole session, including pandas'
# chained-assignment warnings on the frames modified below — consider a narrower filter.
warnings.filterwarnings('ignore')

In [None]:
# Project root: two directory levels above the current working directory.
notebook_dir = Path('.').resolve()
ROOT_PATH = notebook_dir.parents[1]
print(ROOT_PATH)

# Setup

## Load our list of keywords created in keywords.py

In [None]:
# Load the keyword lists created in keywords.py.
# The path is assembled with pathlib so it resolves on every OS, not only on Windows.
# NOTE: pickle.load can execute arbitrary code — only load pickles produced by this project.
keywords_pkl_path = ROOT_PATH / "data" / "processed" / "keywords_pickle" / "vc_keywords.pkl"
with open(keywords_pkl_path, "rb") as f:
    keywords = pickle.load(f)

In [None]:
# Spot-check: print the entry at position 14 of the loaded keyword list.
print(keywords[14])

In [None]:
# List of VC label names. Order matters: the labeling loop pairs
# labels_list[k] with keywords[k] through zip().
labels_list = [
    'Landscape, terrain, and weather',
    'Soil',
    'Plants',
    'Water',
    'Fish',
    'Wetlands',
    'Wildlife',
    'Species at Risk',
    'Greenhouse gas emissions',
    'Air emissions',
    'Noise',
    'Electricity and electromagnetism',
    'Proximity to people',
    'Archaeological, paleontological, historical, and culturally significant sites and resources',
    'Human access to boats and waterways',
    'Indigenous land, water, and air use',
    'Impact to social and cultural well-being',
    'Impact to human health and viewscapes',
    'Social, cultural, economic infrastructure and services',
    'Economic Offsets and Impact',
    'Environmental Obligations',
    'Treaty and Indigenous Rights',
]

# Load Final Index File

This is where we load the 'final' index file so that we can label the figures and alignment sheets.

In [None]:
# Read the intermediate index file; the path is built with pathlib so it works on any OS.
index_csv_path = ROOT_PATH / "data" / "interim" / "Intermediate_Index_Files" / "esa_index_with_table_text_no_labels.csv"
df = pd.read_csv(index_csv_path, encoding='utf-8-sig')

# Keep only the Figure / Alignment Sheet rows. .copy() makes the subset an independent
# frame, so the column assignments in the following cells modify it reliably rather than
# writing to a slice of `df`.
df_fig_align = df[df['Content Type'].isin(['Figure', 'Alignment Sheet'])].copy()

In [None]:
# Row counts: full index vs. the Figure / Alignment Sheet subset.
print(len(df), len(df_fig_align))

In [None]:
# Preview the first rows of the Figure / Alignment Sheet subset.
df_fig_align.head()

## Labeling For All Content Types

Use this notebook to load the final index file and label Alignment Sheets and Figures based on their titles. This is the scenario where the tables have already been labeled. This notebook will be removed during the code review and cleaning process.

In [None]:
# Seed the working 'text' column from the title; the cleaning and tokenizing cells operate on 'text'.
df_fig_align['text'] = df_fig_align['Title']

In [None]:
# Check that the new 'text' column is present.
df_fig_align.head()

In [None]:
from bs4 import BeautifulSoup

# Regex clean-up steps, applied in order. Patterns are raw strings so `\w` and `\d`
# reach the regex engine instead of being parsed as (invalid) string escapes.
cleanup_patterns = [
    (r'[^a-zA-Z0-9 ]', ' '),  # remove all non-alpha-numeric characters
    (r'\w{25,}', ' '),        # remove runs of 25 or more word characters
    (r'cid\d+', ' '),         # remove "cid<digits>" tokens (presumably text-extraction artifacts)
    (r' s ', ' '),            # remove a stand-alone "s"
    (r' +', ' '),             # remove all extra spaces in text
]

# Strip HTML tags first; str(x) also turns missing titles into text so every row is a string.
cleaned_text = df_fig_align['text'].apply(
    lambda x: BeautifulSoup(str(x), "html.parser").get_text(separator=' ')
)
for pattern, replacement in cleanup_patterns:
    cleaned_text = cleaned_text.replace(pattern, replacement, regex=True)

df_fig_align['text'] = cleaned_text.str.lower()

In [None]:
# Cap each text at 30k characters so later processing never sees longer strings.
MAX_TEXT_CHARS = 30_000
df_fig_align['text'] = df_fig_align['text'].map(lambda text: text[:MAX_TEXT_CHARS])
df_fig_align.head()

In [None]:
table_texts = df_fig_align['text'].tolist()

tokenized_table_texts = []
stemmer = PorterStemmer()
# Build the stop-word set once: calling stopwords.words() for every token repeats the
# same lookup, and a set gives constant-time membership tests.
english_stopwords = set(stopwords.words("english"))

for table_text in table_texts:
    # Tokenize, drop stop words, then stem what is left.
    stemmed_tokens = [stemmer.stem(w) for w in word_tokenize(table_text) if w not in english_stopwords]
    # All 1-grams up to 6-grams of the stemmed tokens, each joined with single spaces.
    table_ngram_list = [
        " ".join(table_gram)
        for n in range(1, 7)
        for table_gram in ngrams(stemmed_tokens, n)
    ]
    tokenized_table_texts.append(table_ngram_list)

# Preview the first row's n-grams (skipped when there are no rows at all).
if tokenized_table_texts:
    print(tokenized_table_texts[0][0:200])

In [None]:
# Persist the tokenized n-gram lists next to the keyword pickle.
token_figTxt_pkl_path = str(
    ROOT_PATH.joinpath("data", "processed", "keywords_pickle", "tokenized_figure_alignment_sheets_texts.pkl")
)
with open(token_figTxt_pkl_path, mode='wb') as pkl_file:
    pickle.dump(tokenized_table_texts, pkl_file)

In [None]:
# Reload the tokenized n-gram lists from disk. The path is built with pathlib so it
# resolves on every OS, not only on Windows.
# NOTE: pickle.load can execute arbitrary code — only load pickles produced by this project.
tokenized_pkl_path = ROOT_PATH / "data" / "processed" / "keywords_pickle" / "tokenized_figure_alignment_sheets_texts.pkl"
with open(tokenized_pkl_path, 'rb') as f:
    tokenized_table_text = pickle.load(f)

In [None]:
# Inspect the first 10 n-grams of the first row and the first 50 n-grams of the last row.
print(tokenized_table_text[0][:10], '\n', tokenized_table_text[-1][:50])

In [None]:
# Create one match-count column per label, starting at zero.
for vc_label in labels_list:
    df_fig_align[vc_label] = 0

df_fig_align.head(2)

In [None]:
def label_with_keywords(i, table_text, keywords_for_label, label):
    """Count the keywords found in ``table_text`` and store the count in ``df_fig_align``.

    Args:
        i: Index label of the row of the module-level ``df_fig_align`` to update.
        table_text: Container that each keyword is tested against with ``in``.
        keywords_for_label: Iterable of keywords belonging to ``label``.
        label: Name of the ``df_fig_align`` column that receives the count.

    Each keyword contributes at most 1 to the count, however often it occurs in
    ``table_text``. Nothing is returned; the frame is updated in place.
    """
    number_of_matches = sum(1 for word in keywords_for_label if word in table_text)
    # One .loc write addresses the cell directly; chained indexing (frame[col][row] = v)
    # may write to a temporary object and leave the frame unchanged.
    df_fig_align.loc[i, label] = number_of_matches

In [None]:
def string_total_sum_match_2_lists(list_1, list_2):
    """
    Count how many items of ``list_1`` also occur in ``list_2``, including duplicates.

    Every element of ``list_1`` that is found in ``list_2`` adds 1 to the result, so an
    item repeated in ``list_1`` is counted once per repetition. Repetitions inside
    ``list_2`` do not change the result.

    Args:
        list_1: List of strings whose items are looked up.
        list_2: List of strings that is searched.

    Returns:
        The number of total matches as an int (0 when either list is empty).
    """
    if not isinstance(list_2, (str, bytes)):
        try:
            # A set gives constant-time membership tests, which avoids rescanning
            # list_2 for every item of list_1.
            lookup = set(list_2)
            return sum(1 for item in list_1 if item in lookup)
        except TypeError:
            # Unhashable items on either side: fall back to the plain scan below.
            pass
    return sum(1 for item in list_1 if item in list_2)

In [None]:
# Show the cleaned text of the last row.
df_fig_align.text.iloc[-1]

In [None]:
# First index label of the subset (the subset was filtered from `df`, so check which labels it carries).
df_fig_align.index[0]

In [None]:
# zip() stops at the shorter input without complaint, so make sure the tokenized texts
# loaded from the pickle line up one-to-one with the rows being labeled.
assert len(tokenized_table_text) == len(df_fig_align), (
    "tokenized_table_text and df_fig_align differ in length; regenerate the tokenized pickle"
)

# For every row and every label, count the row's n-grams that occur in the label's keyword list.
for i, table_text in zip(df_fig_align.index, tokenized_table_text):
    for keywords_for_label, label in zip(keywords, labels_list):
        number_of_matches = string_total_sum_match_2_lists(table_text, keywords_for_label)
        # .loc writes straight into the frame; chained indexing (frame[col][row] = v)
        # may write to a temporary object and leave the frame unchanged.
        df_fig_align.loc[i, label] = number_of_matches

# Save the labeled frame as a pickle.
esa_fig_alignment_labeled_path = str(ROOT_PATH / "data" / "processed" / "keywords_pickle" / "esa_index_ENG_fig_alignment_labeled.pkl")
with open(esa_fig_alignment_labeled_path, 'wb') as f:
    pickle.dump(df_fig_align, f)

df_fig_align.head()

# Drop text and remerge index

In [None]:
# Remove the helper columns before the frame is written out. errors='ignore' keeps the
# cell re-runnable and tolerates a frame in which one of the columns is already absent.
df_fig_align = df_fig_align.drop(columns=['text', 'label'], errors='ignore')

In [None]:
# Write the labeled Figure / Alignment Sheet rows. The path is built with pathlib so it
# resolves on every OS, not only on Windows.
fig_align_csv_path = ROOT_PATH / "data" / "interim" / "Intermediate_Index_Files" / "esa_figure_alignment_vec_labeled.csv"
df_fig_align.to_csv(fig_align_csv_path, index=False, encoding='utf-8-sig')

In [29]:
# Load the already-labeled table rows. The path is built with pathlib so it resolves on
# every OS, not only on Windows.
tables_csv_path = ROOT_PATH / "data" / "interim" / "Intermediate_Index_Files" / "esa_tables_vec_labeled.csv"
df_tables = pd.read_csv(tables_csv_path, encoding='utf-8-sig')

In [33]:
# Stack the Figure / Alignment Sheet rows on top of the table rows with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
df_esa_vecs = pd.concat([df_fig_align, df_tables], ignore_index=True)

In [34]:
# Shapes of the two inputs and of the combined frame (the row counts should add up).
print(df_fig_align.shape, df_tables.shape, df_esa_vecs.shape)

(29, 55) (473, 55) (502, 55)


In [38]:
# Replace the existing IDs with a consecutive run starting at 21425, stored as the last column.
df_esa_vecs.drop(columns=['ID'], inplace=True)
first_id = 21425
df_esa_vecs.insert(
    loc=len(df_esa_vecs.columns),
    column='ID',
    value=range(first_id, first_id + len(df_esa_vecs)),
)

In [35]:
# Write the combined labeled rows. The path is built with pathlib so it resolves on
# every OS, not only on Windows.
esa_vecs_csv_path = ROOT_PATH / "data" / "interim" / "Intermediate_Index_Files" / "esa_vecs_labeled.csv"
df_esa_vecs.to_csv(esa_vecs_csv_path, index=False, encoding='utf-8-sig')

# Normalize match values

In [None]:
import numpy as np

# Rescale the raw match counts to integer scores, separately for each content type and label:
#   score = ceil(log2(count + 1) / max(log2(count + 1)) * 100), with zero counts kept at 0.
# NOTE(review): the label columns are read from `df` — confirm that `df` holds the labeled
# rows at this point of the notebook.
max_value_list = []
df_norm = df.copy()
for content_type in ('Table', 'Figure', 'Alignment Sheet'):
    rows_of_type = df[df['Content Type'] == content_type]
    for label in labels_list:
        # Zeros become NaN first so they are excluded from the log and from the maximum.
        log_counts = np.log2(rows_of_type[label].replace(0, np.nan) + 1)
        max_vc = log_counts.max()
        scaled = (log_counts / max_vc * 100).replace(np.nan, 0)
        rows_of_type[label] = np.ceil(scaled).astype(int)
        max_value_list.append([content_type, label, max_vc])
    # Write the rescaled rows of this content type back into the full frame.
    df_norm.loc[rows_of_type.index] = rows_of_type

In [None]:
# Frequency table of the normalized scores: one row per distinct value with its count.
# Only the label columns are passed in — the full frame mixes text and numeric columns,
# and np.unique cannot sort strings against numbers.
(unique, counts) = np.unique(df_norm[labels_list].to_numpy(), return_counts=True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)

In [None]:
# One frame per content type, used by the histogram cell.
content_type_col = df_norm['Content Type']
df_norm_tables = df_norm[content_type_col.eq('Table')]
df_norm_alignment_sheets = df_norm[content_type_col.eq('Alignment Sheet')]
df_norm_figures = df_norm[content_type_col.eq('Figure')]

In [None]:
import matplotlib.pyplot as plt

# One panel per content type; the figure size is set once, when the figure is created.
fig, ax = plt.subplots(1, 3, sharex='col', sharey='row', figsize=(10, 10))

num = 1  # position in labels_list of the label whose score distribution is plotted

panels = [
    ('Table', df_norm_tables),
    ('Alignment Sheet', df_norm_alignment_sheets),
    ('Figure', df_norm_figures),
]
for axis, (panel_name, panel_frame) in zip(ax, panels):
    panel_frame.hist(column=labels_list[num], bins=30, ax=axis)
    # hist() titles every panel with the column name; add the content type so the
    # three panels can be told apart.
    axis.set_title(f'{panel_name}: {labels_list[num]}')
    axis.set_xlabel('Normalized match score')
ax[0].set_ylabel('Number of rows');


In [None]:
# Preview the last rows of the normalized frame.
df_norm.tail()

In [None]:
# From here on `df` is an independent copy of the normalized frame.
df = df_norm.copy()

# Prefix "(Alignment Sheet)" to Alignment Sheet titles that mention Figure or Table

In [None]:
# Prefix Alignment Sheet titles that mention a figure or a table. All spellings are
# matched in one pass, so a title containing several of them (e.g. "Figure" and "Table")
# is prefixed exactly once.
alignment_prefix = '(Alignment Sheet) '
is_alignment_sheet = df['Content Type'] == 'Alignment Sheet'
mentions_figure_or_table = df['Title'].str.contains('Figure|FIGURE|figure|Table|TABLE|table', na=False)
# Skip titles that already carry the prefix so the cell can be re-run safely.
already_prefixed = df['Title'].str.startswith(alignment_prefix, na=False)

needs_prefix = is_alignment_sheet & mentions_figure_or_table & ~already_prefixed
df.loc[needs_prefix, 'Title'] = alignment_prefix + df.loc[needs_prefix, 'Title']
df.tail()

# Load Alignment Sheet Titles for French Index

In [None]:
# Data_Files folder two levels above the working directory, kept as a string with a
# trailing separator because later cells append file names to it.
two_levels_up = Path().resolve().parents[1]
Data_Files_Path = f"{two_levels_up}" + '\\Data_Files\\'
print(Data_Files_Path)

In [None]:
# Load the corrected French alignment sheet titles.
new_align_csv = Data_Files_Path + "Intermediate_Index_Files\\new_alignment_sheet_titles_for_translation_FR_fixed.csv"
df_new_align = pd.read_csv(new_align_csv)

In [None]:
# Preview the last rows of the French alignment sheet title file.
df_new_align.tail()

# Add VCs to French Index

In [None]:
# Load the French index file.
# NOTE(review): `final_index_path` is not assigned in any visible cell of this notebook, so this
# line raises NameError on a fresh kernel — TODO define it before use (presumably the folder
# holding the final index files; confirm).
df_fra = pd.read_csv(final_index_path + 'ESA_website_FRA.csv')

df_fra.head(2)

In [None]:
# Copy the VC match columns (column positions 29 up to, not including, 51) from the
# English frame into the French frame.
# NOTE(review): purely positional — assumes both frames share row order and column layout; confirm.
vc_columns = slice(29, 51)
df_fra.iloc[:, vc_columns] = df.iloc[:, vc_columns]
df_fra.iloc[:, vc_columns].head()

In [None]:
# Internal IDs present in `df` that correspond to a corrected title (title-file ID plus '_a_1').
corrected_internal_ids = df_new_align["ID"] + '_a_1'
ids_to_update = df.loc[df["ID Internal"].isin(corrected_internal_ids), 'ID Internal'].tolist()

In [None]:
# Look titles up by internal ID. `ids_to_update` follows the row order of `df`, which need
# not match the row order of `df_new_align`, so picking titles by position could give a
# row the title of a different sheet.
titre_by_internal_id = dict(zip(df_new_align['ID'] + '_a_1', df_new_align['Titre']))

idx_list = []
for internal_id in ids_to_update:
    # Index label of the first French row carrying this internal ID.
    idx = df_fra[df_fra["ID Internal"] == internal_id].index[0]
    idx_list.append(idx)
    df_fra.loc[idx, 'Titre'] = titre_by_internal_id[internal_id]

# idx_list holds index labels, so the sample of updated rows is selected with .loc.
df_fra.loc[idx_list[20:25]]

# Update the Alignment Sheet French Titles

In [None]:
# Prefix the French alignment sheet titles that mention a figure or a table. The prefix is
# concatenated element-wise with each row's own title (wrapping the selection in str()
# would put the text dump of the whole column into every title), and all spellings are
# matched in one pass so a title is prefixed exactly once.
carte_prefix = '(Carte-tracé) '
is_carte_trace = df_fra['Type de contenu'] == 'Carte-tracé'
mentions_figure_or_tableau = df_fra['Titre'].str.contains('Figure|FIGURE|figure|Tableau|TABLEAU|tableau', na=False)
# Skip titles that already carry the prefix so the cell can be re-run safely.
already_prefixed = df_fra['Titre'].str.startswith(carte_prefix, na=False)

needs_prefix = is_carte_trace & mentions_figure_or_tableau & ~already_prefixed
df_fra.loc[needs_prefix, 'Titre'] = carte_prefix + df_fra.loc[needs_prefix, 'Titre']
df_fra.tail()

# Preview the French Index

In [None]:
# Preview the first rows of the French index after the title updates.
df_fra.head()

# Remove Duplicates

In [None]:
# 'ID Internal' values of the duplicate alignment sheet rows to drop from both index frames.
remove_duplicate_as = [
    '1059614_45_a_1',
    '2392795_110_a_1',
    '2393296_31_a_1',
    '3334565_29_a_1',
    '3334565_31_a_1',
    '3334565_43_a_1',
    '3334565_44_a_1',
    '3337530_34_a_1',
    '3337530_35_a_1',
    '3337530_6_a_1',
    '3340309_17_a_1',
    '3340309_21_a_1',
    '3341938_15_a_1',
    '3341938_25_a_1',
    '3341938_31_a_1',
    '3341938_32_a_1',
    '3341938_33_a_1',
    '3342531_19_a_1',
    '3342531_4_a_1',
    '3342531_45_a_1',
    '3891804_134_a_1',
    '464812_45_a_1',
    '464812_46_a_1',
    '464812_47_a_1',
    '464812_48_a_1',
]

In [None]:
# Drop the first row carrying each listed internal ID from the English and French frames.
# An ID with no remaining match is skipped, so re-running the cell does not raise.
for duplicate_id in remove_duplicate_as:
    for frame in (df, df_fra):
        matching_index = frame.index[frame['ID Internal'] == duplicate_id]
        if len(matching_index) > 0:
            frame.drop(matching_index[0], inplace=True)

In [None]:
# Look up one of the listed IDs in `df` to inspect what remains for it after the drop.
df[df["ID Internal"] == remove_duplicate_as[3]]

# Sanity Checks

Let's have a look at all the rows where there were no matches for any of the VCs. We want to make sure this is only because none of the text matched with any of the keywords in our list.

In [None]:
# After running this cell, open up an index file with the text content and search for the table title to see if it makes sense that such a row has 0 matches.
# The filter keeps rows whose columns at positions 29 up to (not including) 51 sum to 0.

df[df.iloc[:, 29:51].sum(axis=1) == 0].head()

# Save Labeled Index File

In [None]:
import time

# Timestamp used in the output file names.
timestamp_format = '%Y-%m-%d_%H-%M-%S'
current_time = time.strftime(timestamp_format)
print(current_time)

In [None]:
# Output folder for the final index files, as a string with a trailing separator.
output_root = Path().resolve().parents[1]
save_index_path = f"{output_root}" + '\\Output_Files\\final_index_files\\'

In [None]:
# Write the timestamped English and French index files.
eng_index_file = save_index_path + f'ESA_website_ENG_{current_time}.csv'
fra_index_file = save_index_path + f'ESA_website_FRA_{current_time}.csv'
df.to_csv(eng_index_file, encoding='utf-8-sig')
df_fra.to_csv(fra_index_file, encoding='utf-8-sig')