In [23]:
import requests
import mapbox_vector_tile
import geopandas as gpd
from tqdm import tqdm

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import numpy as np
from scipy.stats import rankdata, norm
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt

from sentence_transformers import SentenceTransformer

import nrcan_p2.data_processing.preprocessing_dfcol as preprocessing_dfcol
import nrcan_p2.data_processing.preprocessing_str as preprocessing_str
import nrcan_p2.data_processing.preprocessing_df_filter as preprocessing_df_filter
from deposit_models import systems_dict

In [24]:
def get_tile_json(x, y, zoom=4, timeout=30):
    """Download one vector tile from the Macrostrat ``carto`` tile server and decode it.

    Parameters
    ----------
    x, y :
        Tile coordinates substituted into the ``{x}`` / ``{y}`` slots of the URL.
    zoom : default 4
        Zoom level substituted into the ``{z}`` slot of the URL.
    timeout : default 30
        Seconds passed to ``requests.get`` so that a stalled server cannot
        block the notebook indefinitely.

    Returns
    -------
    The object returned by ``mapbox_vector_tile.decode`` for the response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    url_tmpl = 'https://tileserver.staging.svc.macrostrat.org/carto/{z}/{x}/{y}'
    response = requests.get(url_tmpl.format(x=x, y=y, z=zoom), timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error page to the decoder.
    response.raise_for_status()
    return mapbox_vector_tile.decode(response.content)

In [25]:
def dfcol_sep_hyphen(dfcol):
    """Return ``dfcol`` with every ``'-'`` replaced by ``' - '``.

    The replacement goes through the ``.str`` accessor, so ``dfcol`` must be a
    string-valued pandas column; the input itself is not modified.
    """
    padded = dfcol.str.replace('-', ' - ')
    return padded

In [26]:
def preproc(data_df, cols, desc_col='full_desc', filtering=False):
    """Join several text columns into one description column and optionally clean it.

    Works on a copy of ``data_df``; the input frame is left untouched.

    Parameters
    ----------
    data_df : DataFrame
        Source table.
    cols : list
        Column labels whose non-null values are joined, in this order, with a
        single space between them.
    desc_col : str, default 'full_desc'
        Name of the column that receives the joined text.
    filtering : bool, default False
        When True, run the ``nrcan_p2`` cleaning pipeline on ``desc_col`` and
        then punctuation removal, lower-casing and newline removal per string.
        Steps from ``preprocessing_df_filter`` receive and return the whole
        frame (presumably dropping rows -- confirm against that module).

    Returns
    -------
    DataFrame
        The processed copy with a fresh ``0..n-1`` index. The bookkeeping
        columns listed in the final ``drop`` call are removed when present.
    """
    data_ = data_df.copy()

    # Join row by row, positionally. Compared with stack()/groupby(level=0) this
    # keeps rows without any text as '' (instead of NaN, which crashed the
    # hyphen replacement below), tolerates non-string cells and does not depend
    # on the index labels being unique.
    joined = [
        ' '.join(str(value) for value in row if pd.notna(value))
        for row in data_[cols].itertuples(index=False, name=None)
    ]
    # Every '-' is padded with spaces, including hyphens that already have
    # spaces around them.
    data_[desc_col] = (
        pd.Series(joined, index=data_.index, dtype=object)
        .str.replace('-', ' - ', regex=False)
    )

    if filtering:
        pipeline = [
            # Hyphens were already padded above, so this step pads them again.
            dfcol_sep_hyphen,
            preprocessing_dfcol.rm_dbl_space,
            preprocessing_dfcol.rm_cid,
            preprocessing_dfcol.convert_to_ascii,
            preprocessing_dfcol.rm_nonprintable,
            preprocessing_df_filter.filter_no_letter,
            preprocessing_dfcol.rm_newline_hyphenation,
            preprocessing_dfcol.rm_newline,
            preprocessing_df_filter.filter_no_real_words_g3letter,
            # Disabled steps, kept for reference:
            # preprocessing_df_filter.filter_l80_real_words,
            # preprocessing_dfcol.tokenize_spacy_lg,
            # preprocessing_dfcol.rm_stopwords_spacy,
        ]

        # Progress is reported 1-based so the last message reads "step N/N".
        for i, pipe_step in enumerate(pipeline, start=1):
            # Filter steps take (frame, column name) and return a frame; every
            # other step maps the description column to a new column.
            if pipe_step.__module__.split('.')[-1] == 'preprocessing_df_filter':
                data_ = pipe_step(data_, desc_col)
            else:
                data_[desc_col] = pipe_step(data_[desc_col])
            print(f'step {i}/{len(pipeline)} finished')

        # String-level clean-up, applied to each description individually.
        post_processing = [
            preprocessing_str.rm_punct,
            preprocessing_str.lower,
            preprocessing_str.rm_newline
        ]

        for i, pipe_step in enumerate(post_processing, start=1):
            data_[desc_col] = data_[desc_col].apply(pipe_step)
            print(f'step {i}/{len(post_processing)} finished')

    # Remove bookkeeping columns (presumably added by the filter steps -- TODO
    # confirm) and 'Shape_Area'; errors='ignore' makes absent columns harmless.
    data_ = data_.drop(columns=['letter_count', 'is_enchant_word', 'word_char_num', 'is_enchant_word_and_g3l', 'any_enchant_word_and_g3l', 'real_words', 'real_words_n', 'real_words_perc', 'n_words', 'Shape_Area'], errors='ignore')
    data_ = data_.reset_index(drop=True)
    return data_

In [27]:
def convert_text_to_vector_hf(data, model, batch_size=64):
    """Encode ``data`` in consecutive slices and stack the results.

    ``data`` is cut into slices of at most ``batch_size`` items, each slice is
    passed to ``model.encode`` (with a ``tqdm`` progress bar over the slices),
    and the per-slice outputs are concatenated along axis 0.
    """
    batch_starts = range(0, len(data), batch_size)
    encoded_batches = [
        model.encode(data[start:start + batch_size])
        for start in tqdm(batch_starts)
    ]
    return np.concatenate(encoded_batches, axis=0)

In [38]:
def normalize(array):
    """Min-max rescale ``array`` so its minimum maps to 0 and its maximum to ~1.

    ``1e-12`` is added to the value range so that a constant array divides by
    a tiny number instead of zero (and therefore maps to all zeros).
    """
    lowest = array.min()
    span = array.max() - lowest + 1e-12
    return (array - lowest) * (1 / span)

In [39]:
def rank_polygon(descriptive_model, embed_model, data_, desc_col='full_desc', norm=True):
    """Score every row of ``data_`` against each entry of a descriptive model.

    Each value of ``descriptive_model`` is embedded with ``embed_model`` and
    compared (cosine similarity) with the embeddings of ``data_[desc_col]``.
    If the model has an ``'age_range'`` entry and ``data_`` has an ``'age'``
    column, that entry is re-scored against the embeddings of ``data_['age']``.

    Parameters
    ----------
    descriptive_model : mapping
        Maps a criterion name to its query text.
    embed_model :
        Object handed to ``convert_text_to_vector_hf`` for encoding.
    data_ : DataFrame
        Table to score. It is modified in place: one ``'bge_<key>'`` column per
        score plus ``'bge_all'`` (the unweighted mean of all scores) are added.
    desc_col : str, default 'full_desc'
        Column holding the text to compare against.
    norm : bool, default True
        If True, each per-criterion score vector is passed through ``normalize``.

    Returns
    -------
    (DataFrame, dict)
        The same ``data_`` object and the dict of per-criterion score arrays.

    Raises
    ------
    ZeroDivisionError
        If ``descriptive_model`` is empty (there is nothing to average).
    """
    polygon_vectors = convert_text_to_vector_hf(data_[desc_col].to_list(), embed_model)

    query_vec = {}
    cos_sim = {}
    for key in descriptive_model:
        query_vec[key] = convert_text_to_vector_hf([descriptive_model[key]], embed_model)
        scores = cosine_similarity(query_vec[key], polygon_vectors)[0]
        cos_sim[key] = normalize(scores) if norm else scores

    # The age score is best-effort. The 'age' column is only embedded when it
    # is actually needed, and inside the guard, so bad age values skip this one
    # score instead of aborting the whole ranking. On failure the score computed
    # against desc_col in the loop above is kept.
    if 'age_range' in query_vec and 'age' in data_.columns:
        try:
            age_vectors = convert_text_to_vector_hf(data_['age'].to_list(), embed_model)
            # NOTE(review): unlike the other scores this one is not passed
            # through normalize() even when norm=True; behavior kept as it was --
            # confirm whether that is intended.
            cos_sim['age_range'] = cosine_similarity(query_vec['age_range'], age_vectors)[0]
        except Exception as e:
            print(f"Failed to compute age range score ({e!r}). Skipping ...")
    else:
        print("Failed to compute age range score. Skipping ...")

    bge_all = 0
    for key, scores in cos_sim.items():
        bge_all += scores
        # Assign the array positionally. Wrapping it in a new pd.Series would
        # align on a fresh 0..n-1 index and give NaN/misplaced scores whenever
        # data_ does not carry exactly that index.
        data_['bge_' + key] = scores

    bge_all /= len(cos_sim)
    data_['bge_all'] = bge_all
    return data_, cos_sim

In [30]:
# Fetch tile x=0, y=2 at zoom 3 and load the features stored under its
# 'units' entry into a GeoDataFrame.
tile = get_tile_json(0, 2, zoom=3)
tile_df = gpd.GeoDataFrame.from_features(tile['units']['features'])

In [31]:
# Inspect the t_int / b_int columns of the tile.
# NOTE(review): 'b_int_id' is selected twice, which is why the output shows a
# duplicated column; 't_int_id' may have been intended for one of them -- confirm.
tile_df[['t_int', 'b_int_id', 'b_int', 'b_int_id']]

Unnamed: 0,t_int,b_int_id,b_int,b_int_id.1
0,Quaternary,421.0,Quaternary,421.0
1,Quaternary,421.0,Quaternary,421.0
2,Late Cretaceous,34.0,Late Cretaceous,34.0
3,Paleogene,20.0,Paleogene,20.0
4,Paleogene,20.0,Paleogene,20.0
...,...,...,...,...
4508,Late Cretaceous,34.0,Late Cretaceous,34.0
4509,Proterozoic,131.0,Proterozoic,131.0
4510,Late Cretaceous,34.0,Late Cretaceous,34.0
4511,Proterozoic,131.0,Proterozoic,131.0


In [32]:
# List the columns available on the decoded tile.
tile_df.columns

Index(['geometry', 'map_id', 'source_id', 'legend_id', 'best_age_top',
       'best_age_bottom', 'color', 'name', 'age', 'lith', 'descrip',
       'comments', 't_int_id', 't_int', 'b_int_id', 'b_int', 'ref_url',
       'ref_name', 'ref_title', 'ref_authors', 'ref_source', 'ref_year',
       'ref_isbn'],
      dtype='object')

In [33]:
# Text columns that are joined into the 'full_desc' column used for embedding.
cols = ['name', 'age', 'lith', 'descrip', 'comments']
# filtering is left at its default (False): only the join and hyphen padding run.
data_ = preproc(tile_df, cols)

In [34]:
# Preview the joined description text produced by preproc.
data_['full_desc']

0       Sedimentary; Sedimentary: undivided Quaternary...
1       Sedimentary; Sedimentary: undivided Quaternary...
2       Sedimentary; Slope and deep water Late Cretace...
3       Paleogene volcanic: interlayered sedimentary a...
4       Paleogene volcanic: interlayered sedimentary a...
                              ...                        
4508    Sedimentary; Clastic: deltaic and nearshore La...
4509    Sedimentary; Clastic: deltaic and nearshore Pr...
4510    Sedimentary; Clastic: continental Late Cretace...
4511    Sedimentary; Carbonate Proterozoic (2500  -  5...
4512    Sedimentary; Sedimentary: undivided Proterozoi...
Name: full_desc, Length: 4513, dtype: object

In [35]:
# Load the sentence-embedding model used for both descriptions and queries.
hf_model = 'BAAI/bge-base-en-v1.5'
embed_model = SentenceTransformer(hf_model)

In [36]:
# Show which deposit models are defined in deposit_models.systems_dict.
print(systems_dict.keys())

dict_keys(['porphyry_copper', 'tungsten_skarn', 'mvt_zinc_lead', 'cu_ni_pge_duluth', 'cu_ni_pge_norilsk', 'hectorite_li'])


In [40]:
# Deposit model to rank the tile polygons against (a key of systems_dict).
deposit_type = 'porphyry_copper'

# rank_polygon adds the 'bge_*' score columns to data_ in place and returns that same frame.
gpd_data, cos_sim = rank_polygon(systems_dict[deposit_type], embed_model, data_)


100%|██████████| 71/71 [00:07<00:00,  9.88it/s]
100%|██████████| 71/71 [00:02<00:00, 32.92it/s]
100%|██████████| 1/1 [00:00<00:00, 58.26it/s]
100%|██████████| 1/1 [00:00<00:00, 66.74it/s]
100%|██████████| 1/1 [00:00<00:00, 92.62it/s]
100%|██████████| 1/1 [00:00<00:00, 84.88it/s]
100%|██████████| 1/1 [00:00<00:00, 90.81it/s]
100%|██████████| 1/1 [00:00<00:00, 91.48it/s]
100%|██████████| 1/1 [00:00<00:00, 87.58it/s]
100%|██████████| 1/1 [00:00<00:00, 59.66it/s]
100%|██████████| 1/1 [00:00<00:00, 86.21it/s]
100%|██████████| 1/1 [00:00<00:00, 83.46it/s]
100%|██████████| 1/1 [00:00<00:00, 64.90it/s]
100%|██████████| 1/1 [00:00<00:00, 66.01it/s]
100%|██████████| 1/1 [00:00<00:00, 54.73it/s]
100%|██████████| 1/1 [00:00<00:00, 62.70it/s]


In [41]:
# Display the scored table, including the per-criterion 'bge_*' columns and 'bge_all'.
gpd_data

Unnamed: 0,geometry,map_id,source_id,legend_id,best_age_top,best_age_bottom,color,name,age,lith,...,bge_textures,bge_age_range,bge_depositional_environment,bge_tectonic_setting,bge_ore_mineralogy,bge_alteration,bge_ore_controls,bge_geochemical_signatures,bge_geophysical_signature,bge_all
0,"MULTIPOLYGON (((1406.000 2854.000, 1406.000 28...",654694,2,100035,0.00,2.588,#d6e165,Sedimentary; Sedimentary: undivided,Quaternary (2.6 - 0.0 Ma),sedimentary and/or volcanic rock: undivided;,...,0.753618,0.643209,0.624588,0.621328,0.560821,0.658747,0.478416,0.448044,0.703948,0.551441
1,"POLYGON ((1378.000 2932.000, 1378.000 2928.000...",654694,2,100035,0.00,2.588,#d6e165,Sedimentary; Sedimentary: undivided,Quaternary (2.6 - 0.0 Ma),sedimentary and/or volcanic rock: undivided;,...,0.753618,0.643209,0.624588,0.621328,0.560821,0.658747,0.478416,0.448044,0.703948,0.551441
2,"POLYGON ((2158.000 3140.000, 2158.000 3126.000...",655113,2,100251,66.00,100.500,#7ee000,Sedimentary; Slope and deep water,Late Cretaceous (99.6 - 65.5 Ma),"shale, chert, iron-formation, greywacke, turbi...",...,0.543349,0.649512,0.589231,0.539594,0.784240,0.639516,0.521638,0.671755,0.595073,0.564869
3,"MULTIPOLYGON (((1314.000 1812.000, 1312.000 18...",939667,7,13006,23.03,66.000,#dc833b,Paleogene volcanic: interlayered sedimentary a...,Paleogene,volcanic: interlayered sedimentary and volcani...,...,0.731115,0.642116,0.648343,0.736423,0.820914,0.667382,0.625539,0.530739,0.837179,0.666751
4,"MULTIPOLYGON (((1358.000 1786.000, 1368.000 17...",947562,7,13006,23.03,66.000,#dc833b,Paleogene volcanic: interlayered sedimentary a...,Paleogene,volcanic: interlayered sedimentary and volcani...,...,0.731115,0.642116,0.648343,0.736423,0.820914,0.667382,0.625539,0.530739,0.837179,0.666751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4508,"MULTIPOLYGON (((1358.000 2894.000, 1366.000 28...",658670,2,98871,66.00,100.500,#7ee000,Sedimentary; Clastic: deltaic and nearshore,Late Cretaceous (99.6 - 65.5 Ma),"sandstone, siltstone, shale, coal; plant fossils",...,0.553842,0.649512,0.453560,0.512160,0.635377,0.453255,0.376303,0.585210,0.402167,0.466972
4509,"POLYGON ((3494.000 4082.000, 3510.000 4076.000...",658675,2,99028,538.80,2500.000,#ed5da9,Sedimentary; Clastic: deltaic and nearshore,Proterozoic (2500 - 542 Ma),quartzite and quartz-rich sandstone,...,0.763232,0.624936,0.572100,0.771066,0.783194,0.717245,0.556257,0.757724,0.665324,0.610996
4510,"POLYGON ((1832.000 3192.000, 1842.000 3190.000...",658682,2,98599,66.00,100.500,#7ee000,Sedimentary; Clastic: continental,Late Cretaceous (99.6 - 65.5 Ma),"redbeds, quartz arenite, arkose, conglomerate,...",...,0.717796,0.649512,0.573336,0.667514,0.665988,0.515349,0.445359,0.736716,0.516120,0.553868
4511,"POLYGON ((4012.000 3808.000, 4038.000 3800.000...",658728,2,98448,538.80,2500.000,#f665b1,Sedimentary; Carbonate,Proterozoic (2500 - 542 Ma),"dolostone, silty dolostone, chert, stromatolitic",...,0.694466,0.624936,0.607259,0.749113,0.698486,0.707487,0.716163,0.581839,0.535951,0.563029
