# **Vector Transforms**

In [3]:
import os
import random
import pickle
import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm
from scipy.sparse import csr_matrix
from datasets import load_dataset

# type
from typing import List, Tuple, Union, Mapping, Any, Callable, Iterable, Dict
from pathlib import Path

# model
from sentence_transformers import SentenceTransformer
from umap import UMAP

# find topic 
# from scipy.sparse import csr_matrix
# from sklearn.metrics.pairwise import cosine_similarity

# data_dir = "../../data_store"

class Generate_Vector:
    """Embed documents and project the embeddings to two dimensions.

    Typical use is ``input(docs)`` followed by ``run()``. ``run()`` encodes the
    documents with the embedding model, fits the dimensionality reducer on the
    resulting vectors, and returns both the models and a result DataFrame.

    Parameters
    ----------
    embedding_model:
        Object exposing ``encode(docs, show_progress_bar=..., batch_size=...)``.
        When ``None`` a ``SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')`` is
        created.
    reduce_dimension_model:
        Object exposing ``fit(vectors)`` and, afterwards, an ``embedding_``
        array with at least two columns. When ``None`` a ``UMAP`` is created
        whose ``random_state`` is drawn at random on every construction, so the
        default layout is not reproducible between instances; pass your own
        reducer if you need a fixed seed.
    batch_size:
        Batch size forwarded to ``embedding_model.encode``.
    verbose:
        Print a progress line in ``run()`` and enable the encoder progress bar.
        Also forwarded to the default ``UMAP``.
    save_path:
        Directory where ``run()`` writes ``data.parquet`` and ``model.pkl``.
        Nothing is written when ``None``.

    Attributes
    ----------
    data:
        The documents given to ``input()``. ``run()`` leaves this untouched so
        it can be called repeatedly.
    result:
        The DataFrame produced by the most recent ``run()`` (``None`` before).
    """

    # The model annotations are quoted so that defining the class does not
    # itself require the embedding / reduction packages to be resolvable.
    def __init__(self,
                 embedding_model: "SentenceTransformer" = None,
                 reduce_dimension_model: "UMAP" = None,
                 batch_size: int = 64,
                 verbose: bool = False,
                 save_path: Path = None):

        self.verbose = verbose

        # Compare against None explicitly: a caller-supplied model that happens
        # to be falsy (e.g. defines __len__ and is empty) must not be replaced.
        if embedding_model is None:
            embedding_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
        self.embedding_model = embedding_model

        if reduce_dimension_model is None:
            reduce_dimension_model = UMAP(random_state=random.randint(0, 2**32 - 1),
                                          verbose=self.verbose)
        self.reduce_dimension_model = reduce_dimension_model

        self.batch_size = batch_size
        self.save_path = save_path

        # Populated by input() / run(); initialised here so that run() can
        # report a clear error instead of an AttributeError.
        self.data = None
        self.result = None

    def input(self, data: List[str]):
        """Store the documents that the next ``run()`` call will process."""
        self.data = data

    def word_embedding(self, docs: List[Union[float, int, str]]) -> np.ndarray:
        """Encode ``docs`` with the embedding model and return its output."""
        return self.embedding_model.encode(docs,
                                           show_progress_bar=self.verbose,
                                           batch_size=self.batch_size)

    def reduce_dimension(self, vector: np.ndarray) -> "UMAP":
        """Fit the reducer on ``vector`` and return whatever ``fit`` returns.

        ``run()`` reads the projection from ``reduce_dimension_model.embedding_``
        afterwards rather than from this return value.
        """
        return self.reduce_dimension_model.fit(vector)

    def run(self) -> List[Union[Dict[str, Any], pd.DataFrame]]:
        """Embed the stored documents, reduce them, and build the result table.

        Returns
        -------
        list
            ``[model_log_dict, frame]`` where ``model_log_dict`` is
            ``{'sentence_transform': {'model': ..., 'vector': ...}, 'umap': ...}``
            and ``frame`` has the columns ``data_process``, ``vector``, ``x``,
            ``y`` and ``data_display``.

        Raises
        ------
        ValueError
            If ``input()`` has not been called or the documents are empty.
        """
        if self.data is None or len(self.data) == 0:
            raise ValueError(
                "No documents to process: call input() with a non-empty list before run()."
            )

        if self.verbose:
            print("embedding -> reduction dimension ...")

        # Work on a local copy; self.data keeps the raw documents so that run()
        # is repeatable on the same instance.
        docs = list(self.data)

        vector = self.word_embedding(docs)
        self.reduce_dimension(vector)
        embedding = self.reduce_dimension_model.embedding_

        frame = pd.DataFrame(docs, columns=['data_process'])
        frame['vector'] = list(vector)  # one embedding row per document
        frame['x'] = embedding[:, 0]
        frame['y'] = embedding[:, 1]
        frame['data_display'] = (
            "X: " + frame['x'].round(4).astype(str)
            + " Y: " + frame['y'].round(4).astype(str)
            + " Content: " + frame['data_process']
        )

        model_log_dict = {'sentence_transform': {'model': self.embedding_model, 'vector': vector},
                          'umap': self.reduce_dimension_model}

        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)
            frame.to_parquet(os.path.join(self.save_path, 'data.parquet'))

            # NOTE: the models are stored with pickle; only load model.pkl from
            # locations you trust.
            with open(os.path.join(self.save_path, 'model.pkl'), 'wb') as pickle_file:
                pickle.dump(model_log_dict, pickle_file)

        self.result = frame
        return [model_log_dict, frame]

In [7]:
# Load the 'ag_news' dataset and turn its 'train' split into a DataFrame.
dataset = load_dataset("ag_news")
train_split = dataset["train"]
df = pd.DataFrame(train_split)
df.shape

(120000, 2)

In [8]:
# Fixed seed so that Restart & Run All draws the same 3000-row subsample each time.
df = df.sample(3000, random_state=42)

In [9]:
df

Unnamed: 0,text,label
112135,Artest testifies about basketball brawl NBA In...,1
100867,Aussies says Indon #39;s delay in revealing em...,0
62282,Els and Westwood Build Halfway Mach Play Leads...,1
55506,Opposition Alleges Afghan Election Fraud KABUL...,0
94119,SBC to use Microsoft software SAN ANTONIO SBC ...,2
...,...,...
76837,Europe vote 'early 2006' - Straw Jack Straw sa...,0
22619,Microsoft gives businesses more time to test S...,3
90446,"Post-Arafat Gaza Sees Turmoil GAZA CITY, Gaza ...",0
74373,After the battle comes the war The three-way b...,2


In [11]:
# Pull the `text` column out as a plain Python list and preview the first five entries.
docs = df["text"].to_list()
docs[:5]

['Artest testifies about basketball brawl NBA Indiana Pacers #39; Ron Artest and three teammates testified about the basketball brawl. Ron Artest took the witness stand and described, in his words, one of the worst brawls ',
 'Aussies says Indon #39;s delay in revealing embassy bombing arrests &lt;b&gt;...&lt;/b&gt; Australia accepted that Indonesian police delayed for almost three weeks revealing they had captured four suspects in the Australian Embassy bombing in Jakarta in September because they hoped to catch more, Foreign Minister ',
 'Els and Westwood Build Halfway Mach Play Leads  VIRGINIA WATER, England (Reuters) - Defending champion  Ernie Els and Briton Lee Westwood built healthy leads at the  halfway point of the World Match Play semi-finals at Wentworth  on Saturday.',
 "Opposition Alleges Afghan Election Fraud KABUL, Afghanistan - Afghanistan's historic presidential election turned sour Saturday when all 15 candidates opposing U.S.-backed interim President Hamid Karzai wit

# **Use Default**

In [18]:
# No arguments: the class falls back to SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
# and a UMAP reducer seeded with a randomly drawn random_state.
generator = Generate_Vector()



In [19]:
# Only stores the documents on the generator; nothing is computed until run().
generator.input(docs)

In [20]:
# run() returns a two-element list: the model log dict and the result DataFrame.
model_log, vector_rs = generator.run()

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [22]:
model_log

{'sentence_transform': {'model': SentenceTransformer(
    (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
    (2): Normalize()
  ),
  'vector': array([[-4.76044742e-03,  1.18321680e-01,  1.75755173e-02, ...,
           9.74698216e-02,  2.80996971e-02,  6.73345327e-02],
         [ 2.57304888e-02,  3.67047451e-02, -1.92093488e-03, ...,
           2.99612321e-02, -6.71113729e-02,  3.17012668e-02],
         [-2.75205285e-03,  1.32744126e-02,  2.21810266e-02, ...,
          -5.71120307e-02,  1.82199068e-02,  5.58311269e-02],
         ...,
         [ 4.13802192e-02,  7.64816776e-02,  2.42398214e-02, ...,
          -4.19711173e-02, -1.55534402e-01, 

In [24]:
model_log.get('sentence_transform').get('model')

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [25]:
model_log.get('umap')

In [26]:
vector_rs

Unnamed: 0,data_process,vector,x,y,data_display
0,Artest testifies about basketball brawl NBA In...,"[-0.0047604474, 0.11832168, 0.017575517, -0.05...",1.958568,2.543490,X: 1.9586 Y: 2.5435 Content: Artest testifies ...
1,Aussies says Indon #39;s delay in revealing em...,"[0.025730489, 0.036704745, -0.0019209349, -0.0...",6.525950,6.741454,X: 6.526 Y: 6.7415 Content: Aussies says Indon...
2,Els and Westwood Build Halfway Mach Play Leads...,"[-0.0027520529, 0.013274413, 0.022181027, -0.0...",3.392436,2.246645,X: 3.3924 Y: 2.2466 Content: Els and Westwood ...
3,Opposition Alleges Afghan Election Fraud KABUL...,"[-0.09492922, 0.058308825, 0.09119129, -0.0235...",7.044231,8.153275,X: 7.0442 Y: 8.1533 Content: Opposition Allege...
4,SBC to use Microsoft software SAN ANTONIO SBC ...,"[0.014641171, -0.11993363, 0.04298304, -0.1013...",10.686739,3.550588,X: 10.6867 Y: 3.5506 Content: SBC to use Micro...
...,...,...,...,...,...
2995,Europe vote 'early 2006' - Straw Jack Straw sa...,"[-0.00013601137, -0.015127623, 0.0014578794, -...",8.405495,7.068664,X: 8.4055 Y: 7.0687 Content: Europe vote 'earl...
2996,Microsoft gives businesses more time to test S...,"[-0.033268206, 0.0007132999, 0.044689704, 0.00...",9.407422,2.914732,X: 9.4074 Y: 2.9147 Content: Microsoft gives b...
2997,"Post-Arafat Gaza Sees Turmoil GAZA CITY, Gaza ...","[0.04138022, 0.07648168, 0.024239821, -0.00292...",5.181494,8.945205,X: 5.1815 Y: 8.9452 Content: Post-Arafat Gaza ...
2998,After the battle comes the war The three-way b...,"[-0.080011584, -0.048400294, -0.00012885108, -...",10.392657,4.885274,X: 10.3927 Y: 4.8853 Content: After the battle...


# **Custom Parameters**

In [27]:
# Custom components to pass to Generate_Vector instead of its defaults.
# The embedding model is pinned to the CPU via device='cpu'.
embed = SentenceTransformer('all-mpnet-base-v2', device='cpu')
# NOTE(review): no random_state is passed here, so the 2-D layout is presumably
# not reproducible between runs — confirm whether that is intended.
reducer = UMAP(n_neighbors=32, min_dist=0.3)



In [31]:
# Custom embedder and reducer; verbose is a bool flag in Generate_Vector's signature,
# so pass True rather than 1.
generator = Generate_Vector(embedding_model=embed, reduce_dimension_model=reducer, batch_size=120, verbose=True)

In [32]:
# Hand the same documents to the custom-configured generator.
generator.input(docs)

In [33]:
# Rebinds model_log / vector_rs to the results produced by the custom models.
model_log, vector_rs = generator.run()

embedding -> reduction dimension ...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

In [34]:
vector_rs 

Unnamed: 0,data_process,vector,x,y,data_display
0,Artest testifies about basketball brawl NBA In...,"[-0.027241958, -0.002923223, 0.012313358, 0.03...",1.526669,4.267148,X: 1.5267 Y: 4.2671 Content: Artest testifies ...
1,Aussies says Indon #39;s delay in revealing em...,"[0.012355018, 0.059535187, -0.024927009, -0.00...",6.849057,1.502299,X: 6.8491 Y: 1.5023 Content: Aussies says Indo...
2,Els and Westwood Build Halfway Mach Play Leads...,"[-0.031126834, -0.021199824, -0.0050896113, 0....",1.057176,2.395051,X: 1.0572 Y: 2.3951 Content: Els and Westwood ...
3,Opposition Alleges Afghan Election Fraud KABUL...,"[0.021714766, 0.019451853, 0.018280506, 0.0062...",5.583340,3.026026,X: 5.5833 Y: 3.026 Content: Opposition Alleges...
4,SBC to use Microsoft software SAN ANTONIO SBC ...,"[-0.010478984, 0.012031325, 0.006473758, -0.03...",6.185642,8.021123,X: 6.1856 Y: 8.0211 Content: SBC to use Micros...
...,...,...,...,...,...
2995,Europe vote 'early 2006' - Straw Jack Straw sa...,"[-0.019955186, 0.021531342, 0.028552027, -0.01...",6.772058,3.891392,X: 6.7721 Y: 3.8914 Content: Europe vote 'earl...
2996,Microsoft gives businesses more time to test S...,"[-0.015543528, 0.05301379, -0.052917793, -0.04...",4.939428,8.209493,X: 4.9394 Y: 8.2095 Content: Microsoft gives b...
2997,"Post-Arafat Gaza Sees Turmoil GAZA CITY, Gaza ...","[-0.006034758, 0.0419839, 0.007277789, -0.0035...",5.349021,0.559426,X: 5.349 Y: 0.5594 Content: Post-Arafat Gaza S...
2998,After the battle comes the war The three-way b...,"[0.04033836, 0.04225431, 0.039968, -0.01221434...",6.791982,6.448570,X: 6.792 Y: 6.4486 Content: After the battle c...


In [37]:
# Inspect the parameters of the reducer stored under the 'umap' key of the model log.
model_log.get('umap').get_params()

{'a': None,
 'angular_rp_forest': False,
 'b': None,
 'dens_frac': 0.3,
 'dens_lambda': 2.0,
 'dens_var_shift': 0.1,
 'densmap': False,
 'disconnection_distance': None,
 'force_approximation_algorithm': False,
 'init': 'spectral',
 'learning_rate': 1.0,
 'local_connectivity': 1.0,
 'low_memory': True,
 'metric': 'euclidean',
 'metric_kwds': None,
 'min_dist': 0.3,
 'n_components': 2,
 'n_epochs': None,
 'n_jobs': -1,
 'n_neighbors': 32,
 'negative_sample_rate': 5,
 'output_dens': False,
 'output_metric': 'euclidean',
 'output_metric_kwds': None,
 'precomputed_knn': (None, None, None),
 'random_state': None,
 'repulsion_strength': 1.0,
 'set_op_mix_ratio': 1.0,
 'spread': 1.0,
 'target_metric': 'categorical',
 'target_metric_kwds': None,
 'target_n_neighbors': -1,
 'target_weight': 0.5,
 'tqdm_kwds': {'desc': 'Epochs completed',
  'bar_format': '{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]',
  'disable': True},
 'transform_mode': 'embedding',
 'transform_queue_size'

# **Import**

In [6]:
from model.generate_vector import Generate_Vector