<a href="https://colab.research.google.com/github/ELehmann91/NLP/blob/master/SDG_SetFit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip3 install sentence-transformers
!pip install huggingface_hub
!pip3 install datasets
!pip install spacy
!pip install umap-learn hdbscan
!pip install --upgrade plotly
!pip install jupyter-dash
!pip install pyyaml==5.4.1

In [None]:
! apt install git-lfs
!git config --global credential.helper store

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 12 not upgraded.


## Imports

In [None]:
import math
import pandas as pd
import numpy as np

from datasets import Dataset

from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier

from sentence_transformers.datasets import SentenceLabelDataset
from sentence_transformers import SentenceTransformer, InputExample, losses, models
from sentence_transformers.losses.BatchHardTripletLoss import \
    BatchHardTripletLossDistanceFunction



## Parameters

In [None]:
# --- Training / evaluation configuration ------------------------------------
# NOTE(review): fine_tuning() assigns its own local `batch_size = 16`, so the
# value set here is not the one that function reads.
batch_size = 16
# Multiplies the number of training steps (triplet loss) or the number of
# pair-generation rounds (cosine loss) inside fine_tuning().
num_epochs = 20
# Fraction of the filtered dataset passed to train_test_split as the test set.
test_size = 0.15


# Key into `loss_classes` below; also selects the branch taken in fine_tuning().
loss_class_name = "BatchHardTripletLoss" # "BatchHardTripletLoss" or "CosineSimilarityLoss"
#model_name = "T-Systems-onsite/german-roberta-sentence-transformer-v2"
# Hub id handed to SentenceTransformer(...) when the model is loaded.
model_name = 'sentence-transformers/all-MiniLM-L6-v2' #'sentence-transformers/paraphrase-xlm-r-multilingual-v1' # see https://www.sbert.net/docs/pretrained_models.html
# NOTE: reassigned to 'lr' in the "Fitting classifier" section before the
# classifier object is created.
classifier_name = "knn"


# Maps a loss name to the sentence-transformers loss class (None = no loss).
loss_classes = {"None": None,
    "CosineSimilarityLoss": losses.CosineSimilarityLoss,
    "BatchHardTripletLoss": losses.BatchHardTripletLoss,
}


In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load data

In [None]:
#from google.colab import drive
#drive.mount('/content/drive/')

In [None]:
#datafile = "/content/drive/MyDrive/data/escalations-sample-SCRUBBED 20220308.csv"

In [None]:
import pandas as pd
df_osdg = pd.read_csv('https://zenodo.org/record/5550238/files/osdg-community-dataset-v21-09-30.csv',sep='\t')

In [None]:
print(df_osdg.shape)
df_osdg[:8]

(32120, 7)


Unnamed: 0,doi,text_id,text,sdg,labels_negative,labels_positive,agreement
0,10.6027/9789289342698-7-en,00021941702cd84171ff33962197ca1f,"From a gender perspective, Paulgaard points ou...",5,1,7,0.75
1,10.18356/eca72908-en,00028349a7f9b2485ff344ae44ccfd6b,Labour legislation regulates maximum working h...,11,2,1,0.333333
2,10.1787/9789264289062-4-en,0004eb64f96e1620cd852603d9cbe4d4,The average figure also masks large difference...,3,1,6,0.714286
3,10.1787/5k9b7bn5qzvd-en,0006a887475ccfa5a7f5f51d4ac83d02,The extent to which they are akin to corruptio...,3,1,2,0.333333
4,10.1787/9789264258211-6-en,0006d6e7593776abbdf4a6f985ea6d95,A region reporting a higher rate will not earn...,3,2,2,0.0
5,10.1787/5js4xfgl4ks0-en,000b54717f2deea5d99055b4c1c2bf5a,These findings are consistent with previous wo...,10,2,5,0.428571
6,10.1787/9789264285712-7-en,000bc99895142f9b6795ddf402e16e9a,"Each section states the economic principle, re...",6,1,3,0.5
7,10.1787/9789264117563-8-en,000bfb17e9f3a00d4515ab59c5c487e7,The Israel Oceanographic and Limnological Rese...,6,0,3,1.0


In [None]:
docs = list(df_osdg['text'])

In [None]:
import re
def pre_process(text):
    """Normalise a raw document before it is handed to the count vectoriser.

    Steps, in order: lowercase the text, drop HTML/XML-style tags, then
    collapse every run of digits and non-word characters into one space.

    Args:
        text: The raw document string.

    Returns:
        The cleaned, lowercased string. Leading/trailing separator runs are
        kept as a single space (the result is not stripped).
    """
    # lowercase
    text = text.lower()

    # remove tags. The pattern used to be an empty string, which made this
    # substitution a no-op and let tag names ("b", "p", "class", ...) leak
    # into the vocabulary. A letter is required right after "<" so that
    # comparisons such as "x < 5 and y > 3" are not mistaken for markup.
    text = re.sub(r"</?[a-z][^<>]*>", " ", text)

    # remove special characters and digits
    text = re.sub(r"(\d|\W)+", " ", text)

    return text

docs = list(df_osdg['text'].apply(lambda x:pre_process(x)))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

cv=CountVectorizer(max_df=0.85,max_features=10000,stop_words='english')
word_count_vector=cv.fit_transform(docs)

In [None]:
list(cv.vocabulary_.keys())[:10]

['gender',
 'perspective',
 'points',
 'labour',
 'markets',
 'fishing',
 'villages',
 'highly',
 'segregated',
 'terms']

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [None]:
# Build the index -> vocabulary-term lookup once; it is reused for every
# document. `get_feature_names` was deprecated in scikit-learn 1.0 and removed
# in 1.2, so prefer `get_feature_names_out` and fall back only on old installs.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

def sort_coo(coo_matrix):
    """Return the (column, value) pairs of a COO matrix, highest value first.

    Entries with equal values are ordered by descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
  
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the ``topn`` leading terms to their rounded tf-idf scores.

    Args:
        feature_names: Sequence giving the term for each vocabulary index.
        sorted_items: ``(index, score)`` pairs, already ordered best-first.
        topn: Number of leading pairs to keep.

    Returns:
        dict of term -> score rounded to three decimals, in the order the
        pairs appear in ``sorted_items``.
    """
    return {
        feature_names[position]: round(weight, 3)
        for position, weight in sorted_items[:topn]
    }



In [None]:
def keywords(doc, topn=5):
    """Return the ``topn`` highest tf-idf terms of ``doc`` as one string.

    Uses the notebook-level ``cv`` (fitted CountVectorizer),
    ``tfidf_transformer`` (fitted TfidfTransformer) and ``feature_names``.

    Args:
        doc: A single document string.
        topn: Number of terms to keep. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        The selected terms joined by single spaces, best-scoring first.
    """
    tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
    # sort the tf-idf entries by descending score
    sorted_items = sort_coo(tf_idf_vector.tocoo())
    # keep only the top-n terms; the local is named `top_terms` so it no
    # longer shadows this function's own name
    top_terms = extract_topn_from_vector(feature_names, sorted_items, topn)

    return ' '.join(top_terms)


In [None]:
keywords(docs[0])

'jobs peripheral continuity young villages'

In [None]:
df_osdg['keys'] = df_osdg['text'].apply(lambda x: keywords(x))

In [None]:
df_osdg = df_osdg[df_osdg['agreement']>.95]
df_osdg = df_osdg[df_osdg['labels_positive']>1]
print(df_osdg.shape)

(13645, 8)


In [None]:
df_osdg['sdg'].value_counts()

5     1819
4     1799
3     1551
7     1186
6     1119
11    1026
1      868
13     803
8      746
2      620
14     578
9      562
15     446
10     348
12     174
Name: sdg, dtype: int64

In [None]:
_lab_dict = {0: 'no_cat',
            1:'SDG 1 - No poverty',
              2:'SDG 2 - Zero hunger',
              3:'SDG 3 - Good health and well-being',
              4:'SDG 4 - Quality education',
              5:'SDG 5 - Gender equality',
              6:'SDG 6 - Clean water and sanitation',
              7:'SDG 7 - Affordable and clean energy',
              8:'SDG 8 - Decent work and economic growth', 
             9:'SDG 9 - Industry, Innovation and Infrastructure',
              10:'SDG 10 - Reduced inequality',
             11:'SDG 11 - Sustainable cities and communities',
             12:'SDG 12 - Responsible consumption and production',
             13:'SDG 13 - Climate action',
             14:'SDG 14 - Life below water',
             15:'SDG 15 - Life on land',
             16:'SDG 16 - Peace, justice and strong institutions',
             17:'SDG 17 - Partnership for the goals',}


## Embeddings before fine-tuning

In [None]:
 import plotly.graph_objects as go
 import plotly.express as px
 from jupyter_dash import JupyterDash
 # The standalone dash_core_components / dash_html_components packages are
 # deprecated; since Dash 2.0 both modules ship inside `dash` itself.
 from dash import dcc, html
 from dash.dependencies import Input, Output 
 from sklearn.manifold import TSNE

The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  after removing the cwd from sys.path.
The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  """


In [None]:
model = SentenceTransformer(model_name)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:

labels = [_lab_dict[lab] for lab in df_osdg['sdg'] ]
keys = list(df_osdg['keys'])
docs = list(df_osdg['text'])

docs_embeddings = model.encode(docs)

print(len(labels),len(keys),len(docs),docs_embeddings.shape)

13645 13645 13645 (13645, 384)


In [None]:
import umap.umap_ as umap
n_neighbors = 15
n_components = 3
random_state =42
umap_model = (umap.UMAP(n_neighbors=n_neighbors, 
                            n_components=n_components, 
                            metric='cosine', 
                            random_state=random_state)
                        .fit(docs_embeddings))

In [None]:
docs_umap = umap_model.transform(docs_embeddings)
docs_umap.shape

(13645, 3)

In [None]:
len(keys)

13645

In [None]:
docs_umap[:,0]

array([0.8468369, 4.2885537, 8.424404 , ..., 1.8937416, 9.090541 ,
       1.278684 ], dtype=float32)

In [None]:
import pandas as pd

# One row per document: text, SDG label, tf-idf keywords and the three UMAP
# coordinates. Passing a list of lists to pd.DataFrame treats each inner list
# as a ROW (giving a 6 x N frame), and the extra [...] around every coordinate
# array left most cells as None - so build the frame column-wise instead.
df_umap = pd.DataFrame({
    'text': docs,
    'label': labels,
    'keys': keys,
    'umap_x': docs_umap[:, 0],
    'umap_y': docs_umap[:, 1],
    'umap_z': docs_umap[:, 2],
})

# Preview only the first rows instead of dumping the whole frame.
print(df_umap.head())

                                               0      \
0  The Israel Oceanographic and Limnological Rese...   
1                 SDG 6 - Clean water and sanitation   
2      monitors quality water ministry oceanographic   
3  [0.8468369, 4.2885537, 8.424404, 9.124332, 9.3...   
4  [6.8398666, 5.341827, 10.101653, 4.9962387, 2....   
5  [2.6035333, 4.451134, 3.4887116, 5.376121, 2.2...   

                                               1      \
0  Previous chapters have discussed ways to make ...   
1                                SDG 2 - Zero hunger   
2                eat consumers food choose nutrition   
3                                               None   
4                                               None   
5                                               None   

                                               2      \
0  Prescription rates appear to be higher where l...   
1            SDG 8 - Decent work and economic growth   
2         disability opioid prescription males

In [None]:
df_umap.to_csv('sdg_umap',sep="|")

In [None]:
fig = px.scatter_3d(
    docs_umap, x=0, y=1, z=2,
    color=labels,
    opacity = .5,    hover_data=[keys])
fig.update_scenes(xaxis_visible=False, yaxis_visible=False,zaxis_visible=False )
fig.update_traces(marker_size=4)
fig.show()

In [None]:
#fig.write_html("embedding_sdg.html")

In [None]:
exa1 = 'A Feminist Foreign Policy (FFP) is a political framework centred around the wellbeing of marginalised people and invokes processes of self-reflection regarding foreign policy’s hierarchical global systems.'
exa2 = 'A Feminist Foreign Policy (FFP) is a political framework centred around the wellbeing of marginalised people and invokes processes of self-reflection regarding foreign policy’s hierarchical global systems. FFP takes a step outside the black box approach of traditional foreign policy thinking and its focus on military force, violence, and domination by offering an alternate and intersectional rethinking of security from the viewpoint of the most vulnerable. It is a multidimensional policy framework that aims to elevate women’s and marginalised groups’ experiences and agency to scrutinise the destructive forces of patriarchy, colonisation, heteronormativity, capitalism, racism, imperialism, and militarism. CFFP believes a feminist approach to foreign policy provides a powerful lens through which we can interrogate the violent global systems of power that leave millions of people in perpetual states of vulnerability.'

In [None]:
# Short probe phrases (plus two long passages appended below) used to see
# where arbitrary text lands in the UMAP space fitted on the OSDG documents.
# Spelling fixed ("sustaiable" -> "sustainable"): these strings are embedded
# by the model, so a typo changes the vector that ends up in the plot.
examples = ['woman', 'man', 'dog', 'cat', 'car', 'bus', 'train', 'public transport',
            'sustainable agriculture', 'sustainability', 'sustainable cities',
            'Feminist foreign policy']
# One colour group per entry of `examples` AFTER exa1 / exa2 are appended
# (the last two 'ffp' entries belong to those two passages).
color = ['woman', 'man', 'pets', 'pets',
         'transport', 'transport', 'transport', 'transport',
         'sustainability', 'sustainability', 'sustainability',
         'ffp', 'ffp', 'ffp']
examples_sen = ['the movie is about a ' + e for e in examples]
examples.append(exa1)
examples.append(exa2)

# The long passages are embedded as-is, without the "the movie is about a" prefix.
examples_sen.append(exa1)
examples_sen.append(exa2)

# Guard against the parallel lists drifting apart when entries are edited.
assert len(examples) == len(examples_sen) == len(color)

examples_embeddings = model.encode(examples_sen)

In [None]:
print(len(examples_sen),len(color))

15 15


In [None]:
examples_umap = umap_model.transform(examples_embeddings)

In [None]:
fig = px.scatter_3d(
    examples_umap, x=0, y=1, z=2,
    color=color,
    #hovertemplate='<b>%{text}</b><extra></extra>',
    hover_data= [examples],
    opacity = .9)
fig.update_scenes(xaxis_visible=False, yaxis_visible=False,zaxis_visible=False )
fig.update_traces(marker_size=6)
fig.show()

In [None]:
fig.write_html("embedding_example.html")

In [None]:
import plotly.graph_objects as go
import numpy as np
import plotly.offline as pyo
import pandas as pd

# `marker.color` accepts CSS colours or numbers only; the category names held
# in `color` ('woman', 'pets', ...) made go.Scatter3d raise a ValueError.
# Encode each category as an integer code and let the colourscale map the
# codes to actual colours.
color_codes, _ = pd.factorize(np.asarray(color))

fig = go.Figure(data=[go.Scatter3d(
    x=examples_umap[:, 0], y=examples_umap[:, 1], z=examples_umap[:, 2],
    hovertemplate='<b>%{text}</b><extra></extra>',
    text=list(examples),
    mode='markers',
    marker=dict(
        size=8,
        opacity=0.8,
        color=color_codes,
        colorscale='Viridis',
    )
)])


fig.update_scenes(xaxis_visible=False, yaxis_visible=False, zaxis_visible=False)
# Final marker size used in the plot (overrides the size given above).
fig.update_traces(marker_size=6)
fig.show()

ValueError: ignored

In [None]:
fig.write_html("embedding_example.html")

In [None]:
import umap.umap_ as umap
n_neighbors = 15
n_components = 2
random_state =42
umap_model = (umap.UMAP(n_neighbors=n_neighbors, 
                            n_components=n_components, 
                            metric='cosine', 
                            random_state=random_state)
                        .fit(docs_embeddings))

In [None]:
docs_umap2d = umap_model.transform(docs_embeddings)
docs_umap2d.shape

(13645, 2)

In [None]:
fig = px.scatter(
    docs_umap2d, x=0, y=1,
    color=labels,
    opacity = .5,    hover_data=[keys])
fig.update_scenes(xaxis_visible=False, yaxis_visible=False)
fig.update_traces(marker_size=4)
fig.update_layout(xaxis=dict(showgrid=False),
              yaxis=dict(showgrid=False))
fig.show()

## Split Train Test

In [None]:
# Stratified split so every SDG keeps its share in both sets. Seeded with the
# notebook's `random_state` so the split - and every metric computed from it -
# is reproducible across runs.
train_df, test_df = train_test_split(
    df_osdg[['text', 'sdg']],
    test_size=test_size,
    stratify=df_osdg['sdg'],
    random_state=random_state,
)
print(train_df.shape, test_df.shape)

In [None]:
train_ds = Dataset.from_pandas(train_df, split="train")
test_ds = Dataset.from_pandas(test_df, split="test")

In [None]:
loss_class = loss_classes[loss_class_name]

In [None]:
#model_name = 'peter2000/bmz_topics'

In [None]:
x_train = train_ds['text']
y_train = train_ds['sdg']

x_test = test_ds['text']
y_test = test_ds['sdg']
print(len(x_train),len(x_test))

11598 2047


In [None]:
# `_lab_dict` (SDG number -> display name) is already defined once in the
# "Load data" section; reuse it rather than keeping a second identical copy
# that could silently drift out of sync.
y_train2 = [_lab_dict[lab] for lab in y_train]
y_test2 = [_lab_dict[lab] for lab in y_test]

In [None]:
pd.DataFrame(y_test).value_counts()

5     273
4     270
3     233
7     178
6     168
11    154
1     130
13    120
8     112
2      93
14     87
9      84
15     67
10     52
12     26
dtype: int64

## SBERT fine-tuning step

In [None]:
#Utility function if using cosine similarity loss

def sentence_pairs_generation(sentences, labels, pairs):
    """Append one positive and one negative sentence pair per input sentence.

    For every sentence, a partner with the same label is drawn at random
    (pair label 1.0) and a partner with a different label is drawn at random
    (pair label 0.0). Each pair is wrapped in an ``InputExample`` and appended
    to ``pairs``.

    Args:
        sentences: Indexable collection of sentence strings.
        labels: Class labels aligned with ``sentences`` (array or list).
        pairs: List that is extended in place; pass ``[]`` to start fresh.

    Returns:
        The same ``pairs`` list that was passed in, now extended.

    Raises:
        ValueError: If sentences are given but fewer than two distinct labels
            exist, because no negative pair can be formed in that case.
    """
    # Accept plain lists too: the element-wise comparisons below need an array.
    labels = np.asarray(labels)
    classes = np.unique(labels)

    if len(sentences) > 0 and len(classes) < 2:
        raise ValueError(
            "sentence_pairs_generation needs at least two distinct labels "
            "to build negative pairs"
        )

    # The index sets depend only on the class, so compute them once per class
    # instead of re-scanning `labels` for every single sentence.
    same_class_idx = {}
    other_class_idx = {}
    for cls in classes:
        same_class_idx[cls] = np.where(labels == cls)[0]
        other_class_idx[cls] = np.where(labels != cls)[0]

    for idxA in range(len(sentences)):
        currentSentence = sentences[idxA]
        label = labels[idxA]

        # positive pair: a random sentence sharing the current label
        # (this may be the current sentence itself)
        idxB = np.random.choice(same_class_idx[label])
        posSentence = sentences[idxB]
        pairs.append(InputExample(texts=[currentSentence, posSentence], label=1.0))

        # negative pair: a random sentence carrying any other label
        negSentence = sentences[np.random.choice(other_class_idx[label])]
        pairs.append(InputExample(texts=[currentSentence, negSentence], label=0.0))

    # the extended list of InputExample pairs
    return pairs

In [None]:
#loss_class_name = "CosineSimilarityLoss"
def fine_tuning():
  """Fine-tune the notebook-level ``model`` in place with the configured loss.

  Reads the notebook globals ``loss_class_name``, ``loss_classes``,
  ``batch_size``, ``num_epochs``, ``x_train``, ``y_train`` and ``model``.

  * ``"BatchHardTripletLoss"``: every training text becomes a single-text
    ``InputExample``; batches are drawn through ``SentenceLabelDataset`` and
    the loss uses cosine distance with a margin of 0.25.
  * ``"CosineSimilarityLoss"``: ``sentence_pairs_generation`` is called
    ``num_epochs`` times to accumulate positive/negative pairs.

  Training always runs as one epoch of ``train_steps`` steps, with the first
  10% of the steps used for warm-up.

  Returns:
      None. Does nothing when ``loss_class_name`` is ``None`` or ``"None"``.

  Raises:
      ValueError: If ``loss_class_name`` is not a supported loss name.
  """
  # "None" is a valid key of `loss_classes`; previously it fell through both
  # branches and crashed on the undefined `train_steps`.
  if loss_class_name is None or loss_class_name == "None":
    return

  # Resolve the loss from the name here, so that changing `loss_class_name`
  # cannot leave a stale, mismatched `loss_class` global in use.
  selected_loss = loss_classes.get(loss_class_name)

  if selected_loss is not None and loss_class_name == "BatchHardTripletLoss":
    train_examples = [InputExample(texts=[text], label=label) for text, label in zip(x_train, y_train)]
    train_data_sampler = SentenceLabelDataset(train_examples)

    # Never ask for a batch larger than the dataset itself.
    effective_batch_size = min(batch_size, len(train_data_sampler))
    train_dataloader = DataLoader(train_data_sampler,
                                  batch_size=effective_batch_size,
                                  drop_last=True)
    train_loss = selected_loss(model=model,
                               distance_metric=BatchHardTripletLossDistanceFunction.cosine_distance,
                               margin=0.25)

    train_steps = len(train_dataloader) * num_epochs

  elif selected_loss is not None and loss_class_name == "CosineSimilarityLoss":
    effective_batch_size = batch_size

    train_examples = []
    for _ in range(num_epochs):
      # sentence pairs are generated here
      train_examples = sentence_pairs_generation(np.array(x_train), np.array(y_train), train_examples)

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=effective_batch_size)
    train_loss = selected_loss(model)
    train_steps = len(train_dataloader)

  else:
    raise ValueError(
        f"Unsupported loss_class_name {loss_class_name!r}; expected "
        "'BatchHardTripletLoss', 'CosineSimilarityLoss' or 'None'"
    )

  print(f"{len(x_train)} train samples in total, {train_steps} train steps with batch size {effective_batch_size}")

  warmup_steps = math.ceil(train_steps*0.1)
  # fine-tuning: a single "epoch" that spans all `train_steps` steps
  model.fit(train_objectives=[(train_dataloader, train_loss)],
            epochs=1,
            steps_per_epoch=train_steps,
            warmup_steps=warmup_steps,
            show_progress_bar=True)



In [None]:
  fine_tuning()

11598 train samples in total, 14480 train steps with batch size 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/14480 [00:00<?, ?it/s]

In [None]:
#data_df['label_orig'] = data_df['label']

In [None]:
#for i in range(1,10):
#  data_df['label'] = data_df['label_orig'].apply( lambda x: 1 if x ==i else 0)
#  fine_tuning()
#  print('fine tuning done',i/9*100,'%')

In [None]:
#data_df['label'] = data_df['label_orig'] 

## Fitting classifier

In [None]:
classifier_name = 'lr'

In [None]:
# Build the classifier that is trained on top of the document representations.
if classifier_name == 'lr':
  # The default max_iter=100 stopped lbfgs before it converged on this data
  # (see the convergence warning in the fit output), so allow more iterations.
  classifier = LogisticRegression(max_iter=1000)
elif classifier_name == "knn":
  classifier = KNeighborsClassifier(n_neighbors=15)
else:
  # Fail here with a clear message instead of a NameError on `classifier` later.
  raise ValueError(f"Unknown classifier_name {classifier_name!r}; expected 'lr' or 'knn'")


In [None]:
train_embeddings = model.encode(x_train)
test_embeddings = model.encode(x_test)

In [None]:
train_embeddings.shape

(11598, 384)

In [None]:
test_embeddings.shape

(2047, 384)

### UMAP

In [None]:
import umap.umap_ as umap
n_neighbors = 15
n_components = 3
random_state =42
umap_model = (umap.UMAP(n_neighbors=n_neighbors, 
                            n_components=n_components, 
                            metric='cosine', 
                            random_state=random_state)
                        .fit(train_embeddings))


In [None]:
train_umap = umap_model.transform(train_embeddings)
train_umap.shape

(11598, 3)

In [None]:
test_umap = umap_model.transform(test_embeddings)
test_umap.shape

(2047, 3)

In [None]:
# Caution: from this point on, train_embeddings / test_embeddings no longer
# hold the vectors returned by model.encode(); they are rebound to the
# 3-component UMAP projections. The classifier fitted in the next cell is
# therefore trained on the UMAP coordinates. Re-run the model.encode() cell to
# get the original sentence embeddings back.

train_embeddings = train_umap
test_embeddings = test_umap

In [None]:

classifier.fit(train_embeddings, y_train2)

y_pred_train = classifier.predict(train_embeddings)
y_pred_test = classifier.predict(test_embeddings)

acc_train = accuracy_score(y_train2, y_pred_train)*100
acc_test = accuracy_score(y_test2, y_pred_test)*100

print(classification_report(y_test2, y_pred_test))

                                         precision    recall  f1-score   support

            Affordable and clean energy       0.91      0.93      0.92       178
             Clean water and sanitation       0.93      0.96      0.95       168
                         Climate action       0.90      0.87      0.89       120
        Decent work and economic growth       0.69      0.75      0.72       112
                        Gender equality       0.95      0.99      0.97       273
             Good health and well-being       0.94      0.96      0.95       233
Industry, Innovation and Infrastructure       0.79      0.81      0.80        84
                       Life below water       0.94      0.94      0.94        87
                           Life on land       0.89      0.84      0.86        67
                             No poverty       0.88      0.79      0.83       130
                      Quality education       0.95      0.96      0.95       270
                     Reduce


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [None]:
print(f"Accuracy training set: {acc_train:.2f}")
print(f"Accuracy test set: {acc_test:.2f}")

Accuracy training set: 99.78
Accuracy test set: 89.50


In [None]:
 import plotly.graph_objects as go
 import plotly.express as px
 from jupyter_dash import JupyterDash
 # The standalone dash_core_components / dash_html_components packages are
 # deprecated; since Dash 2.0 both modules ship inside `dash` itself.
 from dash import dcc, html
 from dash.dependencies import Input, Output 
 from sklearn.manifold import TSNE

In [None]:
x_test[:2]

['In addition to this remark, it should also be pointed out that the amount of the transfer in relation to the average consumption of poor households is higher in rural areas than in urban areas, regardless of whether or not the cost of living is taken into account. The policy is therefore more effective at lowering the number of children living in monetary poverty in rural areas. This finding suggests that instead of distributing the same nominal amount of transfer to every recipient, it is important to find an optimal allocation of the national transfer budget. Indeed, the disparity in the effects of the crisis and policy responses is also seen at the regional level (figures 17 and 18; tables a7 and a8). In the other three regions, the transfer is not enough to completely counteract the effects of the crisis on monetary poverty.',
 'What is the picture in key emerging economies? Has the growth of income inequality been mirrored by rising inequalities of wealth, well-being and opportu

In [None]:
fig = px.scatter_3d(
    train_umap, x=0, y=1, z=2,
    color=y_train2)#,    hover_data=x_train)
fig.update_traces(marker_size=3)
fig.show()

In [None]:
x_test[:2]

['In addition to this remark, it should also be pointed out that the amount of the transfer in relation to the average consumption of poor households is higher in rural areas than in urban areas, regardless of whether or not the cost of living is taken into account. The policy is therefore more effective at lowering the number of children living in monetary poverty in rural areas. This finding suggests that instead of distributing the same nominal amount of transfer to every recipient, it is important to find an optimal allocation of the national transfer budget. Indeed, the disparity in the effects of the crisis and policy responses is also seen at the regional level (figures 17 and 18; tables a7 and a8). In the other three regions, the transfer is not enough to completely counteract the effects of the crisis on monetary poverty.',
 'What is the picture in key emerging economies? Has the growth of income inequality been mirrored by rising inequalities of wealth, well-being and opportu

In [None]:
text_plot1 = [[t[:88]] for t in x_test]
text_plot2 = [[t[88:176]] for t in x_test]
text_plot3 = [[t[176:264]] for t in x_test]

In [None]:
text_plot1[2][0]

'For those in developing countries who do have access to electricity, the supplies are of'

In [None]:
d1 = list(test_umap[:,0])
d2 = list(test_umap[:,1])
d3 = list(test_umap[:,2])

In [None]:
zipped = list(zip(text_plot1,text_plot2,text_plot3,y_test2,d1,d2,d3))

In [None]:
#fig = px.scatter_3d(
#    df_plot, x='D1', y='D2', z='D3',
#    color='label',   
#    hover_data =['text1','text2','text3'],
#    width = 2000,
#    height  = 2000,
#    template = 'plotly_white')
#fig.update_traces(marker_size=3)
#fig.show()

In [None]:

fig = px.scatter_3d(
    test_umap, x=0, y=1, z=2,
    color=y_test2)#,    hover_data=x_test)
fig.update_traces(marker_size=3)
fig.show()

In [None]:
 # load data from builtin Plotly data
 #df = px.data.gapminder()
 # prepare a sunburst figure
 #fig = px.sunburst(df, path=['continent', 'country', 'year'], values='pop',
 #                  color='lifeExp', hover_data=['iso_alpha'],
 #                  color_continuous_scale='twilight',
 #                  color_continuous_midpoint=np.average(df['lifeExp'], weights=df['pop']))
 # build jupyter dash app 
 #app = JupyterDash(__name__)
 # add html components and figure to app
 #app.layout = html.Div([dcc.Graph(figure=fig)])
 # run app inline
 #app.run_server(mode='inline') 


In [None]:
import huggingface_hub

In [None]:
model.save_to_hub(repo_name='sentence_sdg',)


Deprecated positional argument(s) used in 'create_repo': pass token='sentence_sdg' as keyword args. From version 0.12 passing these as positional arguments will result in an error,



RepositoryNotFoundError: ignored

In [None]:
# `model_checkpoint` is not defined anywhere in this notebook, so derive the
# short model name from the configured `model_name` instead - without
# overwriting `model_name`, since re-running the model-loading cell needs the
# full hub id. The stray trailing comma that turned `output_dir` into a
# 1-tuple is removed as well.
short_model_name = model_name.split("/")[-1]
output_dir = f"{short_model_name}-finetuned-osdg"


In [None]:
!pip install huggingface_hub
!huggingface-cli login

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/

        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/tokens .
        
Token: 
Login successful
Your token has been saved to /root/.huggingface/token


In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load or train a model
model.save_to_hub("bmz_topics")