# Скачаем данные постов и сделаем из них эмбеддинги с помощью трансформеров

# Данные состоят из id, текста и топика

In [None]:
# Load the posts table; the text column is turned into transformer embeddings below.
import pandas as pd

posts_info = pd.read_csv('../yourPath/post_text', index_col=0)

In [4]:
# Display the loaded posts table.
posts_info

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business
...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",movie
7019,7316,I give this movie 2 stars purely because of it...,movie
7020,7317,I cant believe this film was allowed to be mad...,movie
7021,7318,The version I saw of this film was the Blockbu...,movie


In [5]:


from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel


def get_model(model_name):
    """Load the tokenizer and pretrained transformer used to build embeddings.

    Args:
        model_name: One of ``'bert'``, ``'roberta'`` or ``'distilbert'``.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the same checkpoint.

    Raises:
        AssertionError: If ``model_name`` is not one of the supported names.
    """
    supported = ('bert', 'roberta', 'distilbert')
    assert model_name in supported

    # Checkpoint id on the Hugging Face hub and the matching model class.
    registry = {
        'bert': ('bert-base-cased', BertModel),  # https://huggingface.co/bert-base-cased
        'roberta': ('roberta-base', RobertaModel),  # https://huggingface.co/roberta-base
        'distilbert': ('distilbert-base-cased', DistilBertModel),  # https://huggingface.co/distilbert-base-cased
    }
    checkpoint, model_cls = registry[model_name]

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = model_cls.from_pretrained(checkpoint)
    return tokenizer, model

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [6]:
# Use the DistilBERT checkpoint for both the tokenizer and the model.
tokenizer, model = get_model('distilbert')

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Сделаем датасет для постов

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding


class PostDataset(Dataset):
    """Dataset of post texts that are tokenized once, at construction time.

    All texts are encoded in a single ``batch_encode_plus`` call (special
    tokens added, truncation and padding enabled, PyTorch tensors requested).
    Each item is the slice of that encoded batch at the given index.
    """

    def __init__(self, texts, tokenizer):
        """Encode ``texts`` with ``tokenizer`` and keep the result.

        Args:
            texts: The post texts, passed to the tokenizer unchanged.
            tokenizer: Object exposing ``batch_encode_plus``; it is also
                stored on the instance as ``self.tokenizer``.
        """
        super().__init__()

        encode_options = dict(
            add_special_tokens=True,
            return_token_type_ids=False,
            return_tensors='pt',
            truncation=True,
            padding=True,
        )
        self.texts = tokenizer.batch_encode_plus(texts, **encode_options)
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the ``input_ids`` and ``attention_mask`` entries at ``idx``."""
        encoded = self.texts
        return {field: encoded[field][idx] for field in ('input_ids', 'attention_mask')}

    def __len__(self):
        """Return the number of encoded texts."""
        return len(self.texts['input_ids'])
    
    
# Tokenize every post text once and wrap the result in a dataset.
dataset = PostDataset(
    posts_info['text'].values.tolist(),
    tokenizer,
)

# Collator that assembles the individual items into a padded batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# shuffle=False keeps the batches in the same row order as posts_info.
loader = DataLoader(
    dataset,
    batch_size=32,
    collate_fn=data_collator,
    pin_memory=True,
    shuffle=False,
)

In [8]:
import torch
from tqdm import tqdm

# Compute one embedding vector per input sequence.
@torch.inference_mode()
def get_embeddings_labels(model, loader, device=None):
    """Run ``model`` over ``loader`` and return the stacked embeddings.

    The embedding of a sequence is the ``last_hidden_state`` vector at token
    position 0. Despite the name, only embeddings are returned (no labels).

    Args:
        model: Model whose call returns a mapping with a
            ``'last_hidden_state'`` entry indexable as ``[:, 0, :]``.
        loader: Iterable of batches, each a mapping that contains
            ``'attention_mask'`` and ``'input_ids'`` tensors.
        device: Device the batch tensors are moved to before the forward
            pass. When omitted, the device of the model's first parameter is
            used, so the function does not rely on a module-level ``device``
            variable being defined before it is called.

    Returns:
        A CPU tensor with one row per sequence, in the order the loader
        yielded them.
    """
    model.eval()

    if device is None:
        # Follow the model: inputs must live on the same device as its weights.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    total_embeddings = []

    for batch in tqdm(loader):
        # Only these two entries are forwarded to the model.
        batch = {key: batch[key].to(device) for key in ['attention_mask', 'input_ids']}

        # Keep the hidden state of the first token of every sequence.
        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        # Move results off the device right away so they accumulate on the CPU.
        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0)

In [9]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)
# Querying the GPU name is only valid when a CUDA device exists; on the CPU
# fallback the call raises, so it is limited to the CUDA path.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(device))

model = model.to(device)

cuda:0
Tesla T4


In [10]:
# Compute the embeddings for all posts and convert them to a NumPy array.
embeddings = get_embeddings_labels(model, loader).numpy()

embeddings

  0%|          | 0/220 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 220/220 [01:48<00:00,  2.03it/s]


array([[ 3.63150716e-01,  4.89373505e-02, -2.64081180e-01, ...,
        -1.41593218e-01,  1.59182400e-02,  9.19411832e-05],
       [ 2.36416489e-01, -1.59501046e-01, -3.27798218e-01, ...,
        -2.89936244e-01,  1.19365521e-01, -1.62345846e-03],
       [ 3.75191540e-01, -1.13943920e-01, -2.40546837e-01, ...,
        -3.38919461e-01,  5.86939044e-02, -2.12657358e-02],
       ...,
       [ 3.40382695e-01,  6.64923638e-02, -1.63184494e-01, ...,
        -8.65628794e-02,  2.03403875e-01,  3.20906416e-02],
       [ 4.32091892e-01,  1.10916262e-02, -1.17306069e-01, ...,
         7.54015967e-02,  1.02739766e-01,  1.52742090e-02],
       [ 3.04277748e-01, -7.62156770e-02, -6.77585602e-02, ...,
        -5.43488115e-02,  2.44383752e-01, -1.41487354e-02]], dtype=float32)

In [11]:
# Display the embedding vector of the first post.
embeddings[0]

array([ 3.63150716e-01,  4.89373505e-02, -2.64081180e-01, -1.60671532e-01,
       -2.11843416e-01, -2.07537025e-01,  3.25393647e-01, -2.61391941e-02,
        8.72662514e-02, -1.04142034e+00, -2.12647095e-01, -3.11571918e-02,
        3.01327500e-02, -1.51711926e-01, -3.68476480e-01,  1.89907417e-01,
        7.82294497e-02, -1.44073460e-02, -1.93641130e-02, -2.50731260e-01,
        1.59897819e-01, -8.89073759e-02,  5.83013237e-01, -4.07209754e-01,
        1.33548111e-01,  6.02782518e-02, -4.66265082e-02,  2.74484754e-01,
       -2.83439368e-01,  3.60531718e-01, -9.35291350e-02,  1.00304559e-01,
       -1.44219622e-01, -3.39949876e-02, -8.34017694e-02,  6.75634295e-02,
       -6.29682764e-02, -4.11192477e-01, -8.31328258e-02,  7.62807354e-02,
       -4.21976388e-01,  1.14475012e-01,  4.58835095e-01, -2.44741008e-01,
       -1.68751273e-02, -4.36022490e-01, -6.52725771e-02,  2.76180685e-01,
       -7.11755008e-02,  8.51354450e-02,  5.70032671e-02,  4.63857986e-02,
       -3.45954932e-02,  

In [12]:
# Preview the posts table with the embedding columns appended side by side.
# concat(axis=1) pairs rows by index label, and posts_info keeps the index read
# from the CSV, so the embedding frame reuses that index instead of a default
# 0..N-1 range. Rows then always line up in order.
pd.concat([posts_info, pd.DataFrame(embeddings, index=posts_info.index)], axis=1)

Unnamed: 0,post_id,text,topic,0,1,2,3,4,5,6,...,758,759,760,761,762,763,764,765,766,767
0,1,UK economy facing major risks\n\nThe UK manufa...,business,0.363151,0.048937,-0.264081,-0.160672,-0.211843,-0.207537,0.325394,...,0.349080,0.290132,-0.244970,0.078532,0.137399,0.208097,-0.058624,-0.141593,0.015918,0.000092
1,2,Aids and climate top Davos agenda\n\nClimate c...,business,0.236416,-0.159501,-0.327798,-0.372885,-0.292128,-0.019028,0.333677,...,0.311639,0.297819,-0.177003,0.130227,-0.063239,0.190171,-0.018153,-0.289936,0.119366,-0.001623
2,3,Asian quake hits European shares\n\nShares in ...,business,0.375192,-0.113944,-0.240547,-0.282425,-0.264252,0.061839,0.249180,...,0.353615,0.308457,-0.207151,0.056724,0.056596,0.125301,0.021575,-0.338919,0.058694,-0.021266
3,4,India power shares jump on debut\n\nShares in ...,business,0.273770,-0.048748,-0.440433,-0.189999,-0.410856,-0.100587,0.255757,...,0.321182,0.218213,-0.267988,-0.093800,0.176987,0.251618,0.028331,-0.155708,0.136188,0.044054
4,5,Lacroix label bought by US firm\n\nLuxury good...,business,0.297853,-0.073203,-0.146820,-0.127284,-0.133966,0.045766,0.176495,...,0.168949,0.208978,-0.051180,0.045685,0.173986,0.148893,0.097255,-0.239587,0.228066,0.189832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",movie,0.338545,0.084620,-0.225981,-0.115433,-0.064816,-0.129863,0.358163,...,0.495477,0.203647,-0.138003,0.148754,0.138935,0.199463,0.054088,-0.110224,0.039229,-0.003550
7019,7316,I give this movie 2 stars purely because of it...,movie,0.354057,0.053933,-0.099446,-0.161002,0.009353,-0.190430,0.258928,...,0.322798,0.176826,-0.154205,-0.010798,0.100222,0.093780,0.051934,-0.119376,0.182106,0.072430
7020,7317,I cant believe this film was allowed to be mad...,movie,0.340383,0.066492,-0.163184,-0.115224,-0.102958,-0.181814,0.346562,...,0.372651,0.169435,-0.041522,-0.033723,0.047250,0.173592,-0.027378,-0.086563,0.203404,0.032091
7021,7318,The version I saw of this film was the Blockbu...,movie,0.432092,0.011092,-0.117306,-0.123570,0.066756,-0.103376,0.243299,...,0.468103,0.156609,-0.054083,0.210478,0.116214,0.064117,0.084667,0.075402,0.102740,0.015274


In [13]:
from sklearn.decomposition import PCA

In [14]:
# Reduce the embedding dimensionality before trying to cluster the texts.

from sklearn.decomposition import PCA

# Center every embedding dimension on its own mean. Without axis=0 the mean is
# a single scalar over the whole matrix, which shifts the data but does not
# center the individual features.
centered = embeddings - embeddings.mean(axis=0)

pca = PCA(n_components=200)
pca_decomp = pca.fit_transform(centered)

In [16]:
# Inspect how the embedding space looks after the PCA compression.
pd.DataFrame(pca_decomp)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.789076,1.578993,-1.421168,-0.292827,0.196368,-0.482341,-0.172337,-0.252787,0.128675,-0.029782,...,0.035872,0.009104,0.007713,0.009915,-0.045963,0.009822,0.013671,0.004447,-0.028394,0.020320
1,0.792764,1.521055,-0.897748,0.228625,0.109591,0.125352,-0.163200,0.005968,-0.349691,-0.423545,...,-0.068475,-0.005844,0.025755,0.056258,-0.048177,0.019858,-0.037381,0.032571,0.004372,-0.006529
2,0.801291,1.223608,-0.669459,1.323415,-0.081440,0.550813,0.049819,-0.241444,-0.509032,-0.007893,...,-0.001653,0.004427,-0.013491,-0.062006,0.042344,-0.010743,-0.042754,-0.046203,0.041511,-0.018178
3,0.867122,0.971222,-1.577194,0.833189,0.841210,-0.292233,0.742464,0.886731,-0.494612,0.055690,...,0.014707,-0.028171,0.018439,0.063803,-0.043762,0.042641,-0.018259,-0.011401,-0.006428,0.028607
4,0.411659,0.809848,-0.682370,0.790952,-0.186122,-0.596708,0.169843,-0.758109,-0.193134,0.064076,...,0.056084,-0.059007,-0.017118,0.015681,0.022887,0.002939,-0.034056,0.159203,-0.025282,-0.032693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,-0.983509,-0.483878,0.001958,0.066756,0.059204,0.342644,0.026169,-0.575163,-0.049805,0.668213,...,-0.007379,-0.031603,-0.029769,0.046996,-0.003333,0.037440,0.035074,0.021003,0.016825,0.014974
7019,-0.821778,-0.706064,-0.164490,-0.391025,0.029578,0.233311,0.336856,-0.425721,0.112193,0.308404,...,0.061569,-0.066234,0.006532,0.012065,0.023508,-0.074792,-0.026847,-0.027215,0.037000,0.008856
7020,-0.469401,-0.874857,0.482072,-0.138476,-0.257578,0.545998,-0.031880,-0.287169,-0.504229,0.519211,...,0.065777,-0.055649,0.046106,-0.012686,-0.015054,-0.053550,-0.046358,0.000651,0.029030,0.057322
7021,-1.579793,-0.448612,0.126690,0.100852,0.284504,-0.370514,-0.190615,-0.049475,-0.189615,0.021910,...,0.017929,0.029535,0.007817,0.009148,-0.071760,-0.015575,-0.012297,-0.039187,-0.022571,-0.017858


In [19]:
# Cluster the PCA-reduced embeddings.
from sklearn.cluster import KMeans

n_clusters = 25

kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(pca_decomp)

# One cluster label per post, assigned in row order.
posts_info['TextCluster'] = kmeans.labels_

# One column name per cluster.
dists_columns = [f'DistanceToCluster_{i}' for i in range(n_clusters)]

# Reuse posts_info's index: the frames are later joined with concat(axis=1),
# which aligns rows on index labels. A default 0..N-1 index would only match
# when the index read from the CSV happens to be that same range.
dists_df = pd.DataFrame(
    data=kmeans.transform(pca_decomp),
    index=posts_info.index,
    columns=dists_columns,
)

dists_df.head()



Unnamed: 0,DistanceToCluster_0,DistanceToCluster_1,DistanceToCluster_2,DistanceToCluster_3,DistanceToCluster_4,DistanceToCluster_5,DistanceToCluster_6,DistanceToCluster_7,DistanceToCluster_8,DistanceToCluster_9,...,DistanceToCluster_15,DistanceToCluster_16,DistanceToCluster_17,DistanceToCluster_18,DistanceToCluster_19,DistanceToCluster_20,DistanceToCluster_21,DistanceToCluster_22,DistanceToCluster_23,DistanceToCluster_24
0,3.49396,3.677949,3.622222,2.624224,3.507957,4.1161,2.562215,2.563882,3.50928,3.390471,...,3.589725,2.195352,3.060268,3.447935,3.539969,3.043197,3.840153,1.615548,3.034917,3.509522
1,3.391038,3.388828,3.491359,2.483304,3.562264,3.716697,2.559283,2.501192,3.408025,3.346797,...,3.170051,2.333223,2.849084,3.194299,3.472124,2.90801,3.652429,2.409076,2.749064,3.329164
2,3.443156,3.414664,3.662007,2.641631,3.68521,3.497526,2.606119,3.410068,3.542429,3.152861,...,3.377886,2.006821,3.210594,3.354251,3.567143,3.167842,3.70836,2.660001,3.071656,3.372932
3,3.902483,4.003796,4.338183,3.061545,3.930743,4.346416,3.14851,3.763012,3.966123,3.488611,...,4.09809,2.638316,3.763006,3.768011,3.826235,3.458106,3.184834,3.08666,3.574337,3.819856
4,3.156396,3.295736,3.690556,2.542242,3.235155,3.532949,2.497357,3.502832,3.162741,3.339728,...,3.361131,2.025326,2.725691,2.986656,3.397157,2.985263,3.643212,2.736466,2.657373,3.221667


In [20]:
# Append the cluster-distance columns (rows are matched on index labels).
posts_info = pd.concat((posts_info, dists_df), axis=1)

# The raw text is not kept in the final table.
posts_info = posts_info.drop(columns=["text"])



In [21]:
# Display the final table: post metadata, cluster label and cluster distances.
posts_info

Unnamed: 0,post_id,topic,TextCluster,DistanceToCluster_0,DistanceToCluster_1,DistanceToCluster_2,DistanceToCluster_3,DistanceToCluster_4,DistanceToCluster_5,DistanceToCluster_6,...,DistanceToCluster_15,DistanceToCluster_16,DistanceToCluster_17,DistanceToCluster_18,DistanceToCluster_19,DistanceToCluster_20,DistanceToCluster_21,DistanceToCluster_22,DistanceToCluster_23,DistanceToCluster_24
0,1,business,22,3.493960,3.677949,3.622222,2.624224,3.507957,4.116100,2.562215,...,3.589725,2.195352,3.060268,3.447935,3.539969,3.043197,3.840153,1.615548,3.034917,3.509522
1,2,business,11,3.391038,3.388828,3.491359,2.483304,3.562264,3.716697,2.559283,...,3.170051,2.333223,2.849084,3.194299,3.472124,2.908010,3.652429,2.409076,2.749064,3.329164
2,3,business,11,3.443156,3.414664,3.662007,2.641631,3.685210,3.497526,2.606119,...,3.377886,2.006821,3.210594,3.354251,3.567143,3.167842,3.708360,2.660001,3.071656,3.372932
3,4,business,16,3.902483,4.003796,4.338183,3.061545,3.930743,4.346416,3.148510,...,4.098090,2.638316,3.763006,3.768011,3.826235,3.458106,3.184834,3.086660,3.574337,3.819856
4,5,business,16,3.156396,3.295736,3.690556,2.542242,3.235155,3.532949,2.497357,...,3.361131,2.025326,2.725691,2.986656,3.397157,2.985263,3.643212,2.736466,2.657373,3.221667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7315,movie,0,1.488507,2.901770,3.638558,2.850415,1.919571,3.642879,2.989966,...,3.381111,3.135253,2.778432,1.983461,3.277282,2.942382,2.632631,3.322540,2.397041,2.423227
7019,7316,movie,0,1.229421,2.549908,3.605439,2.562518,1.400757,3.781902,2.710257,...,3.348915,3.107972,2.543351,1.707974,3.044986,2.611935,2.476388,3.091437,2.356037,2.158313
7020,7317,movie,0,1.560368,2.305110,3.646631,2.905352,2.067369,3.715922,2.998021,...,3.473960,3.291089,2.801755,2.108639,3.011369,2.636890,2.848584,3.350442,2.492338,2.516752
7021,7318,movie,4,1.754707,3.231492,3.599391,3.026342,1.326326,3.752878,3.237907,...,3.428394,3.299857,2.845697,1.725319,3.546212,3.189278,2.266984,3.433430,2.166726,2.287015


In [22]:
# Write the final posts_info table to the file "embeddings_post_text" as CSV.
posts_info.to_csv("embeddings_post_text")