We'll start off by setting the seed for reproducibility.

In [1]:
import sys
import torch
import random
import numpy as np

seed = 26  # Single seed value shared by every random number generator seeded below


def seed_everything(value: int) -> None:
    """Seed the Python, NumPy and PyTorch random number generators.

    Defining `seed` alone does not make anything reproducible; each library
    keeps its own generator state and has to be seeded explicitly.

    Args:
        value: Integer seed applied to all generators.
    """
    random.seed(value)
    np.random.seed(value)
    torch.manual_seed(value)
    # Explicitly seed every CUDA device as well; this call is a no-op when
    # CUDA is not available, so it is safe on CPU-only machines.
    torch.cuda.manual_seed_all(value)


seed_everything(seed)

In [2]:
# Colab-only setup: install dependencies, mount Google Drive and switch to the
# project folder. Outside Colab this cell does nothing beyond importing `sys`.
import sys
if 'google.colab' in sys.modules:  # True only when the google.colab module is already loaded
    # Install the required packages (no versions are pinned here, so pip
    # decides which releases get installed at run time)
    !pip install datasets transformers evaluate
    !pip install accelerate -U
    !pip install -U sentence-transformers

    # Mount Google Drive at /content/drive so files stored there can be read
    from google.colab import drive
    drive.mount('/content/drive')

    # Make the Drive project folder the working directory, so relative paths
    # used later resolve against it
    # NOTE(review): presumably this is where 'Embeddings_text.csv' lives — confirm
    %cd /content/drive/MyDrive/HS_23_Msc

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Reading data

In [3]:
from transformers import pipeline
import pandas as pd
import torch

In [4]:
# Load the text data from the CSV file in the current working directory
dat = pd.read_csv(filepath_or_buffer='Embeddings_text.csv')

# Leave the frame as the last expression so the notebook renders it
dat

Unnamed: 0,word+comment
0,Klimagesetz
1,Teuer
2,Stromteuerung
3,Hohe Kosten Stromleitungen
4,PV privat Unsinn
...,...
6672,Verbot fossiler Energie Direkt alle fossile En...
6673,Unterstützung für Umstellung Wann wird unterst...
6674,Schutz für Generationen Wir schützen die zukün...
6675,Lösung von Ausland Wir müssen danach weniger i...


# Extracting features

In [5]:
# Run on a CUDA GPU when one is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Display the selected device
device

device(type='cuda')

In [6]:
# Build a feature-extraction pipeline; the same checkpoint supplies both the
# model weights and the tokenizer, and inference runs on `device`
model_ckpt = 'sentence-transformers/distiluse-base-multilingual-cased-v1'
feature_extractor = pipeline(
    task='feature-extraction',
    model=model_ckpt,
    tokenizer=model_ckpt,
    framework='pt',
    device=device,
)


def extract_first_vector(text):
    """Run one text through the pipeline and return a single vector as a NumPy array.

    The pipeline output is indexed with [0][0], i.e. the first vector of the
    first item in the returned tensor.
    NOTE(review): this looks like the first-token embedding of the underlying
    transformer rather than the pooled sentence embedding the checkpoint is
    published for — confirm this is the intended representation.
    """
    pipeline_output = feature_extractor(text, return_tensors='pt')
    return pipeline_output[0][0].numpy()


# Embed every entry of the 'word+comment' column, one text per pipeline call,
# and stack the resulting vectors into a frame with one row per text
texts = dat['word+comment'].tolist()
features = pd.DataFrame(list(map(extract_first_vector, texts)))
features

Downloading (…)lve/main/config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.095611,0.021990,0.126141,-0.021954,0.013680,-0.102875,0.016304,0.116780,0.020373,0.037267,...,-0.071235,-0.071900,-0.130064,-0.035499,0.006495,-0.035788,-0.154047,-0.100707,-0.029562,0.077092
1,0.020845,-0.005634,0.154591,-0.066823,-0.029795,-0.013122,-0.032534,0.033369,-0.101048,0.098474,...,-0.066866,0.141183,-0.074486,0.148635,-0.044970,0.014635,0.034756,-0.118851,0.050913,0.130866
2,-0.021304,0.041870,0.230338,-0.161837,-0.005615,-0.096298,0.083327,0.070974,-0.126866,-0.026267,...,-0.089843,0.087850,-0.164344,0.023796,-0.009120,0.005613,-0.066628,-0.063911,-0.051326,0.038994
3,-0.082307,-0.043797,0.215392,-0.205434,0.040324,-0.241189,0.063837,0.139529,-0.158085,-0.028806,...,-0.060269,-0.034217,-0.174236,0.022430,0.063282,-0.081791,-0.079611,0.050799,-0.099930,-0.067327
4,-0.013512,0.076630,0.152870,-0.031402,0.200175,-0.259090,0.030944,0.106287,0.031104,0.040489,...,0.110275,0.078572,-0.021006,-0.064581,0.000386,-0.125664,0.037954,0.094915,-0.034505,0.039215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6672,-0.256144,-0.036322,0.018177,0.094034,-0.160906,-0.159593,0.088160,-0.021140,0.068167,-0.127818,...,-0.033512,-0.130360,-0.032130,-0.125889,0.001596,0.074469,-0.028163,-0.060314,-0.022905,-0.063235
6673,-0.101566,-0.076121,0.016676,0.078314,0.064883,0.026105,-0.046002,-0.036989,0.154582,0.104860,...,0.017274,-0.193359,-0.068224,-0.048902,-0.089292,-0.123839,-0.023397,-0.081478,0.023823,0.045237
6674,-0.044039,-0.146573,0.070712,-0.050378,-0.021161,0.012585,0.127261,0.120985,0.039316,0.017532,...,0.055553,-0.158965,-0.159623,0.119226,-0.006483,0.030219,-0.140611,-0.067981,-0.046988,-0.044217
6675,-0.065021,-0.088122,-0.033085,0.133250,-0.056526,0.049599,0.039607,0.053319,0.049965,0.018521,...,0.079714,-0.123590,-0.038652,0.012391,0.100350,-0.055524,-0.094243,0.091258,0.031485,0.034855


In [7]:
# Persist the extracted features to disk (the row index is written as the first column)
features.to_csv(path_or_buf='features.csv')