In [1]:
# Colab-only helper module for attaching Google Drive to this runtime
from google.colab import drive
# Mount Google Drive at ROOT; force_remount=True is passed so the mount is redone when this cell is re-run
ROOT="/content/drive"
drive.mount(ROOT, force_remount=True)

Mounted at /content/drive


In [2]:
%pwd

'/content'

In [3]:
%cd /content/drive/MyDrive/dlss24

/content/drive/MyDrive/dlss24


In [4]:
%pwd

'/content/drive/MyDrive/dlss24'

# Data

Download the corpora data from: https://codeocean.com/capsule/0078777/tree/v1

Upload it to a location outside your GitHub repository (so that it is not tracked).

Your working directory (`pwd`) should contain the file:


1.   source_corpus.csv




# Module IV - Class 7 - Text Classifiers



# Reading data

In [5]:
import pandas as pd


In [6]:
df_source_corpus=pd.read_csv('/content/drive/MyDrive/dlss24/source_corpus.csv')

In [7]:
df_source_corpus = df_source_corpus.dropna(subset=['text'])

In [8]:
df_source_corpus.head()

Unnamed: 0,topic_44,topic_8,text
0,democracy,freedom and democracy,"Two hundred summers ago, this Democratic Party..."
1,freedom and human rights,freedom and democracy,"In 1992, the party Thomas Jefferson founded in..."
2,law and order,fabric of society,Our land reverberates with a battle cry of fru...
3,no topic,no topic,America is on the wrong track.
4,civic mindedness,fabric of society,The American people are hurting.


# Word2vec

[Image of word2vec 1-hidden layer NN](https://becominghuman.ai/mathematical-introduction-to-glove-word-embedding-60f24154e54c)

Word2vec creates vectors that represent the context of words, while GloVe creates vectors that represent the co-occurrence of words.

Word2vec uses a shallow neural network to create vectors, while GloVe uses a global matrix factorization technique


Word2vec requires a large amount of training data, while GloVe can be trained on smaller datasets. This makes GloVe more suitable for smaller tasks, while Word2vec is better suited for larger applications.

In [9]:
# word2vec requires sentences as input
import nltk
# download the NLTK stopword lists and the punkt tokenizer data
nltk.download('stopwords')
nltk.download('punkt')
from string import punctuation
# translation table that deletes every character listed in string.punctuation
translator = str.maketrans('','',punctuation)
from nltk.corpus import stopwords
# English stopwords, kept in a set for the membership test in normalize_text
stoplist = set(stopwords.words('english'))
from nltk.stem import SnowballStemmer
# English Snowball stemmer applied to every token in normalize_text
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [11]:
#from previous class
def normalize_text(doc):
    """Turn a raw document string into a list of cleaned, stemmed tokens.

    Pipeline: replace carriage returns / newlines with spaces, lowercase,
    delete punctuation with ``translator``, split on whitespace, drop tokens
    found in ``stoplist``, replace all-digit tokens by '#', then stem every
    remaining token with ``stemmer``.
    """
    flattened = doc.replace('\r', ' ').replace('\n', ' ')
    cleaned = flattened.lower().translate(translator)

    tokens = []
    for word in cleaned.split():
        if word in stoplist:
            continue  # stopwords are removed before number normalisation
        if word.isdigit():
            word = '#'  # every purely numeric token maps to the same placeholder
        tokens.append(stemmer.stem(word))
    return tokens



> 1. Apply the normalize_text function to the data (df_source_corpus)
2. Train word2vec on all of the data (All that is required is that the input yields one sentence (list of utf8 words) after another.)



In [None]:
from gensim.models import Word2Vec

In [12]:
sentences_normalized=[normalize_text(row) for row in df_source_corpus['text']]


In [14]:
df_source_corpus['text'][0]

'Two hundred summers ago, this Democratic Party was founded by the man whose burning pen fired the spirit of the American Revolution - who once argued we should overthrow our own government every 20 years to renew our freedom and keep pace with a changing world.'

In [15]:
sentences_normalized[0]

['two',
 'hundr',
 'summer',
 'ago',
 'democrat',
 'parti',
 'found',
 'man',
 'whose',
 'burn',
 'pen',
 'fire',
 'spirit',
 'american',
 'revolut',
 'argu',
 'overthrow',
 'govern',
 'everi',
 '#',
 'year',
 'renew',
 'freedom',
 'keep',
 'pace',
 'chang',
 'world']

In [None]:
sentences_normalized[44]

['reject',
 'donoth',
 'govern',
 'last',
 'twelv',
 'year',
 'big',
 'govern',
 'theori',
 'say',
 'hamstr',
 'busi',
 'tax',
 'spend',
 'way',
 'prosper']

In [16]:
# train the model
# NOTE(review): no `seed` argument is passed and 8 worker threads are requested, so the
# similarity scores shown in later cells are presumably not reproducible run-to-run — confirm.
from gensim.models import Word2Vec
w2v = Word2Vec(sentences_normalized,  # list of tokenized sentences
               workers = 8, # Number of threads to run in parallel
               vector_size=300,  # Word vector dimensionality
               min_count =  25, # Minimum word count
               window = 5, # Context window size
               sample = 1e-3, # Downsample setting for frequent words
               )


# optional: persist the trained model to disk (left disabled)
#w2v.save('w2v-vectors.pkl')



<gensim.models.word2vec.Word2Vec at 0x7c7a828f3ca0>



> 1. Look for the word vectors for immigration, economy, security and rights
2. See which of these (economy, security and rights) is closer to immigration

Try w2v.wv.



In [19]:
 len(w2v.wv.index_to_key)

3557

In [20]:
w2v.wv.index_to_key[0]

'#'

In [21]:
# index_to_key is a list (position -> token); key_to_index is the dict for token -> position
w2v.wv.key_to_index['immigr']

In [36]:
#immigration
[i for i in w2v.wv.index_to_key if i[0:2]=='im']

['improv',
 'implement',
 'import',
 'impact',
 'immigr',
 'immedi',
 'impos',
 'immunis',
 'imagin',
 'imprison',
 'immens',
 'implic',
 'imposs',
 'imag',
 'imper',
 'impedi',
 'impair',
 'imbal',
 'imposit',
 'immun',
 'imparti',
 'imped',
 'immers']

In [33]:
#economy
[i for i in w2v.wv.index_to_key if i[0:2]=='ec']

['econom', 'economi', 'ece', 'ecolog', 'ecosystem', 'economist']

In [None]:
#security
 [i for i in w2v.wv.index_to_key if i[0:4]=='secu']

['secur']

In [None]:
#rights
 [i for i in w2v.wv.index_to_key if i[0:4]=='righ']

['right']

In [None]:
w2v.wv.similarity('immigr','econom') # similarity

0.06183112

In [None]:
w2v.wv.similarity('immigr','secur') # similarity

0.1503622

In [None]:
w2v.wv.similarity('immigr','right') # similarity

0.20254493

In [37]:
w2v.wv.most_similar('immigr') # most similar words

[('asylum', 0.6509559750556946),
 ('deter', 0.58147132396698),
 ('refuge', 0.5697773098945618),
 ('seeker', 0.5590358972549438),
 ('deport', 0.5226292014122009),
 ('coven', 0.5181108117103577),
 ('visa', 0.5159613490104675),
 ('grievanc', 0.5148912072181702),
 ('illeg', 0.513752818107605),
 ('copyright', 0.5129243731498718)]



> Analogies with the dataset:
1. Scientist is to man as __ is to woman
2. Scientist is to woman as __ is to man



In [38]:
# analogy: scientist is to woman as __ is to man (vector arithmetic: scientist - woman + man)
w2v.wv.most_similar(positive=['scientist','man'],
                 negative=['woman'])

[('leader', 0.6559566855430603),
 ('format', 0.6077491641044617),
 ('bid', 0.6055317521095276),
 ('elit', 0.5831114053726196),
 ('brightest', 0.5734731554985046),
 ('philanthropi', 0.5641298294067383),
 ('discuss', 0.556861162185669),
 ('globalis', 0.555897057056427),
 ('stakehold', 0.5556820631027222),
 ('distinct', 0.5549700856208801)]

In [None]:
# analogy: scientist is to man as __ is to woman (vector arithmetic: scientist - man + woman)
w2v.wv.most_similar(positive=['scientist','woman'],
                 negative=['man'])

[('clinician', 0.652355968952179),
 ('bilingu', 0.5782425403594971),
 ('fás', 0.567655622959137),
 ('autism', 0.5564168691635132),
 ('sandf', 0.5482902526855469),
 ('leisur', 0.546715259552002),
 ('advisor', 0.5436047911643982),
 ('distinct', 0.53558748960495),
 ('athlet', 0.5323788523674011),
 ('horizon', 0.5315958261489868)]

# GloVe





> Train GloVe on your dataset and check most similar words to environment



In [39]:
!pip install glove-python3

Collecting glove-python3
  Downloading glove_python3-0.1.0.tar.gz (326 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.0/327.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: glove-python3
  Building wheel for glove-python3 (setup.py) ... [?25l[?25hdone
  Created wheel for glove-python3: filename=glove_python3-0.1.0-cp310-cp310-linux_x86_64.whl size=1065513 sha256=64db91b479179434533dadb02685018cf0a8eac0f6bcb5b2b7b79863c328e792
  Stored in directory: /root/.cache/pip/wheels/fe/2f/79/34314d44a0907e90e323c8c182ec23f126eb460829e02d98cf
Successfully built glove-python3
Installing collected packages: glove-python3
Successfully installed glove-python3-0.1.0


In [40]:
import itertools
from glove import Corpus, Glove

In [41]:
# Fit the glove-python Corpus on the normalized token lists with window=10
# (presumably this builds the word co-occurrence matrix used below — confirm against the library docs)
corpus = Corpus()
corpus.fit(sentences_normalized, window=10)
# GloVe model configured with 100 components and learning rate 0.05 (not trained yet)
glove = Glove(no_components=100, learning_rate=0.05)

In [42]:
# Train on corpus.matrix for 30 epochs using 4 threads; verbose=True prints one line per epoch
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
# Attach the corpus dictionary so words can be looked up via glove.dictionary[...] / most_similar
glove.add_dictionary(corpus.dictionary)

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [None]:
glove.word_vectors[glove.dictionary['environment']].shape

(100,)

In [None]:
glove.word_vectors[glove.dictionary['environment']]

array([ 1.72157389e-01,  5.24760726e-01,  4.79366011e-01,  2.44504808e-01,
        8.62694774e-02, -4.30167892e-01,  3.50070104e-01,  1.96383570e-01,
       -3.74036851e-02,  9.22049750e-02,  4.17906902e-02,  8.54516579e-02,
       -6.95356852e-02,  1.20410553e-01,  5.61525833e-01, -7.81944678e-02,
       -7.11222706e-02, -1.69982273e-02,  8.98567312e-02, -2.08754969e-01,
        6.95545429e-02, -3.52967573e-01, -4.62215007e-01,  4.16381245e-01,
        2.05167332e-01, -2.51081403e-01, -3.56390643e-01, -2.34925635e-01,
       -4.79705582e-02,  2.10012633e-02, -3.81413661e-01,  6.28390035e-02,
       -1.52516746e-01,  3.80466818e-03, -1.57986222e-01, -1.79531348e-01,
       -3.48323882e-01, -1.68420041e-01, -5.08048309e-02, -1.87594650e-01,
       -1.18914746e-01,  1.26961687e-01, -2.63438630e-01, -3.27405734e-02,
       -5.47137856e-01,  2.11711152e-01,  9.77844705e-04,  6.26988483e-02,
        1.60322921e-01,  1.39169804e-01, -2.14607942e-01, -2.22307912e-01,
       -3.98466476e-02,  

In [None]:
glove.most_similar('environment')

[('impact', 0.6321464721438153),
 ('stewardship', 0.6001896002229782),
 ('sustain', 0.5874697464152396),
 ('ecolog', 0.575077843653751)]



# 2017: Attention is all you need



In [43]:
# Attention Basically
import numpy as np

class Node:
  """One graph node: a 20-d state vector plus its own key/query/value projections."""

  def __init__(self):
    # the vector stored at this node (what is learned, the output of the layer)
    self.data = np.random.randn(20)

    # weights governing how this node interacts with other nodes
    # (drawn in the order key, query, value)
    self.wkey = np.random.randn(20, 20)
    self.wquery = np.random.randn(20, 20)
    self.wvalue = np.random.randn(20, 20)

  def _project(self, weights):
    # apply one 20x20 projection matrix to the current state vector
    return weights @ self.data

  def key(self):
    # what do I have?
    return self._project(self.wkey)

  def query(self):
    # what am I looking for? (for next-word prediction: the text shifted by 1; for classification: the class)
    return self._project(self.wquery)

  def value(self):
    # what do I publicly reveal to others?
    return self._project(self.wvalue)


In [44]:
class Graph:
  """A small random directed graph whose nodes exchange information via attention."""

  def __init__(self):
    # make 10 nodes
    self.nodes=[Node() for _ in range(10)]
    # make 40 random directed edges, each stored as [from_index, to_index]
    randi=lambda: np.random.randint(len(self.nodes))
    self.edges=[[randi(),randi()] for _ in range(40)]

  def run(self):
    """Run one synchronous round of attention-based message passing.

    For every node that has at least one incoming edge, the node's query is
    compared (dot product) with the keys of its input nodes, the scores are
    softmax-normalised, and the weighted sum of the inputs' values is added
    to the node's data (residual connection). All updates are computed from
    the pre-round state and only applied afterwards. Nodes without incoming
    edges are left unchanged.
    """
    # (node, update) pairs; keeping the target node next to its update means
    # skipped nodes cannot shift updates onto the wrong node
    updates=[]
    for i,n in enumerate(self.nodes):

      # what is the node looking for?
      q=n.query()

      # find all edges that are input to this node
      inputs = [self.nodes[ifrom] for (ifrom, ito) in self.edges if ito==i]
      if len(inputs)==0:
        continue # no incoming information: this node keeps its data

      # gather keys, i.e. what the input nodes hold
      keys=[m.key() for m in inputs]
      # calculate compatibilities: dot product of key with query
      scores=np.array([k.dot(q) for k in keys])
      # softmax them so they sum to 1; subtracting the max leaves the result
      # unchanged mathematically but avoids overflow in exp for large scores
      scores=np.exp(scores-np.max(scores))
      scores=scores/np.sum(scores)
      # gather the appropriate values with a weighted sum
      values=[m.value() for m in inputs]
      update=sum(s*v for s,v in zip(scores, values))
      updates.append((n, update))

    for n,u in updates:
      n.data=n.data +u # residual connection



> 1. Create a Graph
2. Print the key, value and query for the 4th node
3. Print the data before running and after running



In [45]:
g=Graph()

In [50]:
g.nodes[0].query()

array([ -2.26009653,  -3.86570119, -10.68306983,   2.10093105,
         8.24468375,   0.32323647,   1.03318534,  -2.5471518 ,
        -3.84820401,  -2.54643845,   2.94953266,   2.28183714,
        -2.9826758 ,   2.54974613,  -6.74749392,   0.87090985,
         6.94793187,   6.238638  ,   4.26249733,  -8.57029551])

In [None]:
g.nodes[0].key()

array([ 6.80817114, -0.16310615,  0.09909053,  6.74913728, -3.27475085,
        1.68078406, -0.59136218, -5.39002545,  8.12771051,  0.04429295,
       -8.38573856, -8.08030379, -5.17804022,  2.67510497,  7.03893079,
       -3.71964299,  4.1559444 ,  4.44817306, -1.81178428,  3.76416691])

In [51]:
g.nodes[0].value()

array([-0.16796905,  3.38603153,  8.87232844,  2.85645858,  3.83613868,
       -8.92490701,  2.09601661,  5.98362151,  0.93092894, -1.53902913,
       -3.77749424, -0.52533548, -1.75477106, -0.49328547, -2.74778709,
       -0.01105748,  5.67891244,  8.33002298,  0.42194456, -0.41449461])

In [None]:
g.nodes[0].query()

array([11.5282681 , -0.40117409, -3.86519441,  3.856318  ,  2.24188279,
        3.19754436, -1.51368747, -3.3892236 , -5.78609625, -0.05760908,
       -3.41750032,  3.31154274,  0.71510075, -1.1500739 , -7.68822842,
       -2.91063493,  3.46468184,  5.48590603,  1.22081941, -4.19792393])

In [52]:
g.nodes[0].data

array([-1.14056506, -1.23900008,  0.53904634,  0.94699827,  1.76198684,
       -0.10946224, -0.04302294,  0.81154406, -0.01547008, -0.2008717 ,
       -0.00457421,  1.98391545,  1.25505927, -0.627131  ,  0.34393967,
        0.13177731,  2.24757624, -0.44489159, -0.82046772, -0.28209497])

In [53]:
g.run()

In [54]:
g.nodes[0].data

array([ 2.97638451, -2.69937069, -0.65788764, -0.31446105,  0.69543204,
        6.45238175,  5.86212886, -1.25776177,  2.4952229 , -0.24442091,
        1.63603448,  0.10129498, -1.95933993, -0.83745152, -2.49197691,
       -6.64497786, -3.26629945, -3.4498158 , -1.6729622 , -4.9820016 ])

# Transformers from scratch

[Documentation](https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial6/Transformers_and_MHAttention.html)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [None]:
# Define the scaled dot-product function
def scaled_dot_product(q, k, v, mask=None):
    """Scaled dot-product attention.

    Computes softmax(q @ k^T / sqrt(d_k)) @ v over the last two dimensions,
    where d_k is the size of the last dimension of ``q``. Positions where
    ``mask == 0`` receive a very large negative logit before the softmax.

    Returns:
        (values, attention): the attended values and the attention weights.
    """
    scale = math.sqrt(q.size()[-1])
    logits = torch.matmul(q, k.transpose(-2, -1)) / scale
    if mask is not None:
        # masked positions end up with (numerically) zero attention weight
        logits = logits.masked_fill(mask == 0, -9e15)
    weights = F.softmax(logits, dim=-1)
    return torch.matmul(weights, v), weights

# Define the expand mask function
def expand_mask(mask):
    """Bring an attention mask to (at least) 4 dimensions.

    A 3-d mask gets a singleton dimension inserted at position 1; a 2-d mask
    gets two leading singleton dimensions. Masks with 4 or more dimensions
    are returned unchanged.
    """
    assert mask.ndim >= 2, "Mask must be at least 2-dimensional with seq_length x seq_length"
    if mask.ndim == 3:
        # insert the singleton dimension right after the first one
        return mask.unsqueeze(1)
    for _ in range(max(0, 4 - mask.ndim)):
        mask = mask.unsqueeze(0)
    return mask

In [None]:
# Define the MultiheadAttention class
class MultiheadAttention(nn.Module):
    """Multi-head self-attention using a single fused query/key/value projection."""

    def __init__(self, input_dim, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be 0 modulo number of heads."
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        # one linear layer produces q, k and v for all heads at once
        self.qkv_proj = nn.Linear(input_dim, 3 * embed_dim)
        # output projection applied after the heads are concatenated
        self.o_proj = nn.Linear(embed_dim, embed_dim)
        self._reset_parameters()

    def _reset_parameters(self):
        # Xavier-uniform weights and zero biases for both projections
        for proj in (self.qkv_proj, self.o_proj):
            nn.init.xavier_uniform_(proj.weight)
            proj.bias.data.fill_(0)

    def forward(self, x, mask=None, return_attention=False):
        """Apply self-attention to ``x`` of shape [batch, seq_length, input_dim].

        Returns the projected output, plus the attention weights when
        ``return_attention`` is true.
        """
        batch_size, seq_length, _ = x.size()
        if mask is not None:
            mask = expand_mask(mask)

        # [batch, seq, 3*embed] -> [batch, seq, heads, 3*head_dim] -> [batch, heads, seq, 3*head_dim]
        fused = self.qkv_proj(x)
        fused = fused.reshape(batch_size, seq_length, self.num_heads, 3 * self.head_dim).permute(0, 2, 1, 3)
        q, k, v = fused.chunk(3, dim=-1)

        values, attention = scaled_dot_product(q, k, v, mask=mask)

        # [batch, heads, seq, head_dim] -> [batch, seq, embed]
        merged = values.permute(0, 2, 1, 3).reshape(batch_size, seq_length, self.embed_dim)
        o = self.o_proj(merged)
        return (o, attention) if return_attention else o

# Define the EncoderBlock class
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention and a feed-forward net, each followed by
    dropout, a residual add and a LayerNorm."""

    def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
        super().__init__()
        # attention keeps the model width: input_dim -> input_dim
        self.self_attn = MultiheadAttention(input_dim, input_dim, num_heads)

        # position-wise feed-forward network (layer order kept exactly as before)
        feedforward_layers = [
            nn.Linear(input_dim, dim_feedforward),
            nn.Dropout(dropout),
            nn.ReLU(inplace=True),
            nn.Linear(dim_feedforward, input_dim),
        ]
        self.linear_net = nn.Sequential(*feedforward_layers)

        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # attention sub-layer: residual add, then normalise
        x = self.norm1(x + self.dropout(self.self_attn(x, mask=mask)))
        # feed-forward sub-layer: residual add, then normalise
        x = self.norm2(x + self.dropout(self.linear_net(x)))
        return x

# Define the TransformerEncoder class
class TransformerEncoder(nn.Module):
    """A stack of ``num_layers`` EncoderBlock layers applied one after another."""

    def __init__(self, num_layers, **block_args):
        super().__init__()
        # every layer is built with the same keyword arguments
        self.layers = nn.ModuleList([EncoderBlock(**block_args) for _ in range(num_layers)])

    def forward(self, x, mask=None):
        """Pass ``x`` through every layer in order, handing ``mask`` to each one."""
        for l in self.layers:
            x = l(x, mask=mask)
        return x

    def get_attention_maps(self, x, mask=None):
        """Return one attention map per layer for input ``x``.

        Each layer's map is taken from its ``self_attn`` module on that
        layer's input, after which ``x`` is advanced through the full layer.
        """
        attention_maps = []
        for l in self.layers:
            _, attn_map = l.self_attn(x, mask=mask, return_attention=True)
            attention_maps.append(attn_map)
            # propagate with the same mask as forward(); without it, the maps of
            # later layers would be computed on activations that attended to
            # masked positions
            x = l(x, mask=mask)
        return attention_maps

# Define the PositionalEncoding class
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a [batch, seq, d_model] input."""

    def __init__(self, d_model, max_len=5000):
        """
        Args:
            d_model: size of the last dimension of the inputs (even or odd).
            max_len: number of positions for which encodings are precomputed.
        """
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # for odd d_model there is one cosine column fewer than sine columns,
        # so trim the angles to match (a no-op when d_model is even)
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)
        # buffer: follows the module across devices but is not stored in the state_dict
        self.register_buffer('pe', pe, persistent=False)

    def forward(self, x):
        # add the encodings of the first x.size(1) positions (broadcast over the batch)
        x = x + self.pe[:, :x.size(1)]
        return x




In [None]:
# Define the Transformer-based text classifier
class TransformerClassifier(nn.Module):
    """Text classifier: token embedding + positional encoding + Transformer encoder,
    followed by a two-layer head applied to the first sequence position."""

    def __init__(self, vocab_size, d_model, num_heads, num_layers, dim_feedforward, num_classes, max_len=5000, dropout=0.1):
        super().__init__()
        # index 0 is declared as the padding id of the embedding table
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.positional_encoding = PositionalEncoding(d_model, max_len)

        encoder_kwargs = dict(
            num_layers=num_layers,
            input_dim=d_model,
            num_heads=num_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.encoder = TransformerEncoder(**encoder_kwargs)

        # classification head
        self.pre_classifier = nn.Linear(d_model, d_model)
        self.classifier = nn.Linear(d_model, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, mask=None):
        """Map integer ``input_ids`` to class logits, one row per batch element."""
        embedded = self.positional_encoding(self.embedding(input_ids))
        encoded = self.encoder(embedded, mask=mask)
        # the representation at position 0 is used as the sequence summary
        # (the role a [CLS] token plays; this module itself inserts no special token)
        pooled = encoded[:, 0]
        hidden = self.dropout(F.relu(self.pre_classifier(pooled)))
        return self.classifier(hidden)


In [None]:

# Example usage: build an untrained classifier and push one random sequence through it
vocab_size = 30522  # Vocabulary size (BERT's vocab size)
d_model = 768  # Embedding size
num_heads = 12  # Number of attention heads
num_layers = 6  # Number of transformer layers
dim_feedforward = 3072  # Feedforward network hidden size
num_classes = 2  # Number of output classes (e.g., binary classification)
max_len = 512  # Maximum sequence length
dropout = 0.1  # Dropout rate

# weights are freshly initialised here and never trained in this cell, so the logits carry no meaning
model = TransformerClassifier(vocab_size, d_model, num_heads, num_layers, dim_feedforward, num_classes, max_len, dropout)

# Assume we have a tokenizer that converts sentences to input_ids
# For demonstration, using random input_ids
input_ids = torch.randint(0, vocab_size, (1, max_len))  # Batch size 1, sequence length max_len
# True where the id is not 0 (the padding id); two unsqueezes turn [batch, seq] into [batch, 1, 1, seq]
mask = (input_ids != 0).unsqueeze(1).unsqueeze(2)  # Mask for non-padding tokens

# NOTE(review): the model is never switched to eval mode here, so dropout is presumably active — confirm
logits = model(input_ids, mask)
print(logits)

tensor([[0.1719, 0.2699]], grad_fn=<AddmmBackward0>)


In [None]:
mask.shape

torch.Size([1, 1, 1, 512])

In [None]:
#batch size, seq length
input_ids.shape


#In the attention mechanism,
#the mask needs to be broadcastable to the shape [batch_size, num_heads, seq_length, seq_length]
#[batch_size, 1, seq_length, seq_length]: the num_heads will be matched in the code by the number of heads

torch.Size([1, 512])

# Transformers: Hugging Face

[Hugging Face](https://huggingface.co/docs/transformers/main_classes/tokenizer)

In [None]:
#!pip install transformers
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# gpu or cpu?
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print (device)

cpu


In [None]:
model_name = 'distilbert-base-uncased' # huggingface model_ID or path to folder
model = DistilBertForSequenceClassification.from_pretrained(model_name)
print (model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 



> Apply that to our dataset



In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
# tokenize the first sentence of our corpus; the frame loaded earlier is df_source_corpus (there is no `df`)
inputs = tokenizer(df_source_corpus.iloc[0]['text'], return_tensors="pt")
print(inputs)

In [None]:
# tokenize every sentence of our corpus as one padded / truncated batch
inputs = tokenizer(df_source_corpus['text'].tolist(), return_tensors="pt", padding=True, truncation=True)
# encode the topic_8 labels as integer class ids; topic_names[i] is the label behind id i
# (pd.factorize assigns -1 to missing labels, so check for them before training)
label_ids, topic_names = pd.factorize(df_source_corpus['topic_8'])
labels = torch.tensor(label_ids).long()
print(inputs, labels)



> Manifesto Berta

[Documentation](https://manifesto-project.wzb.eu/information/documents/manifestoberta)



In [None]:
# NOTE: this cell rebinds `model` and `tokenizer`, replacing the DistilBERT objects created earlier
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# the fine-tuned classifier comes from the manifesto-project repo; the tokenizer is loaded from xlm-roberta-large
model = AutoModelForSequenceClassification.from_pretrained("manifesto-project/manifestoberta-xlm-roberta-56policy-topics-sentence-2023-1-1")
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

sentence = "We will restore funding to the Global Environment Facility and the Intergovernmental Panel on Climate Change, to support critical climate science research around the world"

# pad / truncate the single sentence to exactly 200 tokens and return PyTorch tensors
inputs = tokenizer(sentence,
                   return_tensors="pt",
                   max_length=200,  #we limited the input to 200 tokens during finetuning
                   padding="max_length",
                   truncation=True
                   )

logits = model(**inputs).logits

# softmax over the class dimension, then take the (only) row of the batch as a plain list
probabilities = torch.softmax(logits, dim=1).tolist()[0]
# map class index -> label name, expressed as a percentage rounded to 2 decimals
probabilities = {model.config.id2label[index]: round(probability * 100, 2) for index, probability in enumerate(probabilities)}
# order the labels from most to least probable
probabilities = dict(sorted(probabilities.items(), key=lambda item: item[1], reverse=True))
print(probabilities)
# {'501 - Environmental Protection: Positive': 67.28, '411 - Technology and Infrastructure': 15.19, '107 - Internationalism: Positive': 13.63, '416 - Anti-Growth Economy: Positive': 2.02...

# label of the highest-scoring class
predicted_class = model.config.id2label[logits.argmax().item()]
print(predicted_class)
# 501 - Environmental Protection: Positive
