Теория - https://habr.com/ru/companies/otus/articles/702838/

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
import pandas as pd
import numpy as np
import nltk
from string import punctuation
import re
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
# Fetch the NLTK resources used below: the stop-word corpus and the 'punkt'
# tokenizer data.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Tokens dropped during preprocessing: the English stop words followed by every
# character of string.punctuation.
stop_words = [*nltk.corpus.stopwords.words('english'), *punctuation]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Peek at the first 20 entries of the stop-word list.
stop_words[:20]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

In [4]:
# Read the tweets CSV and discard every row that has a missing value, then show
# the first rows.
TWEETS_CSV = '/content/drive/MyDrive/Machine Learning (ITHUB) /ДИ 2023/lessons/data/Tweets.csv'
df = pd.read_csv(TWEETS_CSV).dropna()
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [5]:
# Number of tweets per sentiment label in the loaded data.
df['sentiment'].value_counts()

neutral     11117
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [6]:
# Training is slow, so keep only the first 5000 rows (positional slice).
df = df.iloc[:5000]

In [7]:
# Number of tweets per sentiment label after taking the 5000-row slice.
df['sentiment'].value_counts()

neutral     2022
positive    1593
negative    1385
Name: sentiment, dtype: int64

In [8]:
# Collapse the three text labels into a binary target:
# 1 for 'positive', 0 for both 'neutral' and 'negative'.
label_to_target = {'positive': 1, 'neutral': 0, 'negative': 0}
df['sentiment'] = df['sentiment'].map(label_to_target)

In [9]:
def preprocesing_text(text : str) -> str:
    """Normalise a raw tweet into a space-joined string of filtered tokens.

    Steps, in order: lower-case the text; replace @mentions and http(s) URLs
    with a space; replace every character other than ASCII letters, digits and
    ``. ! ? '`` with a space; collapse runs of spaces; tokenise with
    ``word_tokenize`` and drop tokens found in the module-level ``stop_words``.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text. A non-string input (for example a float NaN from a
        missing cell) yields an empty string.

    Raises:
        Whatever ``word_tokenize`` raises. Errors are no longer printed and
        swallowed, so a broken setup fails loudly instead of silently producing
        half-processed text for every row.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    # The range must be spelled A-Za-z: "A-z" also spans [ \ ] ^ _ and `,
    # which let those symbols survive the cleanup. Tabs and newlines are not in
    # the allowed set, so they become spaces here as well.
    text = re.sub(r"[^A-Za-z.!?'0-9]", ' ', text)
    text = re.sub(r" +", ' ', text).strip(' ')

    return ' '.join(token for token in word_tokenize(text) if token not in stop_words)

# Clean every raw tweet and store the result in a new 'tweets' column.
df['tweets'] = df['text'].apply(preprocesing_text)

In [10]:
# Keep only the cleaned text ('tweets') and the binary target ('sentiment'),
# then print a summary of the resulting frame.
df = df[['tweets','sentiment']]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 5000
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweets     5000 non-null   object
 1   sentiment  5000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 117.2+ KB


In [11]:
# Hold out 20% of the rows for testing (fixed random_state) and give both parts
# a fresh 0..n-1 index.
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=0)
)

train.shape, test.shape

((4000, 2), (1000, 2))

In [12]:
import torch

class TweetsDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenises one tweet per item.

    Args:
        tweets: Sequence (list, array or pandas Series) of tweet texts.
        targets: Sequence of labels aligned with ``tweets``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len: Length every encoded sequence is padded / truncated to.

    Raises:
        ValueError: If ``tweets`` and ``targets`` differ in length.
    """

    def __init__(self,tweets, targets, tokenizer, max_len):
        # Materialise as lists so items are addressed by position. Indexing a
        # pandas Series with ``series[i]`` is label-based and fails (or picks
        # another row) whenever the index is not exactly 0..n-1.
        self.tweets = list(tweets)
        self.targets = list(targets)
        if len(self.tweets) != len(self.targets):
            raise ValueError(
                f"tweets and targets differ in length: "
                f"{len(self.tweets)} != {len(self.targets)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of tweets."""
        return len(self.tweets)

    def __getitem__(self,item):
        """Encode the tweet at position ``item``.

        Returns:
            A dict with 1-D ``input_ids`` and ``attention_mask`` long tensors
            and the matching entry of ``targets`` under ``'targets'``.
        """
        review = str(self.tweets[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(review,
                                              add_special_tokens=True,
                                              max_length=self.max_len,
                                              truncation=True,
                                              return_token_type_ids=False,
                                              padding='max_length',
                                              return_attention_mask=True,
                                              return_tensors='pt',
                                              )

        # With return_tensors='pt' the tokenizer output carries a leading batch
        # axis of size 1; flatten it so that a DataLoader stacks items into
        # (batch, seq_len) rather than (batch, 1, seq_len).
        return {'input_ids':torch.as_tensor(encoding['input_ids'],dtype=torch.long).flatten(),
                'attention_mask':torch.as_tensor(encoding['attention_mask'], dtype=torch.long).flatten(),
                'targets':target}

In [13]:
from torch.utils.data import DataLoader

def create_data_loader(df,tokenizer,max_len,params):
    """Build a ``DataLoader`` over the tweets stored in ``df``.

    Args:
        df: Frame exposing ``tweets`` (texts) and ``sentiment`` (labels).
        tokenizer: Tokenizer handed to ``TweetsDataset``.
        max_len: Sequence length handed to ``TweetsDataset``.
        params: Mapping forwarded unchanged as keyword arguments to
            ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    dataset = TweetsDataset(
        tweets=df.tweets,
        targets=df.sentiment,
        tokenizer=tokenizer,
        max_len=max_len,
    )
    loader = DataLoader(dataset, **params)
    return loader

In [14]:
# Keyword arguments forwarded to every DataLoader built in this notebook.
PARAMS = {'batch_size': 4,
          'shuffle': True,
          'num_workers': 0
          }

MAX_LEN = 512            # tokens per tweet after padding / truncation
EPOCHS = 2               # passes over the training set
LEARNING_RATE = 1e-05    # Adam step size
SEED = 0

# Seed PyTorch's global RNG so that batch shuffling, dropout and the random
# initialisation of the classifier head start from the same state on every
# Restart & Run All.
torch.manual_seed(SEED)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [15]:
from transformers import DistilBertTokenizer, DistilBertModel
# Tokenizer for the pretrained 'distilbert-base-uncased' checkpoint (the same
# checkpoint name the model class loads).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [16]:
# The training loader shuffles (PARAMS['shuffle'] is True). The evaluation
# loader overrides that: shuffling held-out data has no benefit, and a fixed
# order keeps evaluation batches repeatable.
train_data_loader = create_data_loader(train,tokenizer,MAX_LEN,PARAMS)
test_data_loader = create_data_loader(test,tokenizer,MAX_LEN,{**PARAMS, 'shuffle': False})

In [17]:
# Pull one batch from the test loader and list the keys it provides.
data = next(iter(test_data_loader))
data.keys()

dict_keys(['input_ids', 'attention_mask', 'targets'])

In [18]:
class DistillBERTClass(torch.nn.Module):
    """Pretrained DistilBERT encoder with a two-class classification head.

    The head is Linear(768, 768) -> ReLU -> Dropout(0.3) -> Linear(768, 2),
    applied to the encoder output at sequence position 0.
    """

    def __init__(self):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        """Return the raw (unnormalised) scores for the two classes."""
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, taken at sequence position 0.
        first_token = encoder_out[0][:, 0]
        hidden = torch.relu(self.pre_classifier(first_token))
        return self.classifier(self.dropout(hidden))

In [19]:
# Build the classifier, move it to the selected device and display its layers.
model = DistillBERTClass().to(device)
model

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

In [20]:
# Cross-entropy loss on the model's class scores; Adam updates every model
# parameter with the learning rate from the configuration cell.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [21]:
from sklearn.metrics import accuracy_score
def train_model(data_loader,device,params):
    """Run one training epoch over ``data_loader``.

    Operates on the module-level ``model``, ``optimizer`` and
    ``loss_function``.

    Args:
        data_loader: Iterable of batches, each a dict with 'input_ids',
            'attention_mask' and 'targets' tensors.
        device: Device the batch tensors are moved to.
        params: Unused; kept so existing calls keep working. The batch size is
            taken from each batch instead of ``params['batch_size']``.

    Returns:
        ``(accuracy, mean_loss)``: the fraction of correctly classified samples
        over the whole epoch and the mean of the per-batch losses.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    losses = []
    n_correct = 0
    n_seen = 0

    model.train()
    for data in data_loader:
        optimizer.zero_grad()

        # Bring both tensors to (batch, seq_len) using the batch's own size, so
        # a smaller final batch works and a (batch, 1, seq_len) layout is
        # flattened for the mask as well as for the ids.
        ids = data['input_ids']
        ids = ids.reshape(-1, ids.shape[-1]).to(device, dtype = torch.long)
        mask = data['attention_mask']
        mask = mask.reshape(-1, mask.shape[-1]).to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        predictions = outputs.detach().argmax(dim=1)
        # Count samples instead of averaging per-batch accuracies, which would
        # over-weight a smaller final batch.
        n_correct += (predictions == targets).sum().item()
        n_seen += targets.numel()

    if n_seen == 0:
        raise ValueError("data_loader yielded no samples")

    return n_correct / n_seen, np.mean(losses)

In [22]:
# Train for EPOCHS passes over the training loader and report the accuracy and
# mean loss returned for each pass.
for epoch in range(EPOCHS):
    accuracy, loss = train_model(train_data_loader,device,PARAMS)
    print(f"Epoch - {epoch}, Accuracy - {accuracy}, Loss - {loss}")

Epoch - 0, Accuracy - 0.8075, Loss - 0.4372141460441053
Epoch - 1, Accuracy - 0.89075, Loss - 0.2861591386795044


In [23]:
def eval_model(model, data_loader, device,params):
    """Evaluate ``model`` on ``data_loader`` with gradients disabled.

    The loss is computed with the module-level ``loss_function``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; it is
            switched to evaluation mode.
        data_loader: Iterable of batches, each a dict with 'input_ids',
            'attention_mask' and 'targets' tensors.
        device: Device the batch tensors are moved to.
        params: Unused; kept so existing calls keep working. The batch size is
            taken from each batch instead of ``params['batch_size']``.

    Returns:
        ``(accuracy, mean_loss)``: the fraction of correctly classified samples
        and the mean of the per-batch losses.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model = model.eval()
    losses = []
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for data in data_loader:
            # Bring both tensors to (batch, seq_len) using the batch's own
            # size, so a smaller final batch does not break the reshape.
            input_ids = data["input_ids"]
            input_ids = input_ids.reshape(-1, input_ids.shape[-1]).to(device)
            attention_mask = data["attention_mask"]
            attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]).to(device)
            targets = data["targets"].to(device)

            outputs = model(input_ids, attention_mask)
            losses.append(loss_function(outputs, targets).item())

            predictions = outputs.argmax(dim=1)
            # Count samples instead of averaging per-batch accuracies, which
            # would over-weight a smaller final batch.
            n_correct += (predictions == targets).sum().item()
            n_seen += targets.numel()

    if n_seen == 0:
        raise ValueError("data_loader yielded no samples")

    return n_correct / n_seen, np.mean(losses)

In [24]:
# Accuracy and mean loss of the trained model on the held-out test loader.
acc, loss = eval_model(model, test_data_loader, device, PARAMS)
acc, loss

(0.87, 0.3269175323247909)