In [1]:
import torch
import os
import re
import shutil
import string
from collections import Counter

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

In [2]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_url(text): 
    url_pattern  = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return url_pattern.sub(r'', text)
 # converting return value from list to string



def clean_text(text ): 
    delete_dict = {sp_character: '' for sp_character in string.punctuation} 
    delete_dict[' '] = ' ' 
    table = str.maketrans(delete_dict)
    text1 = text.translate(table)
    #print('cleaned:'+text1)
    textArr= text1.split()
    text2 = ' '.join([w for w in textArr if ( not w.isdigit() and  ( not w.isdigit() and len(w)>2))]) 
    
    return text2.lower()

In [3]:
def get_sentiment(sentiment):
    if sentiment == -1:
        return 0
    elif sentiment == 1:
        return 1
    else:
        return 2

In [4]:
train_data= pd.read_excel("7.xlsx")
train_data.dropna(axis = 0, how ='any',inplace=True) 
train_data['Num_words_text'] = train_data['news'].apply(lambda x:len(str(x).split())) 
mask = train_data['Num_words_text'] >2
train_data = train_data[mask]
print('-------Train data--------')
print(train_data['news'].value_counts())
print(len(train_data))
print('-------------------------')
max_train_sentence_length  = train_data['Num_words_text'].max()


train_data['news'] = train_data['news'].apply(remove_emoji)
train_data['news'] = train_data['news'].apply(remove_url)
train_data['news'] = train_data['news'].apply(clean_text)

train_data['label'] = train_data['class'].apply(get_sentiment)
print('Train Max Sentence Length :'+str(max_train_sentence_length))
#print('Test Max Sentence Length :'+str(max_test_sentence_length))

-------Train data--------
news
 !Пожилого уфимца, отсидевшего за чужое преступление 13 лет, реабилитировали Фатхулла Исхаков отсидел 13 лет за чужое преступление и полвека добивался оправдания. Как сообщается в группе телепроекта [club187046624|ЧЕСТНО ГОВОРЯ | новости Уфы и Башкирии], впервые в истории страны дело с истёкшей 50 лет назад исковой давностью сначала было передано в Шестой кассационный суд, а затем и в Верховный суд России. Решение об отмене приговора 1959 года принял сегодня Верховный суд Российской Федерации. Конституционный суд России 21 декабря 2021 года разрешил судам в отдельных особых случаях пересматривать приговоры по новым обстоятельствам вопреки позиции прокурора. Соответствующее постановление в ответ на жалобу осужденного в 1959 году жителя Уфы Фатхуллы Исхакова опубликовали на официальном сайте суда. Уфимец был приговорен к лишению свободы по обвинению в покушении на убийство. 13 лет он провел в колонии. Мужчина утверждал, что ни в чем не виноват, и после осво

In [5]:
train_data.head(10)

Unnamed: 0,news,class,Num_words_text,label
0,кигинском районе башкирии откроют филиал парка...,1,134,1
1,многодетная мама уфы помогает малышам экстрема...,1,158,1
2,башкирия получила развитие туризма млн рублей ...,1,126,1
3,уфе состоится ночной забег июля уфе стартует н...,1,78,1
4,башкирию поступили обновленные 100рублевые бан...,1,28,1
5,писателю башкирии присудили престижную междуна...,1,81,1
6,самый известный таксист уфы собрал для онкобол...,1,174,1
7,уфимские предприниматели помогают ремонтироват...,1,200,1
8,глава башкирии подписал указ праздновании 100л...,1,82,1
9,уфимка мечтающая стать врачом сдала «химию» ба...,1,68,1


In [6]:
X_train, X_valid, Y_train, Y_valid= train_test_split(train_data['news'].tolist(),\
                                                      train_data['label'].tolist(),\
                                                      test_size=0.2,
                                                      stratify = train_data['label'].tolist(),
                                                      random_state=0)


print('Train data len:'+str(len(X_train)))
print('Class distribution'+str(Counter(Y_train)))


print('Valid data len:'+str(len(X_valid)))
print('Class distribution'+ str(Counter(Y_valid)))


train_dat =list(zip(Y_train,X_train))
valid_dat =list(zip(Y_valid,X_valid))


Train data len:540
Class distributionCounter({1: 241, 2: 182, 0: 117})
Valid data len:135
Class distributionCounter({1: 61, 2: 45, 0: 29})


In [7]:
import torch
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device='CPU'

  return torch._C._cuda_getDeviceCount() > 0


In [8]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('spacy', language='ru_core_news_sm')
train_iter = train_dat
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])


conda install -c conda-forge spacy-model-ru_core_news_sm

In [9]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) 

In [10]:
text_pipeline('Трогательное и нежное стихотворение о любви')

[0, 0, 0, 0, 0, 797]

In [11]:


label_pipeline('1')



1

In [12]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
         label_list.append(label_pipeline(_label))
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)


In [13]:
from torch import nn
import torch.nn.functional as F

class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc1 = nn.Linear(embed_dim,64)
        self.fc2 = nn.Linear(64,16)
        self.fc3 = nn.Linear(16, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc1.weight.data.uniform_(-initrange, initrange)
        self.fc1.bias.data.zero_()
        self.fc2.weight.data.uniform_(-initrange, initrange)
        self.fc2.bias.data.zero_()
        self.fc3.weight.data.uniform_(-initrange, initrange)
        self.fc3.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        x = F.relu(self.fc1(embedded))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


In [14]:
!CUDA_LAUNCH_BLOCKING=1

In [15]:
train_iter1 = train_dat
num_class = len(set([label for (label, text) in train_iter1]))
print(num_class)
vocab_size = len(vocab)
emsize = 128
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)


3


In [39]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        #print(1)
        optimizer.zero_grad()
        #print(2)
        predited_label = model(text, offsets)
        #print(3)
        loss = criterion(predited_label, label)
        #print(4)
        loss.backward()
        #print(5)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        #print(6)
        optimizer.step()
        
        total_acc += (predited_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predited_label = model(text, offsets)
            loss = criterion(predited_label, label)
            total_acc += (predited_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count


In [40]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
# Hyperparameters
EPOCHS = 20 # epoch
LR =10  # learning rate
BATCH_SIZE = 4 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

train_iter2 = train_dat
valid_iter2= valid_dat




train_dataloader = DataLoader(train_iter2, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_iter2, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)


for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

NotImplementedError: Could not run 'aten::_foreach_norm.Scalar' with arguments from the 'SparseCPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::_foreach_norm.Scalar' is only available for these backends: [CPU, CUDA, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradHIP, AutogradXLA, AutogradMPS, AutogradIPU, AutogradXPU, AutogradHPU, AutogradVE, AutogradLazy, AutogradMTIA, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, AutogradMeta, AutogradNestedTensor, Tracer, AutocastCPU, AutocastCUDA, FuncTorchBatched, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].

CPU: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/build/aten/src/ATen/RegisterCPU.cpp:31188 [kernel]
CUDA: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/build/aten/src/ATen/RegisterCUDA.cpp:44143 [kernel]
BackendSelect: fallthrough registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/core/PythonFallbackKernel.cpp:153 [backend fallback]
FuncTorchDynamicLayerBackMode: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/functorch/DynamicLayer.cpp:498 [backend fallback]
Functionalize: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/FunctionalizeFallbackKernel.cpp:290 [backend fallback]
Named: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/ConjugateFallback.cpp:17 [backend fallback]
Negative: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/native/NegateFallback.cpp:19 [backend fallback]
ZeroTensor: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/ZeroTensorFallback.cpp:86 [backend fallback]
ADInplaceOrView: fallthrough registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/core/VariableFallbackKernel.cpp:86 [backend fallback]
AutogradOther: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
AutogradCPU: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
AutogradCUDA: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
AutogradHIP: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
AutogradXLA: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
AutogradMPS: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
AutogradIPU: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
AutogradXPU: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
AutogradHPU: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
AutogradVE: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
AutogradLazy: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
AutogradMTIA: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
AutogradPrivateUse1: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
AutogradPrivateUse2: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
AutogradPrivateUse3: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
AutogradMeta: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
AutogradNestedTensor: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/VariableType_2.cpp:18610 [autograd kernel]
Tracer: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/torch/csrc/autograd/generated/TraceType_2.cpp:17079 [kernel]
AutocastCPU: fallthrough registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/autocast_mode.cpp:382 [backend fallback]
AutocastCUDA: fallthrough registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/autocast_mode.cpp:249 [backend fallback]
FuncTorchBatched: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:710 [backend fallback]
FuncTorchVmapMode: fallthrough registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/functorch/VmapModeRegistrations.cpp:28 [backend fallback]
Batched: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/LegacyBatchingRegistrations.cpp:1075 [backend fallback]
VmapMode: fallthrough registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]
FuncTorchGradWrapper: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/functorch/TensorWrapper.cpp:203 [backend fallback]
PythonTLSSnapshot: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/core/PythonFallbackKernel.cpp:161 [backend fallback]
FuncTorchDynamicLayerFrontMode: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/functorch/DynamicLayer.cpp:494 [backend fallback]
PreDispatch: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/core/PythonFallbackKernel.cpp:165 [backend fallback]
PythonDispatcher: registered at /opt/conda/conda-bld/pytorch_1695392035891/work/aten/src/ATen/core/PythonFallbackKernel.cpp:157 [backend fallback]
