In [1]:
import os

import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer
import torch
from pathlib import Path
import numpy as np

In [2]:
torch.cuda.is_available()

True

In [3]:
MODEL = '../../Sentiment_Analysis/turkish-sentiment-XMLRoBERTa/2_Label_Twitter/'

In [4]:
tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-xlm-roberta-base', use_fast=True)

In [5]:
data = pd.read_csv(f'../../StockAndTweetDataExtract/THYAO/2022/tweet_part0.csv')
for i in range(1,6):
    data = pd.concat([data, pd.read_csv(f'../../StockAndTweetDataExtract/THYAO/2022/tweet_part{i}.csv')])

In [6]:
data.drop_duplicates(subset=['Tweet'])
data

Unnamed: 0,Date,User,Tweet
0,2022-02-28 23:39:13+00:00,mrtglrx,#thyao herkes rusyaya hava sahasını kapatmışke...
1,2022-02-28 22:10:28+00:00,sarigulHalill,Biz bunu kendi yanımızda olanlara zamanında ya...
2,2022-02-28 21:47:32+00:00,tolgaserdar3,#thyao 144 milyonluk Rusya için tüm euro desti...
3,2022-02-28 21:47:28+00:00,Nazende2011,#thyao \nBeyin fırtınası yapalım. Tamamen bilg...
4,2022-02-28 21:40:57+00:00,kydostubist,#thyao https://t.co/HOPAPFnSHY
...,...,...,...
449,2022-11-01 00:29:38+00:00,TraderBeyy,#THYAO \n\n4 SAATLİK GRAFİK\n\n98.50 DESTEK NO...
450,2022-11-01 00:19:37+00:00,ariftaskaya07,#Vkgyo tekrardan listemde. 2 saatlikte bir dön...
451,2022-11-01 00:16:49+00:00,Kenan221947581,#BRKSN yazılan çizilen hiçbiri utandırmiyor ma...
452,2022-11-01 00:07:58+00:00,Kenan221947581,#BOBET daha dün yazdık çizdik bugün maşallahli...


In [7]:
data_p = data.Tweet.apply(lambda x: tokenizer(x, truncation=True, padding='max_length', max_length=256))
data_p

0      [input_ids, attention_mask]
1      [input_ids, attention_mask]
2      [input_ids, attention_mask]
3      [input_ids, attention_mask]
4      [input_ids, attention_mask]
                  ...             
449    [input_ids, attention_mask]
450    [input_ids, attention_mask]
451    [input_ids, attention_mask]
452    [input_ids, attention_mask]
453    [input_ids, attention_mask]
Name: Tweet, Length: 66537, dtype: object

In [8]:
data['Tweet'] = data_p
data_p = 0

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)
classifier = Trainer(model)

In [10]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings['Tweet']

    def __getitem__(self, idx):
        item = {"input_ids": torch.tensor(self.encodings.iloc[idx]["input_ids"])}
        item['attention_mask'] = torch.tensor(self.encodings.iloc[idx]["attention_mask"])
        return item

    def __len__(self):
        return len(self.encodings)

dataset = MyDataset(data)

In [11]:
test_preds_raw, test_labels , _ = classifier.predict(dataset)

***** Running Prediction *****
  Num examples = 66537
  Batch size = 8


In [12]:
final_data = pd.concat([data['Date'].reset_index(drop=True), pd.DataFrame(np.argmax(test_preds_raw, axis=-1), columns=['Sentiment'])], axis=1)
final_data

Unnamed: 0,Date,Sentiment
0,2022-02-28 23:39:13+00:00,0
1,2022-02-28 22:10:28+00:00,0
2,2022-02-28 21:47:32+00:00,0
3,2022-02-28 21:47:28+00:00,0
4,2022-02-28 21:40:57+00:00,1
...,...,...
66532,2022-11-01 00:29:38+00:00,0
66533,2022-11-01 00:19:37+00:00,1
66534,2022-11-01 00:16:49+00:00,1
66535,2022-11-01 00:07:58+00:00,1


In [13]:
final_data.to_csv('2022.csv')