In [1]:
import os

import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer
import torch
from pathlib import Path
import numpy as np

In [2]:
torch.cuda.is_available()

True

In [3]:
MODEL = '../../Sentiment_Analysis/turkish-sentiment-XMLRoBERTa/2_Label_Twitter/'

In [4]:
tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-xlm-roberta-base', use_fast=True)

In [5]:
data = pd.read_csv(f'../../StockAndTweetDataExtract/THYAO/2021/tweet_part0.csv')
for i in range(1,6):
    data = pd.concat([data, pd.read_csv(f'../../StockAndTweetDataExtract/THYAO/2021/tweet_part{i}.csv')])

In [6]:
data.drop_duplicates(subset=['Tweet'])
data

Unnamed: 0,Date,User,Tweet
0,2021-02-28 23:37:48+00:00,guneshakan73,#thyao haftalık dolar bazı şeyleri aramaya ger...
1,2021-02-28 23:36:26+00:00,bogagrafikleri,"#thyao eğitim çalışmasıdır, yatırım tavsiyesi ..."
2,2021-02-28 23:31:03+00:00,ParaBorsaNet,"OKAN ÖZDEMİR: ASELS, GARAN VE THYAO GRAFIK ANA..."
3,2021-02-28 23:25:22+00:00,sezgin_akyol,#thyao haftalık (usd) https://t.co/YzqImOZCc9
4,2021-02-28 23:08:24+00:00,TraderXman,Bu hafta #bist için yön belirleme haftası olac...
...,...,...,...
9283,2021-11-01 05:55:07+00:00,nusret254,Günaydın Dostlar\n\nEkim ayı toplam kar-zarar'...
9284,2021-11-01 05:37:36+00:00,DevreKesici,"Günaydın arkadaşlar,güzel bir gün geçirmeniz d..."
9285,2021-11-01 05:28:57+00:00,gazetebankacom,Kasım ayının ilk hisse önerileri listesi: ICBC...
9286,2021-11-01 05:05:16+00:00,TELATBEK,#kords. #mavı. #thyao. #ykbnk. Günlük teknik a...


In [7]:
data_p = data.Tweet.apply(lambda x: tokenizer(x, truncation=True, padding='max_length', max_length=256))
data_p

0       [input_ids, attention_mask]
1       [input_ids, attention_mask]
2       [input_ids, attention_mask]
3       [input_ids, attention_mask]
4       [input_ids, attention_mask]
                   ...             
9283    [input_ids, attention_mask]
9284    [input_ids, attention_mask]
9285    [input_ids, attention_mask]
9286    [input_ids, attention_mask]
9287    [input_ids, attention_mask]
Name: Tweet, Length: 61458, dtype: object

In [8]:
data['Tweet'] = data_p
data_p = 0

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)
classifier = Trainer(model)

In [10]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings['Tweet']

    def __getitem__(self, idx):
        item = {"input_ids": torch.tensor(self.encodings.iloc[idx]["input_ids"])}
        item['attention_mask'] = torch.tensor(self.encodings.iloc[idx]["attention_mask"])
        return item

    def __len__(self):
        return len(self.encodings)

dataset = MyDataset(data)

In [11]:
test_preds_raw, test_labels , _ = classifier.predict(dataset)

***** Running Prediction *****
  Num examples = 61458
  Batch size = 8


In [12]:
final_data = pd.concat([data['Date'].reset_index(drop=True), pd.DataFrame(np.argmax(test_preds_raw, axis=-1), columns=['Sentiment'])], axis=1)
final_data

Unnamed: 0,Date,Sentiment
0,2021-02-28 23:37:48+00:00,0
1,2021-02-28 23:36:26+00:00,0
2,2021-02-28 23:31:03+00:00,1
3,2021-02-28 23:25:22+00:00,1
4,2021-02-28 23:08:24+00:00,1
...,...,...
61453,2021-11-01 05:55:07+00:00,1
61454,2021-11-01 05:37:36+00:00,1
61455,2021-11-01 05:28:57+00:00,1
61456,2021-11-01 05:05:16+00:00,1


In [13]:
final_data.to_csv('2021.csv')