In [1]:
import os

import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer
import torch
from pathlib import Path
import numpy as np

In [2]:
torch.cuda.is_available()

True

In [3]:
MODEL = '../../Sentiment_Analysis/turkish-sentiment-XMLRoBERTa/2_Label_Twitter/'

In [4]:
tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-xlm-roberta-base', use_fast=True)

In [5]:
data = pd.read_csv(f'../../StockAndTweetDataExtract/Bist100/2021/tweet_part0.csv')
for i in range(1,6):
    data = pd.concat([data, pd.read_csv(f'../../StockAndTweetDataExtract/Bist100/2021/tweet_part{i}.csv')])

In [6]:
data.drop_duplicates(subset=['Tweet'])
data

Unnamed: 0,Date,User,Tweet
0,2021-02-28 23:38:10+00:00,bogagrafikleri,"#bıst100 #endeks eğitim çalışmasıdır, yatırım ..."
1,2021-02-28 23:34:27+00:00,byhasanyilmaz,Sabaha kadar piyasa bozmazsa 1485 civarinda aç...
2,2021-02-28 23:01:19+00:00,Ayhanmg34,ağalar paşalar kendilerine baksınlar sümük old...
3,2021-02-28 22:58:40+00:00,Ayhanmg34,#bist100 #BIST30 #bist yarın yön belli demeyi ...
4,2021-02-28 22:41:14+00:00,sherloc11872501,Abd 10 yillik tahvillerinde yon asagi #bist #b...
...,...,...,...
43427,2021-11-01 02:02:04+00:00,Haber34I,Mohammed Alsaloussi'den ayrılan Şeyma Subaşı’n...
43428,2021-11-01 01:38:18+00:00,OZGUR_KUS31,@zeynepxaktas #bist100 #kordsa birgün herkes k...
43429,2021-11-01 00:48:52+00:00,MR_Guclu_,#snpam #XU100 #BIST30 #bist #bist100 \nAna tre...
43430,2021-11-01 00:32:27+00:00,kuzeyborsa,"Haftalık Takip Listem=01.11.2021=\n#VAKKO =9,3..."


In [7]:
data_p = data.Tweet.apply(lambda x: tokenizer(x, truncation=True, padding='max_length', max_length=256))
data_p

0        [input_ids, attention_mask]
1        [input_ids, attention_mask]
2        [input_ids, attention_mask]
3        [input_ids, attention_mask]
4        [input_ids, attention_mask]
                    ...             
43427    [input_ids, attention_mask]
43428    [input_ids, attention_mask]
43429    [input_ids, attention_mask]
43430    [input_ids, attention_mask]
43431    [input_ids, attention_mask]
Name: Tweet, Length: 276449, dtype: object

In [8]:
data['Tweet'] = data_p
data_p = 0

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)
classifier = Trainer(model)

In [10]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings['Tweet']

    def __getitem__(self, idx):
        item = {"input_ids": torch.tensor(self.encodings.iloc[idx]["input_ids"])}
        item['attention_mask'] = torch.tensor(self.encodings.iloc[idx]["attention_mask"])
        return item

    def __len__(self):
        return len(self.encodings)

dataset = MyDataset(data)

In [11]:
test_preds_raw, test_labels , _ = classifier.predict(dataset)

***** Running Prediction *****
  Num examples = 276449
  Batch size = 8


In [12]:
final_data = pd.concat([data['Date'].reset_index(drop=True), pd.DataFrame(np.argmax(test_preds_raw, axis=-1), columns=['Sentiment'])], axis=1)
final_data

Unnamed: 0,Date,Sentiment
0,2021-02-28 23:38:10+00:00,0
1,2021-02-28 23:34:27+00:00,1
2,2021-02-28 23:01:19+00:00,0
3,2021-02-28 22:58:40+00:00,0
4,2021-02-28 22:41:14+00:00,1
...,...,...
276444,2021-11-01 02:02:04+00:00,0
276445,2021-11-01 01:38:18+00:00,1
276446,2021-11-01 00:48:52+00:00,1
276447,2021-11-01 00:32:27+00:00,0


In [13]:
final_data.to_csv('2021.csv')