## XLM-RoBERTa pre-trained model

See the papers and model card below for more information:
* https://arxiv.org/pdf/2302.13795.pdf
* https://aclanthology.org/2022.lrec-1.27/
* https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment 


### The model

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import torch

# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in *text* with fixed placeholders.

    The text is split on single spaces. A token that starts with ``@`` and
    is longer than one character becomes ``@user``; a token that starts
    with ``http`` becomes ``http``. All other tokens are kept unchanged and
    the tokens are joined back with single spaces.
    """
    def _mask(token):
        # A lone "@" is not treated as a mention.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

# Hugging Face Hub identifier of the checkpoint used for tokenizer, config and weights.
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# Use the first CUDA GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# config.id2label is used by the prediction cells to turn a class index into a label.
config = AutoConfig.from_pretrained(MODEL)

# The prediction cells move the tokenized inputs to `device`, so the model
# weights have to live on the same device or the forward pass fails on GPU.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model = model.to(device)

### Prediction 

In [None]:
import pandas as pd

# Load only the columns that are listed here from the tweet CSV.
df = pd.read_csv(
    'dataset/sentiment_chatgpt.csv',
    usecols=[
        'user_name',
        'user_description',
        'clean_tweets',
        'tweets',
        'label',
        'date',
    ],
)

# Missing-value checks: whether any cell is null, then the null count per column.
# (In a notebook cell only the value of the last expression is displayed.)
df.isna().values.any()
df.isna().sum()

In [None]:
# Parse the timestamps before sorting: read_csv is not asked to parse dates,
# so sorting the raw column would order the rows by the unparsed CSV values
# instead of chronologically. Converting again later is harmless.
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by='date')

In [None]:
# Convert the 'date' column to pandas datetimes so the .dt accessor can be used on it.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Both subsets get a "label" column assigned in the prediction cells, so take
# explicit copies; assigning into a bare selection of `df` triggers pandas'
# SettingWithCopyWarning.
# NOTE(review): the filters look at the month only, not the year — confirm the
# data covers a single December and a single April.
first = df[df.date.dt.month == 12].copy()  # first month (December)
last = df[df.date.dt.month == 4].copy()  # last month (April)

In [None]:
## Prediction of the first month:

sentences = list(first["tweets"])

pred = []
# Perform sentiment analysis for each sentence.
# Inference only, so gradient tracking is switched off for the whole loop.
with torch.no_grad():
    for text in sentences:
        text = preprocess(text)
        # Send the inputs to the device the model weights are actually on, so
        # the forward pass works whether or not the model was moved to a GPU.
        # Truncate to 512 tokens (the XLM-RoBERTa input limit) so an unusually
        # long text cannot overflow the position embeddings.
        encoded_input = tokenizer(
            text, return_tensors='pt', truncation=True, max_length=512
        ).to(model.device)
        # The forward pass returns an output object, not a tensor: it has no
        # .to() method, only its .logits tensor can be moved.
        output = model(**encoded_input)
        # One sentence per call, so take the single row of logits.
        scores = softmax(output.logits[0].cpu().numpy())

        # Get label with highest score
        highest_score_label = config.id2label[int(np.argmax(scores))]
        pred.append(highest_score_label)

first["label"] = pred
# NOTE(review): the input CSV is read from 'dataset/' but this writes to
# 'datasets/' — confirm which directory is intended.
first.to_csv('datasets/sentiment_first.csv')

In [None]:
## Prediction of the last month:

sentences = list(last["tweets"])

pred = []
# Perform sentiment analysis for each sentence.
# Inference only, so gradient tracking is switched off for the whole loop.
with torch.no_grad():
    for text in sentences:
        text = preprocess(text)
        # Send the inputs to the device the model weights are actually on, so
        # the forward pass works whether or not the model was moved to a GPU.
        # Truncate to 512 tokens (the XLM-RoBERTa input limit) so an unusually
        # long text cannot overflow the position embeddings.
        encoded_input = tokenizer(
            text, return_tensors='pt', truncation=True, max_length=512
        ).to(model.device)
        # The forward pass returns an output object, not a tensor: it has no
        # .to() method, only its .logits tensor can be moved.
        output = model(**encoded_input)
        # One sentence per call, so take the single row of logits.
        scores = softmax(output.logits[0].cpu().numpy())

        # Get label with highest score
        highest_score_label = config.id2label[int(np.argmax(scores))]
        pred.append(highest_score_label)

last["label"] = pred
# NOTE(review): the input CSV is read from 'dataset/' but this writes to
# 'datasets/' — confirm which directory is intended.
last.to_csv('datasets/sentiment_last.csv')