### Sentiment analysis with a Twitter-specific pre-trained model ###

In [2]:
import pandas as pd

data_path = "data/twitter-datasets/"
test_path = f"{data_path}test_data.txt"

# Read the test set: every line of the file becomes one entry of
# `test_tweets`, with leading/trailing whitespace stripped.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding.
# NOTE(review): each line is kept whole -- if the file prefixes tweets with an
# "id," field, that prefix is fed to the model as well; confirm the file format.
with open(test_path, "r", encoding="utf-8") as f:
    test_tweets = [line.strip() for line in f]

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from tqdm import tqdm
# Load the binary sentiment classifier (DistilBERT checkpoint named below).
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()  # inference only: make sure dropout layers are disabled

labels = []

# Classify every tweet, one forward pass per tweet.
# Gradients are never needed here, so the whole loop runs under no_grad() to
# avoid building an autograd graph for each forward pass.
with torch.no_grad():
    for tweet in tqdm(test_tweets):
        inputs = tokenizer(tweet, return_tensors="pt", truncation=True, max_length=512)
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = probs.argmax(dim=-1).item()
        # Class index 1 is treated as positive (+1); anything else as negative (-1).
        labels.append(1 if predicted_class == 1 else -1)

# Submission ids are 1-based and follow the order of `test_tweets`.
ids = np.arange(1, len(labels) + 1)

100%|██████████| 10000/10000 [02:28<00:00, 67.42it/s]


In [8]:
from helpers import create_csv_submission

# Hand the ids and predicted labels to the project's submission helper,
# targeting the DistilBERT output file.
submission_path = "data/submission_sentimental_analysis_distilbert.csv"
create_csv_submission(ids, labels, submission_path)



In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from tqdm import tqdm
# Load the Twitter RoBERTa sentiment classifier. Unlike a binary model, its
# output is unpacked below as three classes: (negative, neutral, positive).
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()  # inference only: make sure dropout layers are disabled

labels = []

# Classify every tweet, one forward pass per tweet.
# Gradients are never needed here, so the whole loop runs under no_grad() to
# avoid building an autograd graph for each forward pass.
with torch.no_grad():
    for tweet in tqdm(test_tweets):
        inputs = tokenizer(tweet, return_tensors="pt", truncation=True, max_length=512)
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        negative_prob, _neutral_prob, positive_prob = probs[0].tolist()
        # The target is binary, so the neutral class is ignored: a tweet is
        # positive (+1) only when positive strictly beats negative, else -1.
        labels.append(1 if positive_prob > negative_prob else -1)

# Submission ids are 1-based and follow the order of `test_tweets`.
ids = np.arange(1, len(labels) + 1)

100%|██████████| 10000/10000 [05:05<00:00, 32.74it/s]


In [13]:
from helpers import create_csv_submission

# Hand the ids and predicted labels to the project's submission helper,
# targeting the Twitter-RoBERTa output file.
submission_path = "data/submission_sentimental_analysis_twitter_roberta.csv"
create_csv_submission(ids, labels, submission_path)

