Transformer 

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# load pretrained model from huggingface
model_name = 'shashanksrinath/News_Sentiment_Analysis'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name) #from_tf=True

# use gpu if no then default cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [3]:
# load dataset
data = pd.read_csv('Cleaned_News_Articles_Final2.csv')
data = data[['headline']]

In [4]:
def preprocess(text):
    inputs = tokenizer.encode_plus(text, truncation=True, padding=True, max_length=512, return_tensors='pt')
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move inputs to CPU
    return inputs

# sentiment function
def get_sentiment(inputs):
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    sentiment = torch.argmax(probs)
    return sentiment.item(), probs.tolist()[0]

# apply tqdm to track progress 
tqdm.pandas()  
data['inputs'] = data['headline'].progress_apply(preprocess)
data['sentiment'], data['probs'] = zip(*data['inputs'].progress_apply(get_sentiment))

# mapping sentiment to labels
sentiment_dict = {0: 'negative', 1: 'positive', 2: 'neutral'}
data['sentiment'] = data['sentiment'].map(sentiment_dict)

# split the probabilities into nega posi neutral
data[['prob_negative', 'prob_positive', 'prob_neutral']] = pd.DataFrame(data['probs'].tolist(), index=data.index)
# data = data.drop(columns=['probs'])

100%|██████████| 41795/41795 [00:20<00:00, 1997.31it/s]
100%|██████████| 41795/41795 [47:19<00:00, 14.72it/s] 


In [5]:
data.to_csv('Pretrained_Predicited_Sentiment.csv', index=False)

In [6]:
print(data['sentiment'].value_counts())

positive    34076
negative     6870
neutral       849
Name: sentiment, dtype: int64


In [8]:
import os
import joblib

# create subfolders
base_folder = '5 Unsupervised Sentiment Analysis/transformer'
models_folder = os.path.join(base_folder, 'models')
tokenizer_folder = os.path.join(base_folder, 'tokenizer')

# double confirm 
os.makedirs(models_folder, exist_ok=True)
os.makedirs(tokenizer_folder, exist_ok=True)

# save model and tokenizer 
joblib.dump(model, os.path.join(models_folder, 'shashanksrinath_News_Sentiment_Analysis.pkl'))    #CHANGE model name and dataset name
print('Model saved')

joblib.dump(tokenizer, os.path.join(tokenizer_folder, 'shashanksrinath_News_Sentiment_Analysis.pkl'))    #CHANGE also
print('Tokenizer saved')

Model saved
Tokenizer saved


In [11]:
import joblib

# load the saved model and tokenizer
model_filename = '5 Unsupervised Sentiment Analysis/transformer/models/shashanksrinath_News_Sentiment_Analysis.pkl'
tokenizer_filename = '5 Unsupervised Sentiment Analysis/transformer/tokenizer/shashanksrinath_News_Sentiment_Analysis.pkl'

model = joblib.load(model_filename)
tokenizer = joblib.load(tokenizer_filename)

# encode the label
label_encoding = {0: 'negative', 1: 'positive', 2: 'neutral'}

# unseen data
# input_text = "McDonald’s shortens breakfast time in Australia as bird flu causes egg shortage"
input_text = "Tesla sales fall again as more automakers crowd electric vehicle market"

# vector the unseen data
transformed_input = tokenizer.transform([input_text])

# predit the unseen data by usig saved model
prediction = model.predict(transformed_input)

# decode the label into its original class
decoded_prediction = label_encoding[prediction[0]]

# print result
print(f"Prediction: {decoded_prediction}")

AttributeError: 'RobertaTokenizerFast' object has no attribute 'transform'

In [12]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Define paths for saving and loading
base_folder = '5 Unsupervised Sentiment Analysis/transformer'
models_folder = os.path.join(base_folder, 'modelss')
tokenizer_folder = os.path.join(base_folder, 'tokenizers')

# Create subfolders if they don't exist
os.makedirs(models_folder, exist_ok=True)
os.makedirs(tokenizer_folder, exist_ok=True)

# Model and tokenizer names
model_name = 'shashanksrinath/News_Sentiment_Analysis'

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Save tokenizer and model
tokenizer.save_pretrained(tokenizer_folder)
model.save_pretrained(models_folder)

# Load tokenizer and model from the saved paths
loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer_folder)
loaded_model = AutoModelForSequenceClassification.from_pretrained(models_folder)

# Example of using loaded tokenizer and model on new data
text = "Your new headline text here."
inputs = loaded_tokenizer.encode_plus(text, truncation=True, padding=True, max_length=512, return_tensors='pt')
outputs = loaded_model(**inputs)
probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
sentiment = torch.argmax(probs)
sentiment_label = {0: 'negative', 1: 'neutral', 2: 'positive'}[sentiment.item()]

print(f"Predicted sentiment: {sentiment_label}")


Predicted sentiment: neutral
