In [8]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# sentiment analysis
nltk.download('vader_lexicon')

# load data
data = pd.read_csv('./archive/abcnews-date-text.csv')
data = data.sample(500)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\lmh23\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [15]:
# sentiment analysis
sid = SentimentIntensityAnalyzer()
data['sentiment'] = data['headline_text'].apply(lambda x: sid.polarity_scores(x))
data['compound'] = data['sentiment'].apply(lambda x: x['compound'])
data['comp_score'] = data['compound'].apply(lambda x: 'pos' if x >= 0 else 'neg')

print(data.head())


        publish_date                                      headline_text  \
366461      20080227              man in court over scissors stab death   
532006      20100416                              interview kevin moore   
985688      20151015  ashleigh barty cricket big bash tennis brisban...   
69561       20040130             residents evacuated after tas flooding   
722310      20120905                                the culture quarter   

                                                sentiment  compound comp_score  
366461  {'neg': 0.606, 'neu': 0.394, 'pos': 0.0, 'comp...   -0.8271        neg  
532006  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...    0.0000        pos  
985688  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...    0.0000        pos  
69561   {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...    0.0000        pos  
722310  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...    0.0000        pos  


In [18]:
# Load pre-trained model and tokenizer
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

# Create a pipeline for sentiment analysis
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

# Example text
text = "I love using transformers. They make life so much easier!"


# Perform sentiment analysis
result = classifier(" ".join(data['headline_text']))
print(result)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (898 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The expanded size of the tensor (898) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 898].  Tensor sizes: [1, 514]

In [44]:
import pandas as pd
import nltk
from transformers import RobertaTokenizer, RobertaForSequenceClassification, pipeline

nltk.download('vader_lexicon')

# Load pre-trained model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

# Create a pipeline for sentiment analysis
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

# Sample data loading and preprocessing
data = pd.read_csv('./archive/abcnews-date-text.csv')
data = data.sample(10000)  # Smaller sample for demonstration

# Initialize counters
positive, neutral, negative = 0, 0, 0

# Define thresholds for sentiment classification
positive_threshold = 0.6
negative_threshold = 0.4

# Analyze sentiments and categorize
for text in data['headline_text']:
    if len(text.split()) > 50:  # Rough check to avoid token overflow
        text = " ".join(text.split()[:50])  # Trim to safe length
    result = classifier(text)[0]
    score = result['score']
    if score > positive_threshold:  # Assuming LABEL_1 is positive
        positive += 1
    elif score < negative_threshold:  # Assuming LABEL_0 is negative
        negative += 1
    else:
        neutral += 1

# Print results
total = positive + neutral + negative
print(f"Positive: {positive / total:.2f}")
print(f"Neutral: {neutral / total:.2f}")
print(f"Negative: {negative / total:.2f}")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\lmh23\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Positive: 0.00
Neutral: 1.00
Negative: 0.00


In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
import numpy as np

In [34]:
# Example setup
tokenizer = Tokenizer(num_words=5000)
texts = ["Sample text data for training"]
tokenizer.fit_on_texts(texts)

# Example of loading GloVe embeddings
embedding_matrix = np.zeros((5000, 100))  # Assuming 100-dim GloVe vectors

model = Sequential()
# model.add(Embedding(input_dim=5000, output_dim=100, weights=[embedding_matrix], trainable=False))
model.add(LSTM(50))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [50]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Load the data
data = pd.read_csv('./archive/abcnews-date-text.csv')
data = data.sample(100000)  # Smaller sample for demonstration


In [51]:
# Download VADER lexicons
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Define a function to get sentiment category
def get_sentiment_category(score):
    if score['compound'] > 0.05:
        return 'positive'
    elif score['compound'] < -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply sentiment analysis
data['sentiment'] = data['headline_text'].apply(lambda text: sia.polarity_scores(text))
data['sentiment_category'] = data['sentiment'].apply(get_sentiment_category)

# Calculate percentage of each sentiment category
sentiment_counts = data['sentiment_category'].value_counts(normalize=True) * 100

# Display the sentiment scores
print(f"Positive: {sentiment_counts.get('positive', 0):.2f}%")
print(f"Neutral: {sentiment_counts.get('neutral', 0):.2f}%")
print(f"Negative: {sentiment_counts.get('negative', 0):.2f}%")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\lmh23\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Positive: 20.38%
Neutral: 45.34%
Negative: 34.28%
