<a href="https://colab.research.google.com/github/AndreasTheodoulou/NLP/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas   as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plot theme to the figures drawn later in the notebook.
sns.set()

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups corpus ('all' subset) and keep only the first
# 100 documents as the working sample for the rest of the notebook.
corpus = fetch_20newsgroups(subset='all')
data = corpus['data'][:100]

In [None]:
from transformers import pipeline, DistilBertTokenizer

# Single source of truth for the checkpoint, so the pipeline's model and the
# standalone tokenizer used for chunking are always loaded from the same place.
MODEL_NAME = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

# Pick the device: first CUDA GPU when PyTorch reports one, otherwise CPU
# (device=-1). A hard-coded device=0 crashes on runtimes without a GPU.
try:
    import torch
except ImportError:
    # PyTorch is not installed (e.g. another backend is in use): keep the
    # original behaviour of requesting GPU 0.
    device = 0
else:
    device = 0 if torch.cuda.is_available() else -1

# Initialize sentiment analysis pipeline
sentiment_model = pipeline(
    "sentiment-analysis",
    model=MODEL_NAME,
    device=device,
)

# Initialize tokenizer (same checkpoint as the model above)
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Maximum tokens for the model (this count includes the special tokens the
# pipeline adds around every input)
max_tokens = 512

def chunk_text(text, max_tokens=512):
    """
    Split ``text`` into consecutive pieces of at most ``max_tokens`` tokens.

    The text is tokenized once with the module-level ``tokenizer`` (without
    special tokens), the resulting token ids are cut into back-to-back windows
    of ``max_tokens`` ids, and every window is decoded back into a string.

    Returns a list of strings, one per window, in document order. A text that
    produces no tokens yields an empty list.

    NOTE(review): re-tokenizing a decoded piece may not reproduce exactly the
    same tokens, so confirm before relying on the length bound downstream.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)

    pieces = []
    for start in range(0, len(token_ids), max_tokens):
        window = token_ids[start:start + max_tokens]
        pieces.append(tokenizer.decode(window, skip_special_tokens=True))
    return pieces

# Calculate one signed sentiment score per document, in [-1, 1]:
# positive values mean POSITIVE, negative values mean NEGATIVE.
direction_mapping = {"POSITIVE": 1, "NEGATIVE": -1}
sentiment_scores = []

# The pipeline wraps every input in [CLS] ... [SEP], so a chunk may carry at
# most max_tokens - 2 content tokens and still fit within max_tokens.
content_tokens = max_tokens - 2

for doc in data:
    # Split document into chunks that fit the model together with its
    # special tokens
    chunks = chunk_text(doc, max_tokens=content_tokens)

    # Weight each chunk by the number of tokens it contributes. np.average
    # normalises the weights itself, so raw counts are enough.
    weights_chunk = [
        len(tokenizer.encode(chunk, add_special_tokens=False))
        for chunk in chunks
    ]

    if not any(weights_chunk):
        # Empty (or token-less) document: there is nothing to score and
        # np.average would raise ZeroDivisionError. Record NaN so the scores
        # stay aligned with `data`.
        sentiment_scores.append(np.nan)
        continue

    # Score all chunks of the document in one call. Token-level truncation
    # replaces the former `chunk[:max_tokens]`, which sliced *characters* and
    # silently discarded most of every chunk; it also guards against a chunk
    # that re-tokenizes to slightly more tokens than it was cut from.
    sentiment_outputs = sentiment_model(
        chunks, truncation=True, max_length=max_tokens
    )

    # Signed confidence per chunk: +score for POSITIVE, -score for NEGATIVE
    sentiment_scores_chunk = [
        output["score"] * direction_mapping[output["label"]]
        for output in sentiment_outputs
    ]

    # Calculate weighted average sentiment score
    weighted_sentiment_score = np.average(sentiment_scores_chunk, weights=weights_chunk)
    sentiment_scores.append(weighted_sentiment_score)

# Combine data into a DataFrame.
# `data` is a plain sequence of document strings, not a DataFrame, so assigning
# a 'Sentiment' key to it fails. Build a real DataFrame with one row per
# document: the raw text and its weighted sentiment score.
sentiment_df = pd.DataFrame({"Text": data, "Sentiment": sentiment_scores})

In [None]:
# Mean of the per-document sentiment scores
sentiment_df["Sentiment"].mean()

In [None]:
# Histogram of the per-document sentiment scores
sentiment_df["Sentiment"].hist()