## BERT Model

In [None]:
# Load the pre-trained tokenizer and sequence-classification model.
# Both are fetched from the same checkpoint name so the tokenizer's vocabulary
# matches the weights it feeds.
# NOTE(review): code elsewhere in this file indexes logits[0] and logits[4],
# which presumes this checkpoint emits at least five classes ordered from most
# negative to most positive — confirm against the model card.
model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

In [None]:
# Function to get logits for each sentence
def get_logits(text):
    """Return the classifier's raw logits for ``text`` as a 1-D NumPy array.

    The text is tokenized with the module-level ``tokenizer`` (truncation and
    padding enabled, PyTorch tensors) and passed through the module-level
    ``model``.  Only the first row of the logits batch is returned, so when
    ``text`` is a batch of several strings the remaining rows are dropped.

    Args:
        text: The input to classify; forwarded unchanged to ``tokenizer``.

    Returns:
        ``outputs.logits[0]`` converted to a NumPy array (one value per class).
    """
    # Local import keeps this cell self-contained; torch is necessarily
    # installed because the tokenizer is asked for 'pt' tensors.
    import torch

    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Pure inference: disabling autograd avoids building (and then throwing
    # away) a computation graph on every call, saving memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    # Tensors produced under no_grad never require grad, so no detach() is needed.
    logits = outputs.logits.numpy()[0]
    return logits

# Score each sentence, then average the scores into one value per article
def calculate_sentiment(sentences):
    """Return the mean sentiment score over ``sentences``.

    Each sentence is scored as ``logits[4] - logits[0]`` using the logits from
    ``get_logits``.  This assumes index 4 is the positive class and index 0
    the negative class.

    Args:
        sentences: Iterable of sentences; each item is passed to ``get_logits``.

    Returns:
        The arithmetic mean of the per-sentence scores, or ``0`` when
        ``sentences`` yields nothing.
    """
    # map() is lazy, so each sentence is scored right after its logits are
    # computed, one sentence at a time.
    scores = [row[4] - row[0] for row in map(get_logits, sentences)]
    if not scores:
        return 0
    return sum(scores) / len(scores)

# Score every row: each 'Sentences' cell is passed to calculate_sentiment and
# the returned average is stored in the 'Sentiment Score' column.
# NOTE(review): assumes each 'Sentences' cell is an iterable of sentence
# strings — confirm against the upstream preprocessing.
# NOTE(review): calculate_sentiment calls the model once per sentence, so this
# step may be slow on large frames.
df['Sentiment Score'] = df['Sentences'].apply(calculate_sentiment)

# Display the first 10 rows with sentiment scores
df.head(10)

In [None]:
# Data split
# Hold out 30% of df first, then halve that hold-out into validation and test
# subsets (nominally a 70/15/15 split). Fixed seeds keep the split reproducible.
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.3)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Report the absolute size of each subset
print(
    f"Training set: {len(train_df)} samples",
    f"Validation set: {len(val_df)} samples",
    f"Test set: {len(test_df)} samples",
    sep="\n",
)

# Report each subset's share of the full dataset
total_samples = len(df)
train_percentage, val_percentage, test_percentage = (
    (len(part) / total_samples) * 100 for part in (train_df, val_df, test_df)
)

print(
    f"Training set: {train_percentage:.2f}%",
    f"Validation set: {val_percentage:.2f}%",
    f"Test set: {test_percentage:.2f}%",
    sep="\n",
)
