<a href="https://colab.research.google.com/github/Dreadnought73/AI_projects/blob/main/Sentiment_analysis_ALBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis with ALBERT

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForSequenceClassification, AlbertConfig
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy

In [None]:
# Read the finance sentiment CSV, decoding it as latin-1, and preview the first rows.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive.
# NOTE(review): read_csv treats the first row as a header here, and a later cell
# overwrites the column names — confirm the file really has a header row.
csv_path = '/content/drive/MyDrive/Coding_data_files/finance_sentiment_analysis.csv'
finance = pd.read_csv(csv_path, encoding='latin-1')
finance.head()

Rename the columns to `Sentiment` and `Text` for easier access during development.

In [None]:
# Replace the existing column labels with the two names used by the rest of the notebook.
finance.columns = ["Sentiment", "Text"]

In [None]:
# Preview the first five rows again to check the new column labels.
finance.head(n=5)

The neutral category has an overwhelming majority compared to the other categories.

In [None]:
# Count the rows per sentiment value to inspect the class balance.
sentiment_counts = finance['Sentiment'].value_counts()
sentiment_counts

In [None]:
# Convert the sentiment strings into integer class ids and store them in a new column.
label_encoder = LabelEncoder()
sentiment_ids = label_encoder.fit_transform(finance['Sentiment'])
finance['encoded_sentiments'] = sentiment_ids

# Reverse lookup (integer id -> sentiment name), built from the position of each
# entry in label_encoder.classes_; used later to decode predictions.
encoded_to_label = dict(enumerate(label_encoder.classes_))
print(f"Encoded labels mapping: {encoded_to_label}")

In [None]:
# Hold out 20% of the rows as a test set. The split is stratified on the encoded
# label, and random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    finance['Text'],
    finance['encoded_sentiments'],
    test_size=.2,
    random_state=42,
    stratify=finance['encoded_sentiments']
)

# One constant for the checkpoint name so the tokenizer, config and model cannot drift apart.
MODEL_NAME = 'albert-base-v2'
# Size the classification head from the classes the encoder actually found instead of
# hard-coding 3, so the head always matches the integer targets used for training.
NUM_LABELS = len(label_encoder.classes_)

print(f"\nLoading ALBERT tokenizer and model: {MODEL_NAME}")
tokenizer = AlbertTokenizer.from_pretrained(MODEL_NAME)

config = AlbertConfig.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
model = TFAlbertForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

Tokenize every sentence without truncation or padding to find a suitable `max_length` for the final tokenization step.

In [None]:
# Tokenize every sentence with truncation and padding switched off, requesting
# neither attention masks nor token type ids, and without tensor conversion.
all_texts = finance['Text'].tolist()
encoded_inputs = tokenizer(
    all_texts,
    truncation=False,
    padding=False,
    return_attention_mask=False,
    return_token_type_ids=False,
    return_tensors=None,
)

# Number of input ids produced for each sentence.
token_lengths = list(map(len, encoded_inputs['input_ids']))

print(f"Calculated lengths for {len(token_lengths)} sentences.")

Check where most of the sentences end in order to decide what `max_length` to set.

In [None]:
# Summary statistics of the per-sentence token counts.
lengths = np.asarray(token_lengths)

print("\nAnalyzing token length distribution:")
print(f"  Min Length: {lengths.min()}")
print(f"  Max Length: {lengths.max()}")
print(f"  Mean Length: {lengths.mean():.2f}")
print(f"  Median Length: {np.median(lengths)}")


# Upper percentiles show how long a sequence must be to cover most sentences untruncated.
percentiles = [90, 95, 99]
for pct, cutoff in zip(percentiles, np.percentile(lengths, percentiles)):
    print(f"  {pct}th Percentile Length: {cutoff:.2f}")

Tokenize the train and test sets and convert the labels to constant tensors.

In [None]:
# Tokenizer settings shared by both splits: truncation on, padding on,
# max_length of 63, TensorFlow tensors as output.
# NOTE(review): per the tokenizer docs padding=True pads to the longest sequence of
# each call, so the train and test tensors may end up with different widths — confirm
# that is intended.
encode_kwargs = dict(truncation=True, padding=True, max_length=63, return_tensors='tf')

train_encoding = tokenizer(X_train.tolist(), **encode_kwargs)
test_encoding = tokenizer(X_test.tolist(), **encode_kwargs)

# Integer labels as constant tensors, in the same row order as the encodings above.
y_train_tf = tf.constant(y_train.to_numpy())
y_test_tf = tf.constant(y_test.to_numpy())

Building and training the model.

In [None]:
# NOTE(review): this import rebinds the global name `tf` to the TF1 compatibility
# namespace for every cell that runs after this one. It is kept unchanged here —
# presumably a workaround for a Keras-version mismatch with transformers; confirm
# before moving or removing it.
import tensorflow.compat.v1 as tf

# Fine-tuning a pretrained transformer needs a far smaller step size than the 1e-3
# that was used before; values around 2e-5 to 5e-5 are the commonly recommended range.
# A step of 1e-3 tends to wipe out the pretrained weights and collapse predictions
# onto the majority class.
LEARNING_RATE = 3e-5

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# from_logits=True: the model returns raw logits, so the loss applies the softmax itself.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Accuracy measured against the integer-encoded labels, reported as 'accuracy'.
metrics = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# Attach optimizer, loss and metric to the model ahead of model.fit().
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
print("\nFinetuning the ALBERT model...")

BATCH_SIZE = 24

# from_tensor_slices pairs each sentence's tokenized encodings (a dict of tensors such
# as input_ids and attention_mask) with its label: every slice is one sentence's
# encodings plus the matching label. The slices are then grouped into batches.
train_pairs = (dict(train_encoding), y_train_tf)
test_pairs = (dict(test_encoding), y_test_tf)

train_dataset = tf.data.Dataset.from_tensor_slices(train_pairs).batch(BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices(test_pairs).batch(BATCH_SIZE)

# NOTE(review): the training dataset has no .shuffle(), so batches arrive in the same
# order every epoch; the held-out test split also serves as the validation data.
history = model.fit(train_dataset, epochs=5, validation_data=test_dataset)

print("Model finetuning complete.")

In [None]:
import matplotlib.pyplot as plt

# Plot the per-epoch training and validation loss recorded by model.fit().
fig, ax = plt.subplots(figsize=(10, 6))
for history_key, curve_label in (('loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_title('Model Loss During Training')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
# Evaluate on the held-out batches. The results get their own names so that the
# global `loss` object handed to model.compile() is not overwritten with a float.
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
# Hand-written finance sentences used as a quick qualitative check of the trained model.
sample_sentences = [
    "The stock market saw mixed movements this week as investors digested recent economic data. Trading volume remained moderate across major exchanges.",
    "Strong job growth numbers indicate a strengthening economy. This is a positive sign for future economic expansion.",
    "Supply chain disruptions continue to weigh on manufacturing output. This could impact inventory levels and consumer prices.",
    "Inflation figures were released today, showing a slight change from the previous month. Analysts are considering the potential impact on consumer spending.",
    "Several companies reported better-than-expected earnings this quarter. This performance is boosting confidence in those sectors.",
    "Geopolitical tensions are creating uncertainty in global markets. Investors are showing caution due to increased risks.",
    "A key economic indicator showed a contraction in the last quarter. This has raised concerns about the pace of economic recovery.",
    "Market sentiment improved today with a broad rally across technology stocks. Innovation continues to drive growth in the sector.",
    "The central bank announced its decision on interest rates this afternoon. Rates will remain unchanged for the current period.",
]

# NOTE(review): max_length=70 here differs from the 63 used for the train/test
# encodings — confirm this is intentional.
sample_encodings = tokenizer(
    sample_sentences,
    truncation=True,
    padding=True,
    max_length=70,
    return_tensors='tf',
)

predictions = model.predict(dict(sample_encodings))

# For each sentence take the index of the largest logit as the predicted class id.
predicted_logits = predictions.logits
predicted_classes = np.argmax(predicted_logits, axis=1)

# Translate the class ids back into sentiment names.
predicted_sentiments = [encoded_to_label[class_id] for class_id in predicted_classes]

for sentence, sentiment in zip(sample_sentences, predicted_sentiments):
    print(f"Sentence: '{sentence}' -> Predicted Sentiment: {sentiment}")

**Handling the class imbalance would probably yield a better result.**