In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, TFBertForSequenceClassification, pipeline
import openai


##Load dataset

In [None]:
# Load the labeled dataset from a CSV file in the working directory.
# Later cells read its `message` and `sentiment` columns.
data = pd.read_csv('dataset_labeled.csv')

In [None]:
# Display the loaded frame for a quick visual check.
data

##Preprocessing

In [None]:
# Remove newline symbols, replace http/https links with the word "link", and if the text is just a link replace it with None
# Clean every message in three passes.
# Newlines are turned into spaces (not deleted) so the last word of one line is not
# glued to the first word of the next; the split()/join() below collapses any
# repeated whitespace this leaves behind.
data['message_processed'] = (
    data.message
    .map(lambda x: x.replace('\n', ' ').strip())
    # Every whitespace-separated token starting with "http" becomes the placeholder "link".
    .map(lambda x: ' '.join('link' if word.startswith('http') else word for word in x.split()))
    # A message that consisted solely of one link carries no text: mark it as missing.
    .map(lambda x: None if x == 'link' else x)
)

In [None]:
# Word count of each cleaned message; rows whose text was discarded (None) count as zero words.
data['message_length'] = data.message_processed.map(
    lambda text: len(text.split()) if text is not None else 0
)

In [None]:
# Average message length (in words) per sentiment class.
data.groupby('sentiment').agg({'message_length':'mean'})

In [None]:
# Summary statistics of the word counts over all messages.
data['message_length'].describe()

In [None]:
# Inspect very short messages: two words or fewer, which includes the
# link-only rows that were given length 0.
data[data.message_length <= 2]

In [None]:
!pip install greek_stemmer

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from greek_stemmer import GreekStemmer  # External library for stemming Greek words
from spacy.lang.el import Greek

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Initialize Greek NLP tools
nlp = Greek()

# Load Greek stopwords
greek_stopwords = set(stopwords.words('greek'))
# Keep the negation particle "δεν" in the text (presumably because negation matters
# for sentiment). discard() is used instead of remove() so the cell does not fail
# with KeyError if the installed stopword list does not contain the word.
greek_stopwords.discard("δεν")

def preprocess_greek_text(text):
    """
    Preprocess Greek text for the models that do not bring their own tokenizer.

    Steps: lowercasing, tokenization, punctuation removal, stopword removal.

    Args:
        text (str): Input Greek text.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # 1. Convert to lowercase
    text = text.lower()

    # 2. Tokenize the text; use the Greek Punkt model for sentence splitting
    #    instead of the default English one.
    words = word_tokenize(text, language='greek')

    # 3. Remove Greek stopwords and tokens without any letter or digit
    #    (pure punctuation such as ".", "«" or "...").
    words = [
        word for word in words
        if word not in greek_stopwords and any(ch.isalnum() for ch in word)
    ]

    # Join words back into a single string
    return " ".join(words)



In [None]:
# Build the stopword-filtered text for the models that do not tokenize for themselves.
# Rows already marked as missing (None) are passed through unchanged.
data['message_processed_for_nlp'] = data['message_processed'].apply(
    lambda message: preprocess_greek_text(message) if message is not None else None
)

In [None]:
# Drop the rows whose cleaned text is missing and renumber the remaining rows.
# drop=True discards the old index instead of inserting it as an "index" column;
# without it, re-running this cell fails because that column already exists.
data = data.dropna(subset=['message_processed']).reset_index(drop=True)

###Splitting dataset

In [None]:
# Stratified 80/20 split on the sentiment column with a fixed seed.
data_train, data_test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['sentiment'],
)

# Two text views per split: the cleaned text for the transformer / LLM models and
# the stopword-filtered text for the TF-IDF and LSTM models.
X_train_transformer, X_test_transformer = data_train['message_processed'], data_test['message_processed']
X_train_ml, X_test_ml = data_train['message_processed_for_nlp'], data_test['message_processed_for_nlp']

# Targets
y_train, y_test = data_train['sentiment'], data_test['sentiment']

#Models

##Baseline: SVM

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [None]:
# TF-IDF features capped at 5000 terms; the vocabulary is learned on the training split only.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train_ml)
X_test_vec = vectorizer.transform(X_test_ml)

# Hyperparameter candidates for the SVM
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],          # only relevant for the RBF kernel
    class_weight=[None, 'balanced'],  # optionally compensate for class imbalance
)

# Exhaustive search over the grid, scored by accuracy with 5-fold cross-validation
svm_model = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
svm_model.fit(X_train_vec, y_train)

In [None]:
# Predict the test split with the fitted grid search object
svm_preds = svm_model.predict(X_test_vec)

# Report the winning hyperparameter combination and the accuracy on the test split
print("Best SVM Parameters:", svm_model.best_params_)
print("SVM Accuracy:", accuracy_score(y_test, svm_preds))

In [None]:
# Per-class precision, recall and F1 of the SVM on the test split
print("\nClassification Report:")
print(classification_report(y_test, svm_preds))

In [None]:
# Confusion matrix of the SVM predictions on the test split
cm = confusion_matrix(y_test, svm_preds)

# Heatmap with the class names of the fitted SVM on both axes
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=svm_model.classes_,
    yticklabels=svm_model.classes_,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

In [None]:
# Tabulate every hyperparameter combination evaluated by the grid search
cv_results = pd.DataFrame(svm_model.cv_results_)

# Mean cross-validation accuracy against C, one line per kernel
plt.figure(figsize=(10, 6))
sns.lineplot(
    data=cv_results,
    x='param_C',
    y='mean_test_score',
    hue='param_kernel',
    marker='o',
)
plt.title('Cross-Validation Accuracy for SVM Hyperparameters')
plt.xlabel('Regularization Parameter (C)')
plt.ylabel('Mean CV Accuracy')
plt.legend(title='Kernel')
plt.grid(True)
plt.show()

In [None]:
# Spread of the mean CV accuracy for each gamma setting, split by kernel.
# Skipped when the grid search results carry no gamma column.
if 'param_gamma' in cv_results.columns:
    plt.figure(figsize=(10, 6))
    sns.boxplot(
        data=cv_results,
        x='param_gamma',
        y='mean_test_score',
        hue='param_kernel',
    )
    plt.title('Effect of Gamma on Cross-Validation Accuracy')
    plt.xlabel('Gamma (RBF Kernel)')
    plt.ylabel('Mean CV Accuracy')
    plt.grid(True)
    plt.show()


In [None]:
# Additional CV Insights: Gamma
# NOTE(review): this cell repeats the preceding cell verbatim and draws the same
# figure a second time; one of the two can be removed.
if 'param_gamma' in cv_results.columns:
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=cv_results, x='param_gamma', y='mean_test_score', hue='param_kernel')
    plt.xlabel('Gamma (RBF Kernel)')
    plt.ylabel('Mean CV Accuracy')
    plt.title('Effect of Gamma on Cross-Validation Accuracy')
    plt.grid(True)
    plt.show()


##1st Model: LSTM

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Enhanced LSTM Model
def build_advanced_lstm_model(vocab_size=5000, embedding_dim=128, max_len=120, num_classes=3):
    """
    Build and compile a bidirectional-LSTM text classifier.

    The defaults reproduce the original fixed architecture. `vocab_size` and
    `max_len` have to agree with the tokenizer's `num_words` and the `maxlen`
    used when padding the input sequences.

    Args:
        vocab_size (int): Size of the embedding vocabulary (Embedding `input_dim`).
        embedding_dim (int): Dimension of the learned word embeddings.
        max_len (int): Length of the padded input sequences.
        num_classes (int): Number of output classes of the softmax layer.

    Returns:
        A compiled Keras `Sequential` model expecting integer class labels
        (sparse categorical cross-entropy loss).
    """
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
        Bidirectional(LSTM(64, return_sequences=False)),  # Only the final state of each direction is kept
        Dropout(0.3),  # Add dropout for regularization
        Dense(32, activation='relu'),  # Fully connected layer
        Dropout(0.3),  # Additional dropout for regularization
        Dense(num_classes, activation='softmax')  # One probability per class
    ])
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    return model

# Define Callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitors validation loss
    patience=5,          # Stop after 5 epochs without improvement
    restore_best_weights=True  # Restore the best weights
)

# Keep only the best model (lowest validation loss) on disk
model_checkpoint = ModelCheckpoint(
    filepath='best_lstm_model.keras',  # Save the best model to this file
    monitor='val_loss',
    save_best_only=True
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score
import numpy as np

In [None]:
label_encoder.get_params()

In [None]:
le_name_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

In [None]:
label_encoder.classes_

In [None]:
le_name_mapping

In [None]:
# Convert string labels to integer labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Tokenization and padding
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train_ml)
X_train_seq = tokenizer.texts_to_sequences(X_train_ml)
X_test_seq = tokenizer.texts_to_sequences(X_test_ml)
X_train_pad = pad_sequences(X_train_seq, maxlen=120)
X_test_pad = pad_sequences(X_test_seq, maxlen=120)

In [None]:
# Build and Train the Model
advanced_lstm_model = build_advanced_lstm_model()
history = advanced_lstm_model.fit(
    X_train_pad,
    y_train,
    epochs=100,  # Maximum number of epochs
    batch_size=64,  # Larger batch size for faster training
    validation_split=0.3,  # Fraction of the training data held out for validation
    callbacks=[early_stopping, model_checkpoint]  # Early stopping and checkpointing
)

In [None]:
# NOTE(review): plot_training_history is defined in a later cell (Greek BERT section);
# on a fresh kernel this call raises NameError unless that cell has been run first.
plot_training_history(history)

In [None]:
# Predicted class index for each test message: argmax over the softmax outputs
y_lstm_pred = np.argmax(advanced_lstm_model.predict(X_test_pad),axis = 1)

In [None]:
# Confusion matrix of the LSTM predictions on the test split.
# labels= pins the matrix to one row/column per encoded class, so it always
# lines up with the tick labels even if a class is never predicted.
cm = confusion_matrix(y_test, y_lstm_pred, labels=np.arange(len(label_encoder.classes_)))

# Confusion Matrix Visualization.
# The class names come from the label encoder that produced the integer targets,
# so this cell does not depend on the SVM grid search having been fitted.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Per-class precision, recall and F1 of the LSTM on the test split
# (classes appear as the integer codes assigned by the label encoder)
print("\nClassification Report:")
print(classification_report(y_test, y_lstm_pred))

##Greek BERT

In [None]:
#GREEKBERT
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from transformers import TFBertForSequenceClassification, BertTokenizer
import tensorflow as tf

# Define the fine-tuned BERT model
def build_finetuned_bert(model_name, max_length=100, num_labels=3):
    """
    Build a Keras classifier on top of a pretrained BERT encoder.

    The pooled BERT output is fed through a 128-unit ReLU layer and a softmax
    layer. The defaults reproduce the original fixed configuration.

    Args:
        model_name (str): Model identifier passed to `from_pretrained`.
        max_length (int): Fixed length of the `input_ids` / `attention_mask`
            inputs; data fed to the model must be padded or truncated to
            exactly this length.
        num_labels (int): Number of output classes.

    Returns:
        tuple: (compiled Keras model, the matching `BertTokenizer`).
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    bert_model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    # Define inputs
    input_ids = Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
    attention_mask = Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

    # Wrap BERT in Lambda layer with explicit output shape
    # NOTE(review): Keras does not track variables used inside a Lambda layer, so the
    # BERT weights are presumably not updated during training — confirm before
    # describing this model as fine-tuned.
    def bert_fn(inputs):
        return bert_model.bert(
            input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
        )[1]  # [1] extracts the pooled output

    # The pooled output has the encoder's hidden size (768 for BERT-base models).
    bert_output = Lambda(bert_fn, output_shape=(bert_model.config.hidden_size,))(
        {"input_ids": input_ids, "attention_mask": attention_mask}
    )

    # Classification head on top of the pooled output
    dense_output = Dense(128, activation="relu")(bert_output)
    final_output = Dense(num_labels, activation="softmax")(dense_output)

    # Define the complete model
    model = Model(inputs=[input_ids, attention_mask], outputs=final_output)

    # Freeze BERT encoder layers (optional, depending on your use case)
    for layer in bert_model.bert.encoder.layer:
        layer.trainable = False

    # Compile the model; the sparse loss expects integer class labels
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )

    return model, tokenizer

# Initialize model and tokenizer
bert_model, bert_tokenizer = build_finetuned_bert('nlpaueb/bert-base-greek-uncased-v1')

# Tokenize data.
# padding="max_length" pads every text to exactly 100 tokens, the fixed input
# width of the model; padding=True would only pad to the longest text present,
# which can be shorter than 100.
train_encodings = bert_tokenizer(
    X_train_transformer.tolist(), truncation=True, padding="max_length", max_length=100, return_tensors="tf"
)
# The test encodings are built from the TEST texts (they were previously built
# from the training texts by mistake).
test_encodings = bert_tokenizer(
    X_test_transformer.tolist(), truncation=True, padding="max_length", max_length=100, return_tensors="tf"
)
# Define the early stopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitors validation loss
    patience=5,          # Number of epochs to wait before stopping
    restore_best_weights=True  # Restore the best weights after stopping
)
# Train the model. y_train must already hold the integer-encoded labels produced
# by the label encoder in the LSTM section.
history_bert = bert_model.fit(
    x={"input_ids": train_encodings["input_ids"], "attention_mask": train_encodings["attention_mask"]},
    y=y_train,
    epochs=500,  # Upper bound; early stopping ends training sooner
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping]
)


In [None]:
import matplotlib.pyplot as plt

def plot_training_history(history):
    """
    Plot the loss and accuracy curves of a Keras training run side by side.

    Args:
        history: Object whose `.history` dict holds the per-epoch lists
            'loss', 'val_loss', 'accuracy' and 'val_accuracy'.
    """
    curves = history.history
    epochs = range(1, len(curves['loss']) + 1)

    # (history key, human-readable name) for the left and right panel
    panels = (('loss', 'Loss'), ('accuracy', 'Accuracy'))

    plt.figure(figsize=(14, 6))
    for position, (key, name) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, curves[key], label=f'Training {name}', color='blue')
        plt.plot(epochs, curves[f'val_{key}'], label=f'Validation {name}', color='orange')
        plt.xlabel('Epochs')
        plt.ylabel(name)
        plt.title(f'Training and Validation {name}')
        plt.legend()

    # Show plots
    plt.tight_layout()
    plt.show()


In [None]:
# Plot the loss and accuracy curves recorded while training the BERT classifier
plot_training_history(history_bert)

In [None]:
# Encode the test texts for prediction.
# padding="max_length" pads every text to exactly 100 tokens, the fixed input
# width of the BERT classifier; padding=True would only pad to the longest test text.
test_encodings = bert_tokenizer(
    X_test_transformer.tolist(), truncation=True, padding="max_length", max_length=100, return_tensors="tf"
)

In [None]:
 bert_pred_train = np.argmax(bert_model.predict(
    x={"input_ids": test_encodings["input_ids"], "attention_mask": test_encodings["attention_mask"]}), axis = 1)

In [None]:
# Confusion matrix of the BERT predictions on the test split.
# labels= pins the matrix to one row/column per encoded class, so it always
# lines up with the tick labels even if a class is never predicted.
cm = confusion_matrix(y_test, bert_pred_train, labels=np.arange(len(label_encoder.classes_)))

# Confusion Matrix Visualization.
# The class names come from the label encoder that produced the integer targets,
# so this cell does not depend on the SVM grid search having been fitted.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Per-class precision, recall and F1 of the BERT classifier on the test split
print("\nClassification Report:")
print(classification_report(y_test, bert_pred_train))

##Zero-shot OpenAI

In [None]:
import openai
from openai import OpenAI

def analyze_greek_sentiment(text: str, api_key=None, model: str = "gpt-4o") -> str:
    """
    Classify the sentiment of a Greek text with an OpenAI chat model.

    Parameters:
        text (str): The Greek text to analyze.
        api_key (str, optional): OpenAI API key. When omitted, the client falls
            back to the OPENAI_API_KEY environment variable, so no key has to
            be written into the notebook.
        model (str): Name of the chat model to query.

    Returns:
        str: The model's answer with surrounding whitespace removed; the prompt
             asks for exactly one of "Positive", "Negative" or "Neutral".
             If anything fails (missing key, network or API error), a string
             starting with "An error occurred: " is returned instead of raising.
    """
    # Define a system prompt to instruct the model about its role.
    system_message = "You are a sentiment analysis assistant specialized in Greek language text."

    # Define the user message that includes the Greek text.
    user_message = (
        f"Analyze the sentiment of the following Greek text. "
        f"Classify it as Positive, Negative, or Neutral and return only the sentiment, nothing more.\n\n"
        f"Text: {text}"
    )

    try:
        # The client is created inside the try block so that configuration
        # problems are reported the same way as request failures.
        client = OpenAI(api_key=api_key)

        # Call the chat completions endpoint with a low temperature for deterministic output.
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ],
            temperature=0  # Lower temperature gives more deterministic results.
        )

        # Strip whitespace so the answer can be matched against the label names.
        content = response.choices[0].message.content
        return (content or "").strip()

    except Exception as e:
        # Best effort: report the failure as text so that mapping this function
        # over a whole Series is not aborted by a single failed request.
        return f"An error occurred: {e}"




In [None]:
gpt_predictions = X_train_transformer.map(lambda x : analyze_greek_sentiment(x) )

In [None]:
gpt_pred_train = gpt_predictions.map({'Negative': 0, 'Neutral': 1, 'Positive': 2}).values

In [None]:
# Per-class precision, recall and F1 of the zero-shot GPT predictions on the training split
print("\nClassification Report:")
print(classification_report(y_train, gpt_pred_train))

In [None]:
# Confusion matrix of the zero-shot GPT predictions on the training split.
# labels= pins the matrix to one row/column per encoded class, so it always
# lines up with the tick labels even if a class is never predicted.
cm = confusion_matrix(y_train, gpt_pred_train, labels=np.arange(len(label_encoder.classes_)))

# Confusion Matrix Visualization.
# The class names come from the label encoder that produced the integer targets,
# so this cell does not depend on the SVM grid search having been fitted.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
gpt_predictions_test = X_test_transformer.map(lambda x : analyze_greek_sentiment(x) )

In [None]:
gpt_pred_test = gpt_predictions_test.map({'Negative': 0, 'Neutral': 1, 'Positive': 2}).values

In [None]:
# Per-class precision, recall and F1 of the zero-shot GPT predictions on the test split
print("\nClassification Report:")
print(classification_report(y_test, gpt_pred_test))

In [None]:
# Confusion matrix of the zero-shot GPT predictions on the test split.
# labels= pins the matrix to one row/column per encoded class, so it always
# lines up with the tick labels even if a class is never predicted.
cm = confusion_matrix(y_test, gpt_pred_test, labels=np.arange(len(label_encoder.classes_)))

# Confusion Matrix Visualization.
# The class names come from the label encoder that produced the integer targets,
# so this cell does not depend on the SVM grid search having been fitted.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Display the installed version of the openai package.
openai.__version__

In [None]:
# Example usage:
if __name__ == "__main__":
    greek_text = "Αυτό το προϊόν είναι εξαιρετικό και με έκανε πολύ ευχαριστημένο!"
    sentiment_result = analyze_greek_sentiment(greek_text)
    print("Sentiment Analysis Result:")
    print(sentiment_result)

In [None]:
sentiment_result.choices[0].message.content