In [None]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import BertTokenizer, TFBertForSequenceClassification

In [None]:
# Download the NLTK resources used by the preprocessing step (run this once per environment).
nltk.download('stopwords')  # stop-word list read via stopwords.words('english')
nltk.download('wordnet')  # data behind WordNetLemmatizer
nltk.download('omw-1.4')  # presumably required by the WordNet lemmatizer on some NLTK versions -- TODO confirm

## Task 1: Data Collection & Preprocessing

In [None]:
def load_and_preprocess_data(file_path):
    """Load a review CSV, clean the review text, and encode the sentiment labels.

    Args:
        file_path: Path to a CSV file containing 'review_text' and 'sentiment'
            columns (adjust the column names below if your dataset differs).

    Returns:
        A DataFrame with two columns, or None when the file does not exist:
          * 'cleaned_text': lower-cased text with punctuation and English stop
            words removed and the remaining words lemmatized.
          * 'sentiment_encoded': int label, 1 = positive, 0 = neutral,
            -1 = negative. Labels are matched case-insensitively; anything
            unrecognized is reported and encoded as neutral (0).

    Raises:
        KeyError: If the CSV lacks the 'review_text' or 'sentiment' column.
    """
    try:
        raw_df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None

    # Rows missing either field are dropped. The explicit copy makes the column
    # assignments below operate on an independent frame instead of a slice.
    df = raw_df[['review_text', 'sentiment']].dropna().copy()

    # Text cleaning
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def clean_text(text):
        # str() guards against numeric or otherwise non-string cells.
        text = str(text).lower()
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        return ' '.join(lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words)

    df['cleaned_text'] = df['review_text'].apply(clean_text)

    # Normalize case/whitespace first so values such as "Positive" or " negative "
    # are not silently folded into the neutral class.
    sentiment_mapping = {'positive': 1, 'neutral': 0, 'negative': -1}
    normalized_labels = df['sentiment'].astype(str).str.strip().str.lower()
    encoded = normalized_labels.map(sentiment_mapping)

    unmapped_count = int(encoded.isna().sum())
    if unmapped_count:
        print(f"Warning: {unmapped_count} row(s) had an unrecognized sentiment label and were encoded as neutral (0)")

    # Unknown labels fall back to neutral; cast to int so the labels are always
    # integral regardless of whether any value was unmapped.
    df['sentiment_encoded'] = encoded.fillna(0).astype(int)

    return df[['cleaned_text', 'sentiment_encoded']]

## Task 2: Sentiment Analysis
### Sentiment Analysis - Traditional Models

In [None]:
def train_and_evaluate_traditional(df, model_type='logistic_regression'):
    """Train a TF-IDF based classifier and report its performance on a held-out split.

    Args:
        df: DataFrame with a 'cleaned_text' column and a 'sentiment_encoded'
            column holding the labels -1 (negative), 0 (neutral), 1 (positive).
        model_type: 'logistic_regression' or 'naive_bayes'.

    Returns:
        None. Prints accuracy, weighted F1 and the confusion matrix, and shows
        the confusion matrix as a heatmap.

    Raises:
        ValueError: If model_type is not one of the supported names.
    """
    # Validate up front so a typo fails before any splitting or vectorizing work.
    if model_type not in ('logistic_regression', 'naive_bayes'):
        raise ValueError(f"Invalid model_type: {model_type}. Choose 'logistic_regression' or 'naive_bayes'.")

    X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['sentiment_encoded'], test_size=0.2, random_state=42)

    # Vectorization using TF-IDF; the vocabulary is fitted on the training split only.
    vectorizer = TfidfVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    if model_type == 'logistic_regression':
        model = LogisticRegression(random_state=42)
    else:
        model = MultinomialNB()

    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # Pin the class order so the matrix is always 3x3 and lines up with the tick
    # labels below, even when a class is absent from the test split.
    class_ids = [-1, 0, 1]
    class_names = ['Negative', 'Neutral', 'Positive']
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)

    print(f"--- {model_type.capitalize()} Model Evaluation ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-score (weighted): {f1:.4f}")
    print("Confusion Matrix:")
    print(cm)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'{model_type.capitalize()} Confusion Matrix')
    plt.show()

### Sentiment Analysis - Deep Learning Models (LSTM)

In [None]:
def train_and_evaluate_lstm(df):
    """Train a small Embedding -> LSTM -> softmax classifier and report test metrics.

    Args:
        df: DataFrame with a 'cleaned_text' column and a 'sentiment_encoded'
            column holding the labels -1 (negative), 0 (neutral), 1 (positive).

    Returns:
        None. Prints the model summary and evaluation metrics, and shows the
        confusion-matrix heatmap plus the accuracy/loss training curves.
    """
    X_train_text, X_test_text, y_train, y_test = train_test_split(df['cleaned_text'], df['sentiment_encoded'], test_size=0.2, random_state=42)

    # Labels are shifted from {-1, 0, 1} to {0, 1, 2} because
    # sparse_categorical_crossentropy expects class indices starting at 0.
    y_train_idx = np.asarray(y_train + 1, dtype='int32')
    y_test_idx = np.asarray(y_test + 1, dtype='int32')

    # Tokenization -- the vocabulary is fitted on the training texts only.
    vocab_size = 10000 # Adjust as needed; shared by the tokenizer and the embedding layer
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<unk>")
    tokenizer.fit_on_texts(X_train_text)

    X_train_seq = tokenizer.texts_to_sequences(X_train_text)
    X_test_seq = tokenizer.texts_to_sequences(X_test_text)

    # Padding
    maxlen = 100 # Adjust maxlen as needed
    X_train_padded = pad_sequences(X_train_seq, maxlen=maxlen, padding='post', truncating='post')
    X_test_padded = pad_sequences(X_test_seq, maxlen=maxlen, padding='post', truncating='post')

    # Model Building
    embedding_dim = 16 # Adjust embedding dimension as needed
    lstm_units = 32 # Adjust LSTM units as needed

    model = Sequential([
        # input_dim must match the tokenizer's num_words so every emitted index is valid.
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
        LSTM(units=lstm_units),
        Dense(units=3, activation='softmax') # 3 output units for negative, neutral, positive
    ])

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    print("--- LSTM Model Summary ---")
    model.summary()

    # Training -- note the held-out split doubles as the validation set here.
    epochs = 5 # Adjust number of epochs as needed
    batch_size = 32 # Adjust batch size as needed
    history = model.fit(X_train_padded, y_train_idx,
                        epochs=epochs, batch_size=batch_size,
                        validation_data=(X_test_padded, y_test_idx))

    # Evaluation
    y_pred_probs = model.predict(X_test_padded)
    y_pred = np.argmax(y_pred_probs, axis=1) - 1 # Convert back to original labels

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # Pin the class order so the matrix is always 3x3 and lines up with the tick
    # labels below, even when a class is absent from the test split.
    class_ids = [-1, 0, 1]
    class_names = ['Negative', 'Neutral', 'Positive']
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)

    print("\n--- LSTM Model Evaluation ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-score (weighted): {f1:.4f}")
    print("Confusion Matrix:")
    print(cm)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('LSTM Confusion Matrix')
    plt.show()

    # Plot training history
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.legend()
    plt.title('LSTM Accuracy')

    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.legend()
    plt.title('LSTM Loss')
    plt.tight_layout()
    plt.show()


### Sentiment Analysis - Deep Learning Models (BERT)

In [None]:
def train_and_evaluate_bert(df, model_name='bert-base-uncased'):
    """Fine-tune a pretrained BERT classifier with a custom training loop and evaluate it.

    Args:
        df: DataFrame with a 'cleaned_text' column and a 'sentiment_encoded'
            column holding the labels -1 (negative), 0 (neutral), 1 (positive).
        model_name: Name or path passed to `from_pretrained` for both the
            tokenizer and the sequence-classification model.

    Returns:
        None. Prints per-epoch training/evaluation accuracy and the final
        metrics, and shows the confusion matrix as a heatmap.
    """
    X_train_text, X_test_text, y_train, y_test = train_test_split(df['cleaned_text'], df['sentiment_encoded'], test_size=0.2, random_state=42)

    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=3) # 3 labels: negative, neutral, positive

    train_encodings = tokenizer(list(X_train_text), truncation=True, padding=True)
    test_encodings = tokenizer(list(X_test_text), truncation=True, padding=True)

    # Labels are shifted from {-1, 0, 1} to {0, 1, 2} (class indices must start
    # at 0) and stored as integers rather than a possibly float Series.
    train_labels = np.asarray(y_train + 1, dtype='int32')
    test_labels = np.asarray(y_test + 1, dtype='int32')

    batch_size = 8 # Adjust batch size
    train_dataset = tf.data.Dataset.from_tensor_slices((
        dict(train_encodings),
        train_labels
    )).shuffle(len(X_train_text)).batch(batch_size)

    test_dataset = tf.data.Dataset.from_tensor_slices((
        dict(test_encodings),
        test_labels
    )).batch(batch_size)

    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # One accuracy metric is reused for the training and evaluation passes; it is
    # reset after each pass so the two numbers do not mix.
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    def train_step(batch):
        features, labels = batch[0], batch[1]
        with tf.GradientTape() as tape:
            # training=True enables dropout; a bare model(...) call runs in
            # inference mode, which silently disables it during fine-tuning.
            outputs = model(features, training=True)
            loss = loss_fn(labels, outputs.logits)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        metric.update_state(labels, outputs.logits)
        return loss

    def evaluate_step(batch):
        features, labels = batch[0], batch[1]
        outputs = model(features, training=False)
        loss = loss_fn(labels, outputs.logits)
        metric.update_state(labels, outputs.logits)
        return loss

    epochs = 2 # Adjust number of epochs
    print("--- BERT Model Training ---")
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}")
        for step, batch in enumerate(train_dataset):
            loss = train_step(batch)
            if step % 100 == 0:
                print(f"  Step {step}, Loss: {loss.numpy():.4f}, Accuracy: {metric.result().numpy():.4f}")
        print(f"  Epoch {epoch + 1} Training Accuracy: {metric.result().numpy():.4f}")
        # reset_state() replaces the deprecated reset_states() spelling.
        metric.reset_state()

        for batch in test_dataset:
            loss = evaluate_step(batch)
        print(f"  Epoch {epoch + 1} Evaluation Accuracy: {metric.result().numpy():.4f}")
        metric.reset_state()

    # Evaluation
    y_pred_logits = model.predict(test_dataset).logits
    y_pred = np.argmax(y_pred_logits, axis=1) - 1 # Convert back to original labels

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # Pin the class order so the matrix is always 3x3 and lines up with the tick
    # labels below, even when a class is absent from the test split.
    class_ids = [-1, 0, 1]
    class_names = ['Negative', 'Neutral', 'Positive']
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)

    print("\n--- BERT Model Evaluation ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-score (weighted): {f1:.4f}")
    print("Confusion Matrix:")
    print(cm)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('BERT Confusion Matrix')
    plt.show()

if __name__ == "__main__":
    # Point this at your own dataset; 'your_dataset.csv' is only a placeholder.
    file_path = 'your_dataset.csv'
    processed_df = load_and_preprocess_data(file_path)

    # The loader returns None when the file is missing, in which case nothing is trained.
    if processed_df is not None:
        print("--- Processed Data Sample ---")
        print(processed_df.head())

        # Classical baselines first, in the same order as before.
        for traditional_model in ('logistic_regression', 'naive_bayes'):
            train_and_evaluate_traditional(processed_df, model_type=traditional_model)

        # Then the two deep-learning models: LSTM followed by BERT.
        train_and_evaluate_lstm(processed_df)
        train_and_evaluate_bert(processed_df)