# In [None]:
# Rewritten sentiment-analysis-model.ipynb

# ---
# ## Sentiment Analysis with Deep Learning and Baseline Models
#
# This notebook explores sentiment analysis using both a deep learning approach with a Convolutional Neural Network (CNN) and a traditional machine learning model (Random Forest) as a baseline.
#
# ### Section 1: Data Loading and Preprocessing
#
# First, we'll load the dataset and preprocess the text data.
# We'll use NLTK for text cleaning, a common and powerful library for NLP tasks.
# ---

# #### Import Libraries
# Inspired by: Course_1_-_Quickstart.ipynb for the basic imports (pandas, numpy, matplotlib)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm.notebook import tqdm

# Download NLTK data
# Fetches the 'stopwords' corpus that preprocess_text() reads through
# stopwords.words('english'). This call runs every time the script executes.
nltk.download('stopwords')


# ---
# #### Task 1: Loading the Data
#
# We'll load the data from the text file into a pandas DataFrame.
# Inspired by: Course_1_-_Quickstart.ipynb for general data loading with pandas.
# ---

def load_sentiment_data(filepath):
    """
    Loads sentiment data from a file into a pandas DataFrame.

    Each non-blank line is expected to look like ``<sentence>@<label>``.
    The line is split on its LAST '@', so a sentence that itself contains
    '@' (an e-mail address, a handle, ...) still yields exactly two fields.

    Args:
        filepath: Path of the text file to read; it is decoded as latin-1.

    Returns:
        A DataFrame with the columns 'text' and 'sentiment', one row per
        non-blank line. A line that contains no '@' at all is kept with the
        whole line as 'text' and a missing 'sentiment'.
    """
    rows = []
    with open(filepath, 'r', encoding='latin-1') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # A blank line carries no sample; keeping it would add a row
                # with empty text and a missing label.
                continue
            text, separator, label = line.rpartition('@')
            if not separator:
                # No '@' on this line: rpartition() puts everything in the
                # last slot, so restore it as the text and leave the label unset.
                text, label = line, None
            rows.append((text, label))
    return pd.DataFrame(rows, columns=['text', 'sentiment'])

# Read the sample sentences into the DataFrame that every later step works on,
# then show a banner followed by the first rows as a quick sanity check.
DATASET_PATH = 'Sentiment-Analysis/Sentences_75Agree_sample.txt'
data_df = load_sentiment_data(DATASET_PATH)
print("Dataset loaded successfully:", data_df.head(), sep="\n")


# ---
# #### Task 2: Text Preprocessing with NLTK
#
# This task is inspired by the techniques shown in `LabText_1.ipynb`.
# We will clean the text by removing punctuation, converting to lowercase, stemming, and removing stopwords.
# ---

# NLTK helpers are created once, on first use, and shared by every
# preprocess_text() call instead of being rebuilt for each row.
_PREPROCESS_RESOURCES = {}

# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _get_preprocess_resources():
    """Returns the shared (stemmer, stop-word set) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['stemmer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """
    Cleans and preprocesses a single text entry.

    Steps: strip punctuation, lowercase, split on runs of non-word
    characters, drop English stop words, and stem what remains.

    Args:
        text: The raw sentence as a string.

    Returns:
        The stemmed, stop-word-free tokens joined by single spaces. The
        result is an empty string when no token survives.
    """
    # Inspired by: LabText_1.ipynb for using NLTK for stemming and stopwords
    stemmer, stop_words = _get_preprocess_resources()

    # Remove punctuation and convert to lowercase
    text = text.translate(_PUNCTUATION_TABLE).lower()
    # Tokenize the text
    tokens = re.split(r'\W+', text)
    # Remove stopwords and perform stemming. re.split() yields '' when the text
    # starts or ends with a separator; those empty tokens are skipped so the
    # output has no stray leading/trailing spaces.
    processed_tokens = [
        stemmer.stem(word)
        for word in tokens
        if word and word not in stop_words
    ]

    return " ".join(processed_tokens)

# Run preprocess_text over every raw sentence and keep the result next to the
# original text in a new 'cleaned_text' column.
print("\nPreprocessing text data...")
raw_sentences = data_df['text']
data_df['cleaned_text'] = raw_sentences.map(preprocess_text)
print("Text preprocessing complete.", data_df.head(), sep="\n")


# ---
# ### Section 2: Exploratory Data Analysis
#
# Let's explore the data to understand the distribution of sentiments and sentence lengths.
# ---

# #### Task 3: Data Visualization
#
# We will visualize the sentiment distribution and the length of the sentences.
# Inspired by: Course_1_-_Quickstart.ipynb for creating visualizations with matplotlib and seaborn.
# ---

# Sentiment Distribution
# One bar per distinct value found in the 'sentiment' column.
sentiment_fig, sentiment_ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='sentiment', data=data_df, ax=sentiment_ax)
sentiment_ax.set_title('Sentiment Distribution')
sentiment_ax.set_xlabel('Sentiment')
sentiment_ax.set_ylabel('Count')
plt.show()

# Sentence Length Distribution
# Length is the number of characters in the cleaned text, not the word count.
data_df['text_length'] = data_df['cleaned_text'].map(len)
length_fig, length_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data_df['text_length'], bins=50, ax=length_ax)
length_ax.set_title('Sentence Length Distribution')
length_ax.set_xlabel('Length of Text')
length_ax.set_ylabel('Frequency')
plt.show()


# ---
# ### Section 3: Building a Sentiment Analysis Model with a CNN
#
# We'll use a 1D Convolutional Neural Network (CNN) for this task.
# This technique is adapted from `Course_4_-_CNNs.ipynb.ipynb`, which uses CNNs for image analysis. Here, we apply the same concept to text data.
# ---

# #### Task 4: Prepare Data for the CNN Model
#
# We need to tokenize the text and convert it into sequences that our model can understand.
# Inspired by: Course_3_-_Neural_Nets.ipynb for data preparation for a Keras model.
# ---

# Label encoding
# Class ids: negative=0, positive=1, neutral=2. The table is explicit so that a
# misspelled or missing label is reported instead of being counted as neutral.
label_to_id = {'negative': 0, 'positive': 1, 'neutral': 2}
is_known_label = data_df['sentiment'].isin(list(label_to_id))
if not is_known_label.all():
    unexpected_labels = data_df.loc[~is_known_label, 'sentiment'].unique().tolist()
    raise ValueError(f"Unexpected sentiment labels: {unexpected_labels!r}")
data_df['sentiment_encoded'] = data_df['sentiment'].map(label_to_id).astype(int)

# Split the data (80% train / 20% test, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    data_df['cleaned_text'],
    data_df['sentiment_encoded'],
    test_size=0.2,
    random_state=42,
)

# Tokenization and Padding
# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
# +1 because the tokenizer's word indices start at 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad (or cut) every sequence to the longest training sequence.
max_seq_length = max(len(seq) for seq in X_train_seq)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_seq_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_seq_length, padding='post')

# One-hot encode the labels
num_classes = len(label_to_id)
y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)


# ---
# #### Task 5: Build and Train the CNN Model
#
# Here we define and train our 1D CNN model.
# The model architecture is inspired by the CNN model in `Course_4_-_CNNs.ipynb.ipynb`.
# ---

def create_cnn_model(vocab_size, max_seq_length):
    """
    Creates a 1D CNN model for sentiment analysis.

    Architecture: Embedding(128) -> Conv1D(128 filters, width 5, ReLU) ->
    GlobalMaxPooling1D -> Dense(64, ReLU) -> Dropout(0.5) -> Dense(3, softmax).
    The model is compiled with the Adam optimizer, categorical cross-entropy
    loss and the accuracy metric.

    Args:
        vocab_size: Number of rows in the embedding table.
        max_seq_length: Length of each padded input sequence.

    Returns:
        The compiled Keras Sequential model.

    Raises:
        ValueError: If max_seq_length is shorter than the convolution window.
    """
    kernel_size = 5
    if max_seq_length < kernel_size:
        # An unpadded convolution cannot slide a window wider than its input.
        raise ValueError(
            f"max_seq_length must be at least {kernel_size}, got {max_seq_length}"
        )

    model = Sequential([
        # An explicit Input layer fixes the sequence length; the Embedding
        # `input_length` argument is deprecated in Keras 3.
        tf.keras.Input(shape=(max_seq_length,), dtype='int32'),
        Embedding(vocab_size, 128),
        # Inspired by: Course_4_-_CNNs.ipynb.ipynb for using Convolutional layers
        Conv1D(128, kernel_size, activation='relu'),
        GlobalMaxPooling1D(),
        # Inspired by: Course_3_-_Neural_Nets.ipynb for Dense and Dropout layers
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(3, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Create and train the model
# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so weight initialisation, dropout and batch shuffling repeat from run
# to run. This matches random_state=42 used for the train/test split and the
# Random Forest.
# NOTE(review): some GPU kernels may still be non-deterministic - confirm if
# bit-exact reproducibility is required.
tf.keras.utils.set_random_seed(42)
cnn_model = create_cnn_model(vocab_size, max_seq_length)
print("\nTraining the CNN model...")
history = cnn_model.fit(
    X_train_padded,
    y_train_cat,
    epochs=5,
    validation_split=0.2,
    batch_size=32,
    verbose=1,
)
print("CNN model training complete.")

# ---
# #### Task 6: Evaluate the CNN Model
# Inspired by: Course_3_-_Neural_Nets.ipynb and Course_2_-_RF_KNN.ipynb for model evaluation techniques.
# ---

# Evaluate on the test set
loss, accuracy = cnn_model.evaluate(X_test_padded, y_test_cat, verbose=0)
print(f'\nCNN Model Test Accuracy: {accuracy:.4f}')

# Predictions: the class with the highest softmax probability per sample
y_pred_probs = cnn_model.predict(X_test_padded)
y_pred_classes = np.argmax(y_pred_probs, axis=1)

# Class ids in encoding order (negative=0, positive=1, neutral=2) with their
# display names. Passing the ids explicitly keeps the report and the matrix at
# three classes even when one class is absent from the test data; without them
# the three target names would not match the classes found.
class_ids = [0, 1, 2]
class_names = ['negative', 'positive', 'neutral']

# Classification Report
print("\nCNN Model Classification Report:")
print(classification_report(y_test, y_pred_classes, labels=class_ids, target_names=class_names))

# Confusion Matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred_classes, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('CNN Model Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# ---
# ### Section 4: Baseline Model with Random Forest
#
# To better understand the performance of our CNN, we will compare it with a Random Forest classifier.
# This entire section is inspired by `Course_2_-_RF_KNN.ipynb`.
# ---

# #### Task 7: Prepare Data and Train the Random Forest Model
# ---

# Inspired by: Course_2_-_RF_KNN.ipynb for using scikit-learn for feature extraction and model training.
# TF-IDF features capped at 1000 terms; the vocabulary is fitted on the
# training texts and then reused unchanged for the test texts.
tfidf_settings = {'max_features': 1000}
vectorizer = TfidfVectorizer(**tfidf_settings)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Random Forest baseline: 100 trees with a fixed seed.
forest_settings = {'n_estimators': 100, 'random_state': 42}
print("\nTraining the Random Forest model...")
rf_model = RandomForestClassifier(**forest_settings).fit(X_train_vec, y_train)
print("Random Forest model training complete.")

# ---
# #### Task 8: Evaluate the Random Forest Model
# Inspired by: Course_2_-_RF_KNN.ipynb for evaluating a scikit-learn model.
# ---
# Predict the class id of every test sentence with the Random Forest baseline
y_pred_rf = rf_model.predict(X_test_vec)

# Class ids in encoding order (negative=0, positive=1, neutral=2) with their
# display names. Passing the ids explicitly keeps the report and the matrix at
# three classes even when one class is absent from the test data; without them
# the three target names would not match the classes found.
class_ids = [0, 1, 2]
class_names = ['negative', 'positive', 'neutral']

print("\nRandom Forest Model Classification Report:")
print(classification_report(y_test, y_pred_rf, labels=class_ids, target_names=class_names))

# Confusion Matrix (rows = actual class, columns = predicted class)
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Greens', xticklabels=class_names, yticklabels=class_names)
plt.title('Random Forest Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
