

"""
Text-Based Sentiment Analysis using CNN (Adaptation)
------------------------------------------------------------
This script performs sentiment analysis using CNN on YouTube comments dataset.
1. Load dataset from the provided notebook file
2. Preprocess text data
3. Tokenize and pad sequences
4. Split data into training and testing sets
5. Build and compile a CNN model
6. Train the model with validation
7. Evaluate model performance
8. Test on the structured test set
"""


In [1]:


import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# ----------------------------
# 1. Load and Preprocess Data
# ----------------------------

In [2]:
# Load dataset from the provided notebook file
dataset_path = '/kaggle/input/youtube-comments-dataset/YoutubeCommentsDataSet.csv'
df = pd.read_csv(dataset_path)  # Adjust if the file format is different

# Replace sentiment labels with numerical values
df['Sentiment'] = df['Sentiment'].replace({'negative': 0, 'neutral': 1, 'positive': 2})

# Extract text and labels (assuming 'comment' and 'sentiment' columns exist)
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stop words
    return text

df['cleaned_comment'] = df['Comment'].astype(str).apply(clean_text)
texts = df['cleaned_comment'].tolist()
labels = df['Sentiment'].astype(int).tolist()
# Tokenization
max_vocab_size = 10000
max_seq_length = 100

tokenizer = Tokenizer(num_words=max_vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Padding
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post', truncating='post')

# Convert labels to NumPy array
labels = np.array(labels)

  df['Sentiment'] = df['Sentiment'].replace({'negative': 0, 'neutral': 1, 'positive': 2})


# ----------------------------
# 2. Split Data into Train and Test Sets
# ----------------------------

In [3]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42, stratify=labels)


# ----------------------------
# 3. Build CNN Model
# ----------------------------

In [4]:
model = Sequential([
    Embedding(input_dim=max_vocab_size, output_dim=128, input_length=max_seq_length),
    Conv1D(filters=32, kernel_size=4, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dropout(0.4),
    Dense(3, activation='softmax')  # Multi-class classification
])
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display model summary
model.summary()




# ----------------------------
# 4. Train the Model with Validation
# ----------------------------

In [5]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1, validation_data=(X_test, y_test))

Epoch 1/10
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.6487 - loss: 0.8358 - val_accuracy: 0.7178 - val_loss: 0.6287
Epoch 2/10
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7966 - loss: 0.5078 - val_accuracy: 0.7515 - val_loss: 0.5871
Epoch 3/10
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9136 - loss: 0.2630 - val_accuracy: 0.7482 - val_loss: 0.6989
Epoch 4/10
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9647 - loss: 0.1181 - val_accuracy: 0.7363 - val_loss: 0.8750
Epoch 5/10
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9845 - loss: 0.0598 - val_accuracy: 0.7423 - val_loss: 1.0538
Epoch 6/10
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9898 - loss: 0.0382 - val_accuracy: 0.7458 - val_loss: 1.2056
Epoch 7/10
[1m461/461[0m 

# ----------------------------
# 5. Evaluate the Model on Test Set
# ----------------------------

In [6]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Accuracy: {accuracy:.4f}")

# Predict on the structured test set
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
# Print predictions alongside actual labels
for i in range(10):  # Display first 10 test samples
    print(f"Comment: {df.iloc[i]['Comment'][:50]}...")
    print(f"Actual Sentiment: {y_test[i]}, Predicted Sentiment: {y_pred[i]}")
    print("-" * 50)

[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7421 - loss: 1.5840
Test Accuracy: 0.7363
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Comment: lets not forget that apple pay in 2014 required a ...
Actual Sentiment: 2, Predicted Sentiment: 2
--------------------------------------------------
Comment: here in nz 50 of retailers don’t even have contact...
Actual Sentiment: 1, Predicted Sentiment: 2
--------------------------------------------------
Comment: i will forever acknowledge this channel with the h...
Actual Sentiment: 2, Predicted Sentiment: 2
--------------------------------------------------
Comment: whenever i go to a place that doesn’t take apple p...
Actual Sentiment: 1, Predicted Sentiment: 1
--------------------------------------------------
Comment: apple pay is so convenient secure and easy to use ...
Actual Sentiment: 2, Predicted Sentiment: 2
--------------------------------------------------
Comm

# ----------------------------
# 6. Predict Sentiment for a New Comment
# ----------------------------

In [7]:

def predict_sentiment(comment):
    cleaned_comment = clean_text(comment)
    sequence = tokenizer.texts_to_sequences([cleaned_comment])
    padded_sequence = pad_sequences(sequence, maxlen=max_seq_length, padding='post', truncating='post')
    
    prediction = model.predict(padded_sequence)
    sentiment_class = np.argmax(prediction)
    
    sentiment_map = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
    
    print(f"Comment: {comment}")
    print(f"Predicted Sentiment: {sentiment_map[sentiment_class]}")

# Example Usage
user_comment = "I absolutely love this product! It's cool."
predict_sentiment(user_comment)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 235ms/step
Comment: I absolutely love this product! It's cool.
Predicted Sentiment: Positive
