In [1]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import re
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import save_model

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# 2. Load Dataset
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/YoutubeCommentsDataSet.csv')  # Adjust file name
print("Dataset sample:")
print(df.head())
print("\nClass distribution:")
print(df['Sentiment'].value_counts())

Dataset sample:
                                             Comment Sentiment
0  lets not forget that apple pay in 2014 require...   neutral
1  here in nz 50 of retailers don’t even have con...  negative
2  i will forever acknowledge this channel with t...  positive
3  whenever i go to a place that doesn’t take app...  negative
4  apple pay is so convenient secure and easy to ...  positive

Class distribution:
Sentiment
positive    11432
neutral      4638
negative     2338
Name: count, dtype: int64


In [6]:
# 3. Preprocess Text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['clean_comment'] = df['Comment'].apply(clean_text)


In [7]:
# 4. Map Sentiments to Numeric Labels
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df = df[df['Sentiment'].isin(label_map)]
df['label'] = df['Sentiment'].map(label_map)

In [8]:
# 5. Train–Test Split
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

In [9]:
# 6. Tokenize & Pad Sequences
max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# Convert labels to categorical
y_train_cat = to_categorical(y_train, num_classes=3)
y_test_cat = to_categorical(y_test, num_classes=3)

In [10]:
# 7. Build LSTM Sentiment Model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    LSTM(128),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [11]:
# 8. Train the Model
history = model.fit(
    X_train_pad, y_train_cat,
    validation_data=(X_test_pad, y_test_cat),
    epochs=51,
    batch_size=64
)


Epoch 1/51
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - accuracy: 0.6097 - loss: 0.9291 - val_accuracy: 0.6211 - val_loss: 0.9023
Epoch 2/51
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6216 - loss: 0.9063 - val_accuracy: 0.6317 - val_loss: 0.9094
Epoch 3/51
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6341 - loss: 0.8962 - val_accuracy: 0.6247 - val_loss: 0.9001
Epoch 4/51
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.6302 - loss: 0.8946 - val_accuracy: 0.6249 - val_loss: 0.9034
Epoch 5/51
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.6448 - loss: 0.8764 - val_accuracy: 0.6222 - val_loss: 0.9086
Epoch 6/51
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6440 - loss: 0.8752 - val_accuracy: 0.6222 - val_loss: 0.9262
Epoch 7/51
[1m231/23

In [12]:
# 9. Evaluate Model
loss, accuracy = model.evaluate(X_test_pad, y_test_cat)
print(f"\nTest Accuracy: {accuracy:.4f}")

[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.7129 - loss: 2.8102

Test Accuracy: 0.7042


In [18]:
# 10. Prediction Function for Custom Text
def predict_sentiment(text):
    text_clean = clean_text(text)
    seq = tokenizer.texts_to_sequences([text_clean])
    pad = pad_sequences(seq, maxlen=max_len, padding='post')
    pred = model.predict(pad, verbose=0)

    classes = ["Negative", "Neutral", "Positive"]
    sentiment = classes[np.argmax(pred)]
    confidence = float(np.max(pred))

    return {"text": text, "sentiment": sentiment, "confidence": confidence}

# 🔹 Interactive Comment Input
while True:
    user_input = input("\nEnter a comment (or type 'exit' to quit): ")
    if user_input.lower() == "exit":
        break
    result = predict_sentiment(user_input)
    print(f"Comment: {result['text']}")
    print(f"Predicted Sentiment: {result['sentiment']} (Confidence: {result['confidence']:.2f})")



Enter a comment (or type 'exit' to quit): sonu
Comment: sonu
Predicted Sentiment: Neutral (Confidence: 0.81)

Enter a comment (or type 'exit' to quit): exit


In [14]:
# 🔹 Test with Custom Inputs
print(predict_sentiment("I love this video! It's amazing."))
print(predict_sentiment("This is the worst video ever."))
print(predict_sentiment("The video is okay, not too bad but not great either."))

{'text': "I love this video! It's amazing.", 'sentiment': 'Positive', 'confidence': 0.9999997615814209}
{'text': 'This is the worst video ever.', 'sentiment': 'Negative', 'confidence': 0.9999986886978149}
{'text': 'The video is okay, not too bad but not great either.', 'sentiment': 'Negative', 'confidence': 0.965998113155365}


In [19]:
# 11. Save Model and Tokenizer for Django
model_dir = 'sentiment_model'
os.makedirs(model_dir, exist_ok=True)

# Save model
model_path = os.path.join(model_dir, 'sentiment_lstm.h5')
save_model(model, model_path)
print(f"Saved model to {model_path}")

# Save tokenizer
import pickle
tokenizer_path = os.path.join(model_dir, 'tokenizer.pkl')
with open(tokenizer_path, 'wb') as f:
    pickle.dump(tokenizer, f)
print(f"Saved tokenizer to {tokenizer_path}")



Saved model to sentiment_model/sentiment_lstm.h5
Saved tokenizer to sentiment_model/tokenizer.pkl
