In [None]:
# =========================================================
# YouTube Comment Sentiment Analyzer (Machine Learning)
# =========================================================

# 1️⃣ Import Libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fetch the NLTK corpora used below: 'stopwords' backs stopwords.words() and
# 'wordnet' backs WordNetLemmatizer. 'omw-1.4' is requested as well because
# some NLTK releases raise LookupError on the first lemmatize() call when the
# Open Multilingual WordNet data is missing; downloading it is harmless on
# releases that do not need it.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# =========================================================
# 2️⃣ Load Dataset and Keep Relevant Columns
# =========================================================
# Read the comment export, keep only the text and label columns under
# snake_case names, and discard rows where either value is missing.
df = (
    pd.read_csv('youtube_comments_cleaned.csv')
    .loc[:, ['CommentText', 'Sentiment']]
    .rename(columns={'CommentText': 'comment_text', 'Sentiment': 'sentiment'})
    .dropna(subset=['comment_text', 'sentiment'])
)

# =========================================================
# 3️⃣ Text Preprocessing
# =========================================================
# Negation words are kept out of the stop-word set: dropping "no"/"nor"/"not"
# would turn "not good" into "good" before vectorization and erase the signal
# a sentiment classifier depends on. Any code that reuses the saved model must
# apply this same stop-word set when preprocessing new comments.
NEGATION_WORDS = {'no', 'nor', 'not'}
stop_words = set(stopwords.words('english')) - NEGATION_WORDS
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a raw comment into lowercase, letters-only text.

    The input is coerced with ``str()`` and lowercased, then a fixed sequence
    of regex substitutions is applied in order: URLs starting with ``http``,
    ``@mentions`` and ``#hashtags`` are removed, every character that is not
    ``a-z`` or whitespace is deleted (not replaced by a space, so
    ``"good.bad"`` becomes ``"goodbad"``), and whitespace runs collapse to a
    single space.

    Args:
        text: Any object; non-strings are converted with ``str()``.

    Returns:
        The cleaned string with leading/trailing whitespace stripped.
    """
    # Order matters: URLs, mentions and hashtags must be removed while their
    # marker characters (':', '/', '@', '#') are still present.
    substitutions = (
        (r'http\S+', ''),
        (r'@\w+', ''),
        (r'#\w+', ''),
        (r'[^a-z\s]', ''),
        (r'\s+', ' '),
    )
    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def preprocess(text):
    """Drop stop words from whitespace-separated text and lemmatize the rest.

    Args:
        text: A string; it is tokenized with ``str.split()``.

    Returns:
        The surviving tokens, each passed through ``lemmatizer.lemmatize``,
        joined by single spaces. Empty if every token is in ``stop_words``.
    """
    kept = []
    for token in text.split():
        # Membership is tested on the raw token, before lemmatization.
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Normalize each raw comment, then remove stop words and lemmatize; the result
# is stored next to the original text.
df['processed'] = df['comment_text'].map(lambda raw: preprocess(clean_text(raw)))

# =========================================================
# 4️⃣ Encode Labels
# =========================================================
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the sentiment column and store the integer codes it
# assigns; `le` is kept so predictions can be mapped back to sentiment values.
le = LabelEncoder().fit(df['sentiment'])
df['label'] = le.transform(df['sentiment'])

# =========================================================
# 5️⃣ Split Dataset
# =========================================================
X = df['processed']
y = df['label']

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps each
# sentiment class at roughly the same proportion in both splits, so a rare
# class is not over- or under-represented in the test set by chance.
# Stratification raises ValueError when a class is too small to be divided
# (e.g. a single example); in that case fall back to the plain random split.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
except ValueError:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

# =========================================================
# 6️⃣ TF-IDF Vectorization
# =========================================================
# Fit the TF-IDF vectorizer (vocabulary limited through max_features) on the
# training split only, then transform the test split with that fitted
# vectorizer so both matrices share one vocabulary.
tfidf_options = {'max_features': 5000}
vectorizer = TfidfVectorizer(**tfidf_options)
X_train_vec, X_test_vec = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

# =========================================================
# 7️⃣ Train Machine Learning Model
# =========================================================
# Option 1: Logistic Regression
# Fit a logistic-regression classifier on the TF-IDF training matrix, with the
# solver's iteration limit set to 1000.
clf = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Option 2 (alternative): Naive Bayes
# clf = MultinomialNB()
# clf.fit(X_train_vec, y_train)

# =========================================================
# 8️⃣ Evaluate Model
# =========================================================
y_pred = clf.predict(X_test_vec)

# Report per-class metrics under the original sentiment names instead of the
# encoder's integer codes. Passing the full label list explicitly keeps the
# report rows and the confusion-matrix rows/columns in encoder order even when
# a class happens to be absent from the test split.
class_ids = list(range(len(le.classes_)))
class_names = [str(name) for name in le.classes_]

print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, labels=class_ids, target_names=class_names),
)
print("Confusion matrix rows/columns:", class_names)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=class_ids))

# =========================================================
# 9️⃣ Sample Prediction
# =========================================================
def predict_sentiment(comment):
    """Predict the sentiment value for a single raw comment.

    The comment goes through the same ``clean_text`` and ``preprocess`` steps
    as the training data, is vectorized with the fitted ``vectorizer``,
    classified by ``clf``, and the predicted code is mapped back through
    ``le``.

    Args:
        comment: Raw comment text.

    Returns:
        The sentiment value (as found in the original ``sentiment`` column)
        for the predicted class.
    """
    features = vectorizer.transform([preprocess(clean_text(comment))])
    # Exactly one row was vectorized, so exactly one prediction comes back.
    (encoded_label,) = clf.predict(features)
    return le.inverse_transform([encoded_label])[0]

# Smoke-test the end-to-end prediction path on one hand-written comment.
sample_comment = "I love this video! Very helpful."
sample_prediction = predict_sentiment(sample_comment)
print("Sample Comment Prediction:", sample_prediction)

# =========================================================
# 10️⃣ Optional: Visualize Sentiment Distribution
# =========================================================
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many comments carry each sentiment value; the title is set
# on the Axes that countplot returns.
ax = sns.countplot(x='sentiment', data=df)
ax.set_title('Original Sentiment Distribution')
plt.show()


In [None]:
# Second smoke test, this time with a hand-written negative comment.
sample_comment = "I hate this video."
sample_prediction = predict_sentiment(sample_comment)
print("Sample Comment Prediction:", sample_prediction)

In [None]:
import joblib

# Persist the three fitted objects that predict_sentiment relies on: the
# classifier, the TF-IDF vectorizer and the label encoder. The files are
# written to the current working directory. The text-cleaning functions and
# the stop-word set are NOT saved, so code that loads these files must
# reproduce the same preprocessing itself.
joblib.dump(clf, "sentiment_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(le, "label_encoder.pkl")
