In [1]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
dataset = pd.read_csv('/content/reddit_preprocessing.csv')

# Drop rows with NaN values in 'clean_comment'
cleaned_dataset = dataset.dropna()

In [4]:
# Separate features and target
X_cleaned = cleaned_dataset['clean_comment']
y_cleaned = cleaned_dataset['category']

In [5]:
# Split the cleaned data into train and test sets (80-20 split)
X_train_cleaned, X_test_cleaned, y_train_cleaned, y_test_cleaned = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=42)

In [6]:
# Apply TfidfVectorizer with trigram setting and max_features=1000
tfidf_cleaned = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)

In [7]:
# Fit the vectorizer on the training data and transform both train and test sets
X_train_tfidf_cleaned = tfidf_cleaned.fit_transform(X_train_cleaned)
X_test_tfidf_cleaned = tfidf_cleaned.transform(X_test_cleaned)

In [28]:
best_model = lgb.LGBMClassifier(
    learning_rate=0.078,
    n_estimators=658,
    max_depth=15,
    num_leaves=24,
    min_child_samples=10,
    colsample_bytree=0.8067,
    subsample=0.9808,
    reg_alpha=0.0057,
    reg_lambda=0.018,
    class_weight='balanced',
    objective='multiclass',
    num_class=3,
    random_state=42
)


In [29]:
# Fit the model on the resampled training data
best_model.fit(X_train_tfidf_cleaned, y_train_cleaned)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 5.590806 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 156191
[LightGBM] [Info] Number of data points in the train set: 29329, number of used features: 8594
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [31]:
# Predict on the train set
y_train_pred = best_model.predict(X_train_tfidf_cleaned)



In [32]:
# Calculate accuracy on the test set
accuracy_train = accuracy_score(y_train_cleaned, y_train_pred)
accuracy_train

0.9516860445293055

In [33]:
# Generate classification report
report_train = classification_report(y_train_cleaned, y_train_pred)
print(report_train)

              precision    recall  f1-score   support

          -1       0.95      0.94      0.94      6601
           0       0.91      0.99      0.95     10134
           1       0.99      0.93      0.96     12594

    accuracy                           0.95     29329
   macro avg       0.95      0.95      0.95     29329
weighted avg       0.95      0.95      0.95     29329



In [34]:
# Predict on the test set
y_pred = best_model.predict(X_test_tfidf_cleaned)



In [35]:
# Calculate accuracy on the test set
accuracy = accuracy_score(y_test_cleaned, y_pred)
accuracy

0.8784944770216828

In [36]:
# Generate classification report
report = classification_report(y_test_cleaned, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.82      0.79      0.81      1647
           0       0.86      0.98      0.91      2510
           1       0.93      0.85      0.88      3176

    accuracy                           0.88      7333
   macro avg       0.87      0.87      0.87      7333
weighted avg       0.88      0.88      0.88      7333



In [40]:
import re
import numpy as np

# Assuming you have pre-trained tfidf_vectorizer and lgbm_model loaded
# tfidf_vectorizer: Your trained TF-IDF vectorizer
# lgbm_model: Your trained LightGBM model

# Function to clean and preprocess a YouTube comment (same as used during training)
def preprocess_comment(comment):
    # Lowercasing
    comment = comment.lower()

    # Remove special characters, URLs, punctuation, and extra spaces
    comment = re.sub(r"http\S+|www\S+|https\S+", '', comment, flags=re.MULTILINE)  # Remove URLs
    comment = re.sub(r'\W', ' ', comment)  # Remove special characters
    comment = re.sub(r'\s+', ' ', comment).strip()  # Remove extra spaces and newlines

    return comment

# Prediction function
def predict_sentiment(comment, tfidf_vectorizer, lgbm_model):
    # Step 1: Preprocess the YouTube comment
    cleaned_comment = preprocess_comment(comment)

    # Step 2: Transform the comment using the trained TF-IDF vectorizer
    comment_tfidf = tfidf_vectorizer.transform([cleaned_comment])

    # Step 3: Use the trained LightGBM model to predict the sentiment
    prediction = lgbm_model.predict(comment_tfidf)
    prediction_proba = lgbm_model.predict_proba(comment_tfidf)
    print(prediction_proba)


    # Step 4: Get the predicted sentiment (label) and probability
    sentiment_class = np.argmax(prediction_proba)
    print("argsort",np.argsort(prediction_proba))
    sentiment_proba = np.max(prediction_proba)

    # Step 5: Return the sentiment label and confidence
    return {
        'sentiment_class': int(prediction[0]),  # -1, 0, or 1 depending on your labels
        'confidence': sentiment_proba
    }



In [44]:
# Example usage:
comment1 = "I absolutely hate this video!"
comment2 = "The explanations were confusing and the video quality was poor."
comment3 = "I didn’t learn anything useful. Really disappointed."
comment4 = "Wow, the explanation was so clear and helpful. Definitely subscribing!"
comment5 = "This is the worst video I’ve seen on this topic, very misleading"
comment6 = "Not much to say about this, just a standard video."
comment7 = "The video is okay, but I expected more depth in the content."
comment8 = "Superb content! Mazaa aa gaya dekh ke. Best video on this topic!"
comment9 = "Poor video quality aur explanation bhi weak tha."
comment10 = "Yeh video theek tha, but I was expecting more depth."


In [47]:
result = predict_sentiment(comment5, tfidf_cleaned, best_model)
print(f"Predicted Sentiment: {result['sentiment_class']}, Confidence: {result['confidence']}")

[[0.84421501 0.13359692 0.02218806]]
argsort [[2 1 0]]
Predicted Sentiment: -1, Confidence: 0.8442150133551525




In [48]:
from sklearn.linear_model import LogisticRegression

best_model = LogisticRegression(
    penalty="l1",
    C=2.2619091941442075,
    solver="liblinear",
    max_iter=352,
    class_weight="balanced",      # Handles imbalance in target classes
    multi_class="auto",           # Auto-detects binary or multi-class
    random_state=42
)

In [50]:
# Fit the model on the resampled training data
best_model.fit(X_train_tfidf_cleaned, y_train_cleaned)



In [51]:
# Predict on the train set
y_train_pred = best_model.predict(X_train_tfidf_cleaned)

In [52]:
# Calculate accuracy on the test set
accuracy_train = accuracy_score(y_train_cleaned, y_train_pred)
accuracy_train

0.9247161512496164

In [53]:
# Generate classification report
report_train = classification_report(y_train_cleaned, y_train_pred)
print(report_train)

              precision    recall  f1-score   support

          -1       0.90      0.88      0.89      6601
           0       0.90      0.98      0.94     10134
           1       0.96      0.91      0.93     12594

    accuracy                           0.92     29329
   macro avg       0.92      0.92      0.92     29329
weighted avg       0.93      0.92      0.92     29329



In [54]:
# Predict on the test set
y_pred = best_model.predict(X_test_tfidf_cleaned)

In [55]:
# Calculate accuracy on the test set
accuracy = accuracy_score(y_test_cleaned, y_pred)
accuracy

0.8798581753716078

In [56]:
# Generate classification report
report = classification_report(y_test_cleaned, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.84      0.78      0.81      1647
           0       0.85      0.97      0.91      2510
           1       0.93      0.86      0.89      3176

    accuracy                           0.88      7333
   macro avg       0.87      0.87      0.87      7333
weighted avg       0.88      0.88      0.88      7333



In [64]:
import re
import numpy as np

# Assuming you have pre-trained tfidf_vectorizer and lgbm_model loaded
# tfidf_vectorizer: Your trained TF-IDF vectorizer
# lgbm_model: Your trained LightGBM model

# Function to clean and preprocess a YouTube comment (same as used during training)
def preprocess_comment(comment):
    # Lowercasing
    comment = comment.lower()

    # Remove special characters, URLs, punctuation, and extra spaces
    comment = re.sub(r"http\S+|www\S+|https\S+", '', comment, flags=re.MULTILINE)  # Remove URLs
    comment = re.sub(r'\W', ' ', comment)  # Remove special characters
    comment = re.sub(r'\s+', ' ', comment).strip()  # Remove extra spaces and newlines

    return comment

# Prediction function
def predict_sentiment(comment, tfidf_vectorizer, lgbm_model):
    # Step 1: Preprocess the YouTube comment
    cleaned_comment = preprocess_comment(comment)

    # Step 2: Transform the comment using the trained TF-IDF vectorizer
    comment_tfidf = tfidf_vectorizer.transform([cleaned_comment])

    # Step 3: Use the trained LightGBM model to predict the sentiment
    prediction = lgbm_model.predict(comment_tfidf)
    prediction_proba = lgbm_model.predict_proba(comment_tfidf)
    print(prediction_proba)


    # Step 4: Get the predicted sentiment (label) and probability
    sentiment_class = np.argmax(prediction_proba)
    print("senti class",sentiment_class)
    sentiment_proba = np.max(prediction_proba)
    print("senti prob",sentiment_proba)

    # Step 5: Return the sentiment label and confidence
    return {
        'sentiment_class': int(prediction[0]),  # -1, 0, or 1 depending on your labels
        'confidence': sentiment_proba
    }



In [65]:
# Example usage:
comment1 = "I absolutely hate this video!"
comment2 = "The explanations were confusing and the video quality was poor."
comment3 = "I didn’t learn anything useful. Really disappointed."
comment4 = "Wow, the explanation was so clear and helpful. Definitely subscribing!"
comment5 = "This is the worst video I’ve seen on this topic, very misleading"
comment6 = "Not much to say about this, just a standard video."
comment7 = "The video is okay, but I expected more depth in the content."
comment8 = "Superb content! Mazaa aa gaya dekh ke. Best video on this topic!"
comment9 = "Poor video quality aur explanation bhi weak tha."
comment10 = "Yeh video theek tha, but I was expecting more depth."


In [67]:
result = predict_sentiment(comment10, tfidf_cleaned, best_model)
print(f"Predicted Sentiment: {result['sentiment_class']}, Confidence: {result['confidence']}")

[0]
[[0.09739367 0.86082951 0.04177682]]
senti class 1
senti prob 0.8608295148236361
Predicted Sentiment: 0, Confidence: 0.8608295148236361


In [68]:
comment11 = "This was a complete waste of time. Nothing made sense."
comment12 = "I regret clicking on this video. Very disappointing."
comment13 = "Terrible editing, bad audio, and confusing flow throughout."
comment14 = "Excellent presentation and editing — love your work!"
comment15 = "This helped me so much. You earned a subscriber today!"
comment16 = "Good attempt, but could’ve been better."
comment17 = "The content was average — not too good, not too bad."
comment18 = "Basic video. Got the job done but didn’t impress me."


In [70]:
result = predict_sentiment(comment13, tfidf_cleaned, best_model)
print(f"Predicted Sentiment: {result['sentiment_class']}, Confidence: {result['confidence']}")

[-1]
[[9.95372434e-01 4.61981255e-03 7.75383293e-06]]
senti class 0
senti prob 0.9953724336168063
Predicted Sentiment: -1, Confidence: 0.9953724336168063
