In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load dataset from CSV
df = pd.read_csv("YoutubeCommentsDataSet.csv")

In [2]:
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [3]:
# Preprocess text (lowercase, remove special characters)
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters
    return text

df["clean_comment"] = df["Comment"].apply(preprocess_text)

df.head()

Unnamed: 0,Comment,Sentiment,clean_comment
0,lets not forget that apple pay in 2014 require...,neutral,lets not forget that apple pay in required a ...
1,here in nz 50 of retailers don’t even have con...,negative,here in nz of retailers dont even have contac...
2,i will forever acknowledge this channel with t...,positive,i will forever acknowledge this channel with t...
3,whenever i go to a place that doesn’t take app...,negative,whenever i go to a place that doesnt take appl...
4,apple pay is so convenient secure and easy to ...,positive,apple pay is so convenient secure and easy to ...


In [10]:
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Balance the dataset
df_negative = df[df["Sentiment"] == "negative"]
df_neutral = df[df["Sentiment"] == "neutral"]
df_positive = df[df["Sentiment"] == "positive"]

In [12]:
# Determine the target sample size (match the largest class)
target_size = max(len(df_negative), len(df_neutral), len(df_positive))

# Upsample minority classes
df_negative_upsampled = resample(df_negative, replace=True, n_samples=target_size, random_state=42)
df_neutral_upsampled = resample(df_neutral, replace=True, n_samples=target_size, random_state=42)
df_positive_upsampled = resample(df_positive, replace=True, n_samples=target_size, random_state=42)

# Combine all balanced classes
df_balanced = pd.concat([df_negative_upsampled, df_neutral_upsampled, df_positive_upsampled])

# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [13]:
# Convert text to numerical representation using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["clean_comment"])
y = df["Sentiment"]

# Train-test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False)  # Needed because TF-IDF is sparse
# Logistic Regression works better when input data is scaled.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model (Logistic Regression)
model = LogisticRegression(solver='saga', max_iter=3500)
model.fit(X_train_scaled, y_train)

In [16]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.70
              precision    recall  f1-score   support

    negative       0.50      0.34      0.41       475
     neutral       0.56      0.48      0.52       937
    positive       0.77      0.86      0.81      2270

    accuracy                           0.70      3682
   macro avg       0.61      0.56      0.58      3682
weighted avg       0.68      0.70      0.69      3682



In [17]:
import joblib

# Save the trained model
joblib.dump(model, "sentiment_model.pkl")

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "SVM": SVC(kernel='linear')
}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    print(f"{name} Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(classification_report(y_test, y_pred))
    print("-" * 50)

Logistic Regression Accuracy: 0.69
              precision    recall  f1-score   support

    negative       0.46      0.31      0.37       475
     neutral       0.53      0.53      0.53       937
    positive       0.78      0.83      0.81      2270

    accuracy                           0.69      3682
   macro avg       0.59      0.56      0.57      3682
weighted avg       0.68      0.69      0.68      3682

--------------------------------------------------
Random Forest Accuracy: 0.69
              precision    recall  f1-score   support

    negative       0.92      0.02      0.05       475
     neutral       0.74      0.37      0.49       937
    positive       0.68      0.96      0.79      2270

    accuracy                           0.69      3682
   macro avg       0.78      0.45      0.44      3682
weighted avg       0.72      0.69      0.62      3682

--------------------------------------------------
SVM Accuracy: 0.65
              precision    recall  f1-score   support