In [1]:
import pandas as pd
df=pd.read_csv("GBcomments.csv", on_bad_lines='skip')

In [2]:
df.shape

(718452, 4)

In [3]:
df.head()

Unnamed: 0,video_id,comment_text,likes,replies
0,jt2OHQh0HoQ,It's more accurate to call it the M+ (1000) be...,0,0
1,jt2OHQh0HoQ,To be there with a samsung phone\n😂😂😂,1,0
2,jt2OHQh0HoQ,"Thank gosh, a place I can watch it without hav...",0,0
3,jt2OHQh0HoQ,What happened to the home button on the iPhone...,0,0
4,jt2OHQh0HoQ,Power is the disease. Care is the cure. Keep...,0,0


In [4]:
df.isna().sum()

video_id         0
comment_text    28
likes            0
replies          0
dtype: int64

In [5]:
df.dropna(subset=['comment_text'],inplace=True)

In [6]:
df.isna().sum()

video_id        0
comment_text    0
likes           0
replies         0
dtype: int64

In [7]:
import re,pdb
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower()  # Lowercase
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
#     pdb.set_trace()
    return ' '.join(tokens)

In [8]:
df['comment_text'] = df['comment_text'].apply(preprocess_text)

In [9]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [10]:
# Analyze sentiment
df['sentiment'] = df['comment_text'].apply(lambda x: sia.polarity_scores(x)['compound'])

In [11]:
# Classify sentiment as Positive, Negative, or Neutral
def classify_sentiment(score):
    if score > 0.05:
        return 'Positive'
    elif score < -0.05:
        return 'Negative'
    else:
        return 'Neutral'

In [12]:
df.head()

Unnamed: 0,video_id,comment_text,likes,replies,sentiment
0,jt2OHQh0HoQ,accurate call price closer calling x,0,0,0.0
1,jt2OHQh0HoQ,samsung phonen,1,0,0.0
2,jt2OHQh0HoQ,thank gosh place watch without hd speed doesnt...,0,0,0.0624
3,jt2OHQh0HoQ,happened home button iphone x coughcopying sam...,0,0,0.0
4,jt2OHQh0HoQ,power disease care cure keep caring others bes...,0,0,0.891


In [13]:
df['sentiment_label'] = df['sentiment'].apply(classify_sentiment)

In [14]:
df.head()

Unnamed: 0,video_id,comment_text,likes,replies,sentiment,sentiment_label
0,jt2OHQh0HoQ,accurate call price closer calling x,0,0,0.0,Neutral
1,jt2OHQh0HoQ,samsung phonen,1,0,0.0,Neutral
2,jt2OHQh0HoQ,thank gosh place watch without hd speed doesnt...,0,0,0.0624,Positive
3,jt2OHQh0HoQ,happened home button iphone x coughcopying sam...,0,0,0.0,Neutral
4,jt2OHQh0HoQ,power disease care cure keep caring others bes...,0,0,0.891,Positive


In [15]:
df.sentiment_label.value_counts()

sentiment_label
Positive    344384
Neutral     245497
Negative    128543
Name: count, dtype: int64

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['comment_text'])

In [17]:
X

<718424x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 3351518 stored elements in Compressed Sparse Row format>

In [18]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Transform sentiment_label to integers
df['sentiment_label_encoded'] = label_encoder.fit_transform(df['sentiment_label'])

# Check the mapping
df[['sentiment_label_encoded','sentiment_label']]


Unnamed: 0,sentiment_label_encoded,sentiment_label
0,1,Neutral
1,1,Neutral
2,2,Positive
3,1,Neutral
4,2,Positive
...,...,...
718447,1,Neutral
718448,2,Positive
718449,1,Neutral
718450,1,Neutral


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment_label'], test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment_label_encoded'], test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
y_pred

array([2, 2, 0, ..., 2, 1, 1])

In [26]:
def predict_sentiment(new_text):
    # Step 1: Preprocess the text
    cleaned_text = preprocess_text(new_text)
    
    # Step 2: Convert text to TF-IDF vector using trained vectorizer
    text_vector = vectorizer.transform([cleaned_text])
    
    # Step 3: Predict sentiment using trained model
    predicted_label = model.predict(text_vector)[0]  # Ensure this returns an integer
    
    # Step 4: Convert numeric label back to original sentiment
    sentiment = label_encoder.inverse_transform([predicted_label])[0]
    
    return sentiment

In [27]:
# Example tests
new_comments = [
    "This product is absolutely amazing! I love it.",
    "Worst experience ever, I regret buying this.",
    "It's okay, not too bad, not too great either."
]

In [29]:
for comment in new_comments:
    print(f"Comment: {comment}\nPredicted Sentiment: {predict_sentiment(comment)}\n")

Comment: This product is absolutely amazing! I love it.
Predicted Sentiment: Positive

Comment: Worst experience ever, I regret buying this.
Predicted Sentiment: Negative

Comment: It's okay, not too bad, not too great either.
Predicted Sentiment: Positive

