In [71]:
import pandas as pd
import numpy as np
!pip install nltk
!pip install vaderSentiment
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score



In [72]:
df = pd.read_csv("/content/twitter.csv")
df.head()


Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [73]:
df.tail()


Unnamed: 0,id,label,tweet
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."
31961,31962,0,thank you @user for you follow


In [74]:
def clean_text(tweet):
    text = tweet.lower()  # sab letters chhote (lowercase) mein
    text = re.sub(r'<.*?>', '', text)  # HTML tags hatao
    text = re.sub(r'\([^)]*\)', '', text)  # ( ) ke andar ka text hatao
    text = re.sub(r'\[[^\]]*\]', '', text)  # [ ] ke andar ka text hatao
    text = re.sub(r'[^\w\s]', '', text)  # punctuation hatao
    text = re.sub(r'\s+', ' ', text)  # extra spaces hatao
    return text.strip()  # shuru aur end ke spaces hatao

In [75]:
df['cleaned_review'] = df['tweet'].apply(clean_text)

In [76]:
df.head()

Unnamed: 0,id,label,tweet,cleaned_review
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is so ...
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i cant use ca...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in urð...
4,5,0,factsguide: society now #motivation,factsguide society now motivation


In [77]:
# Load stopwords
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(text):
    words = text.split()                    # split sentence into words
    words = [w for w in words if w not in stop_words]  # keep only useful words
    return ' '.join(words)                  # join them back to sentence

# Apply to the column
df['no_stopwords'] = df['cleaned_review'].apply(remove_stopwords)


In [78]:
df.head()

Unnamed: 0,id,label,tweet,cleaned_review,no_stopwords
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is so ...,user father dysfunctional selfish drags kids d...
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i cant use ca...,user user thanks lyft credit cant use cause do...
2,3,0,bihday your majesty,bihday your majesty,bihday majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in urð...,model love u take u time urð ðððð ððð
4,5,0,factsguide: society now #motivation,factsguide society now motivation,factsguide society motivation


In [79]:
df['tokenized_review'] = df['cleaned_review'].apply(nltk.word_tokenize)

In [80]:
df.tokenized_review

Unnamed: 0,tokenized_review
0,"[user, when, a, father, is, dysfunctional, and..."
1,"[user, user, thanks, for, lyft, credit, i, can..."
2,"[bihday, your, majesty]"
3,"[model, i, love, u, take, with, u, all, the, t..."
4,"[factsguide, society, now, motivation]"
...,...
31957,"[ate, user, isz, that, youuuðððððððððâï]"
31958,"[to, see, nina, turner, on, the, airwaves, try..."
31959,"[listening, to, sad, songs, on, a, monday, mor..."
31960,"[user, sikh, temple, vandalised, in, in, calga..."


In [81]:
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

df['lemmatized_review'] = df['tokenized_review'].apply(lemmatize_tokens)

In [82]:
df.head()

Unnamed: 0,id,label,tweet,cleaned_review,no_stopwords,tokenized_review,lemmatized_review
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is so ...,user father dysfunctional selfish drags kids d...,"[user, when, a, father, is, dysfunctional, and...","[user, when, a, father, is, dysfunctional, and..."
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i cant use ca...,user user thanks lyft credit cant use cause do...,"[user, user, thanks, for, lyft, credit, i, can...","[user, user, thanks, for, lyft, credit, i, can..."
2,3,0,bihday your majesty,bihday your majesty,bihday majesty,"[bihday, your, majesty]","[bihday, your, majesty]"
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in urð...,model love u take u time urð ðððð ððð,"[model, i, love, u, take, with, u, all, the, t...","[model, i, love, u, take, with, u, all, the, t..."
4,5,0,factsguide: society now #motivation,factsguide society now motivation,factsguide society motivation,"[factsguide, society, now, motivation]","[factsguide, society, now, motivation]"


In [83]:
df['lemmatized_text'] = df['lemmatized_review'].apply(lambda x: ' '.join(x))


In [84]:
sia = SentimentIntensityAnalyzer()

In [85]:
df[['neg', 'neu', 'pos', 'compound']] = df['lemmatized_text'].apply(lambda x: pd.Series(sia.polarity_scores(x)))

In [86]:
df

Unnamed: 0,id,label,tweet,cleaned_review,no_stopwords,tokenized_review,lemmatized_review,lemmatized_text,neg,neu,pos,compound
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is so ...,user father dysfunctional selfish drags kids d...,"[user, when, a, father, is, dysfunctional, and...","[user, when, a, father, is, dysfunctional, and...",user when a father is dysfunctional and is so ...,0.374,0.626,0.000,-0.8383
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i cant use ca...,user user thanks lyft credit cant use cause do...,"[user, user, thanks, for, lyft, credit, i, can...","[user, user, thanks, for, lyft, credit, i, can...",user user thanks for lyft credit i cant use ca...,0.000,0.756,0.244,0.6705
2,3,0,bihday your majesty,bihday your majesty,bihday majesty,"[bihday, your, majesty]","[bihday, your, majesty]",bihday your majesty,0.000,1.000,0.000,0.0000
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in urð...,model love u take u time urð ðððð ððð,"[model, i, love, u, take, with, u, all, the, t...","[model, i, love, u, take, with, u, all, the, t...",model i love u take with u all the time in urð...,0.000,0.756,0.244,0.6369
4,5,0,factsguide: society now #motivation,factsguide society now motivation,factsguide society motivation,"[factsguide, society, now, motivation]","[factsguide, society, now, motivation]",factsguide society now motivation,0.000,0.556,0.444,0.3400
...,...,...,...,...,...,...,...,...,...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...,ate user isz that youuuðððððððððâï,ate user isz youuuðððððððððâï,"[ate, user, isz, that, youuuðððððððððâï]","[ate, user, isz, that, youuuðððððððððâï]",ate user isz that youuuðððððððððâï,0.000,1.000,0.000,0.0000
31958,31959,0,to see nina turner on the airwaves trying to...,to see nina turner on the airwaves trying to w...,see nina turner airwaves trying wrap mantle ge...,"[to, see, nina, turner, on, the, airwaves, try...","[to, see, nina, turner, on, the, airwave, tryi...",to see nina turner on the airwave trying to wr...,0.106,0.685,0.209,0.4588
31959,31960,0,listening to sad songs on a monday morning otw...,listening to sad songs on a monday morning otw...,listening sad songs monday morning otw work sad,"[listening, to, sad, songs, on, a, monday, mor...","[listening, to, sad, song, on, a, monday, morn...",listening to sad song on a monday morning otw ...,0.360,0.640,0.000,-0.7351
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,...",user sikh temple vandalised in in calgary wso ...,user sikh temple vandalised calgary wso condem...,"[user, sikh, temple, vandalised, in, in, calga...","[user, sikh, temple, vandalised, in, in, calga...",user sikh temple vandalised in in calgary wso ...,0.268,0.732,0.000,-0.5106


In [87]:
def classify_sentiment(compound_score):
    if compound_score >= 0.340:
        return 'positive'
    elif compound_score <= -0.510:
        return 'negative'
    else:
        return 'neutral'

In [88]:
df['sentiment'] = df['compound'].apply(classify_sentiment)

In [89]:
df.sentiment

Unnamed: 0,sentiment
0,negative
1,positive
2,neutral
3,positive
4,positive
...,...
31957,neutral
31958,positive
31959,negative
31960,negative


In [90]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['lemmatized_text'])  # Features
y = df['sentiment']

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [92]:
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [93]:
y_pred = clf.predict(X_test)
print("Logistic Regression:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Logistic Regression:
              precision    recall  f1-score   support

    negative       0.84      0.39      0.53       587
     neutral       0.77      0.92      0.84      2839
    positive       0.92      0.85      0.89      2967

    accuracy                           0.84      6393
   macro avg       0.84      0.72      0.75      6393
weighted avg       0.85      0.84      0.83      6393

Accuracy: 0.8390427029563585


In [94]:
new_text = [
    "I love this product! It works perfectly and is exactly what I needed.",
    "Terrible quality, broke after one use. Very disappointed.",
    "The product is okay, not great but not bad either.",
    "Excellent quality and fast shipping. Highly recommend!",
    "Not worth the money. It doesn't perform as advertised."
]

In [95]:
new_text_vec = vectorizer.transform(new_text)

In [96]:
predictions = clf.predict(new_text_vec)

In [97]:
probabilities = clf.predict_proba(new_text_vec)
for text, pred, prob in zip(new_text, predictions, probabilities):
    print(f"Text: {text}")
    print(f"Predicted Sentiment: {pred}")
    print(f"Probabilities: {dict(zip(clf.classes_, prob))}")
    print('---')

Text: I love this product! It works perfectly and is exactly what I needed.
Predicted Sentiment: positive
Probabilities: {'negative': np.float64(0.018572177995490907), 'neutral': np.float64(0.0734814031556422), 'positive': np.float64(0.9079464188488668)}
---
Text: Terrible quality, broke after one use. Very disappointed.
Predicted Sentiment: negative
Probabilities: {'negative': np.float64(0.8390986494609696), 'neutral': np.float64(0.09706657174617313), 'positive': np.float64(0.06383477879285733)}
---
Text: The product is okay, not great but not bad either.
Predicted Sentiment: negative
Probabilities: {'negative': np.float64(0.45090702559706153), 'neutral': np.float64(0.1367772102236481), 'positive': np.float64(0.41231576417929033)}
---
Text: Excellent quality and fast shipping. Highly recommend!
Predicted Sentiment: positive
Probabilities: {'negative': np.float64(0.04194821277074417), 'neutral': np.float64(0.39521910857891296), 'positive': np.float64(0.5628326786503429)}
---
Text: Not 