In [1]:
# Importing necessary libraries for text processing, sentiment analysis, and machine learning.
import re
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, sentiwordnet as swn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import spacy
from nltk.corpus import sentiwordnet as swn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Fetch every NLTK resource this notebook relies on, each exactly once.
# 'wordnet' is required by WordNetLemmatizer and by the SentiWordNet reader
# (which resolves words through WordNet synsets), so it must be present on a
# fresh environment in addition to 'sentiwordnet' itself.
for resource in ("punkt", "stopwords", "wordnet", "sentiwordnet"):
    nltk.download(resource)

# Small English spaCy pipeline, used for POS tags and dependency heads.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package sentiwordnet to C:\Users\Aparna
[nltk_data]     Shankar\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Aparna
[nltk_data]     Shankar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Aparna
[nltk_data]     Shankar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package sentiwordnet to C:\Users\Aparna
[nltk_data]     Shankar\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


In [2]:
# Initialize stop words and lemmatizer.
# Define a function for preprocessing text data, which includes:
# - Removing URLs, mentions, hashtags, and special characters
# - Converting text to lowercase
# - Tokenizing, removing stop words, and lemmatizing tokens
# Shared helpers for preprocess_tweet, created once at module level so that
# every call reuses the same lemmatizer and stop-word lookup set.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

# Preprocessing function
def preprocess_tweet(tweet):
    """Clean a raw tweet and return its lemmatized, stop-word-free tokens.

    Steps, in order: strip URLs, @mentions and #hashtags, drop any remaining
    non-word/non-space characters, lowercase, tokenize, remove English stop
    words, and lemmatize what is left.

    Args:
        tweet: Raw tweet text. A non-string value (for example None, or the
            float NaN that pandas uses for a missing cell) is treated as an
            empty tweet instead of raising inside ``re.sub``.

    Returns:
        list[str]: The processed tokens; an empty list when there is nothing
        to process.
    """
    if not isinstance(tweet, str):
        return []

    # The substitutions run sequentially on purpose: URLs must be removed
    # before the mention/hashtag patterns see the text.
    tweet = re.sub(r"http\S+", "", tweet)  # Remove URLs
    tweet = re.sub(r"@\w+", "", tweet)    # Remove mentions
    tweet = re.sub(r"#\w+", "", tweet)    # Remove hashtags
    tweet = re.sub(r"[^\w\s]", "", tweet) # Remove special characters
    tweet = tweet.lower()                  # Stop-word list is lowercase

    # Filter stop words and lemmatize in a single pass over the tokens.
    return [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(tweet)
        if word not in stop_words
    ]

In [3]:
# Define a function to classify text as subjective or objective based on SentiWordNet.
# For each token, look up its first SentiWordNet sense and check that sense's positive and negative scores.
# If any token has a non-zero positive or negative score, the whole text is classified as subjective.

# Function to classify subjectivity
def is_subjective(tokens):
    """Tell whether a token list contains at least one sentiment-bearing word.

    Only the first SentiWordNet sense of each token is consulted; tokens with
    no SentiWordNet entry are skipped.

    Args:
        tokens: Iterable of word strings.

    Returns:
        bool: True (subjective) as soon as one token's first sense has a
        positive or negative score above zero, otherwise False (objective).
    """
    for token in tokens:
        senses = list(swn.senti_synsets(token))
        if not senses:
            continue
        first_sense = senses[0]
        if first_sense.pos_score() > 0 or first_sense.neg_score() > 0:
            return True
    return False


# Function to extract semantic associations
def semantic_association(tokens):
    """Pair each verb, adjective, noun or adverb with its dependency head.

    The tokens are joined with single spaces and parsed by the module-level
    spaCy pipeline ``nlp``.

    Args:
        tokens: Iterable of word strings.

    Returns:
        list[tuple[str, str]]: ``(token text, head text)`` for every parsed
        token whose coarse POS tag is one of the four kept tags, in document
        order.
    """
    kept_tags = ['VERB', 'ADJ','NOUN','ADV']
    parsed = nlp(" ".join(tokens))
    return [(tok.text, tok.head.text) for tok in parsed if tok.pos_ in kept_tags]

In [4]:
# Function to classify polarity
def polarity_classification(associations):
    """Turn (word, head) associations into a polarity label.

    For each pair, the first SentiWordNet sense of the word contributes
    ``pos_score - neg_score`` to a running total; the head is not scored and
    words without a SentiWordNet entry contribute nothing.

    Args:
        associations: Iterable of ``(word, head)`` pairs.

    Returns:
        int: 4 when the accumulated score is strictly positive, otherwise 0.
    """
    net_score = 0
    for token_text, _head_text in associations:
        senses = list(swn.senti_synsets(token_text))
        if not senses:
            continue
        first_sense = senses[0]
        net_score += first_sense.pos_score() - first_sense.neg_score()

    if net_score > 0:
        return 4
    return 0

In [5]:
# Load dataset
# The CSV is read with explicit column names and ISO-8859-1 decoding.
dataset_path = "Twitter_data.csv"
data = pd.read_csv(
    dataset_path,
    sep=",",
    encoding='ISO-8859-1',
    names=["polarity", "id", "date", "query", "user", "text"],
)

# Work on a 50,000-row random sample. The seed is fixed so that the sample,
# and every metric computed from it below, is identical on each
# Restart & Run All.
data = data.sample(n=50000, random_state=42)
data.head(10)

Unnamed: 0,polarity,id,date,query,user,text
1419463,4,2057973787,Sat Jun 06 14:01:13 PDT 2009,NO_QUERY,Armano,"@jbell99 hey, don't hold back, go for it. That..."
908428,4,1696215889,Mon May 04 07:50:47 PDT 2009,NO_QUERY,SickzEight,"It's a good day! The lake is a mirror, the kid..."
83816,0,1753447295,Sun May 10 01:41:01 PDT 2009,NO_QUERY,ncallender,Is feeling that my heart is in two different p...
935282,4,1792695875,Wed May 13 23:56:36 PDT 2009,NO_QUERY,neenz,@CindySJ We're fine.
1484105,4,2067606205,Sun Jun 07 12:38:31 PDT 2009,NO_QUERY,jaybranch,@fionajc3 It's quite cool.... and very easy. ...
1128598,4,1975377890,Sat May 30 15:01:55 PDT 2009,NO_QUERY,Tammy09x,@ReeceNoi aww :-| will you be coming back to w...
1223253,4,1990386823,Mon Jun 01 04:43:51 PDT 2009,NO_QUERY,linabrynn,"for once, im pretty glad its monday exactly t..."
1466672,4,2064506512,Sun Jun 07 06:27:21 PDT 2009,NO_QUERY,chrisoakley,"@kkfla737 Very good, thank you! That said, it..."
512360,0,2190121240,Tue Jun 16 01:49:44 PDT 2009,NO_QUERY,MorgaineNYC,"@kittyy79 Sadly, my client had a different idea."
1189098,4,1983452222,Sun May 31 13:10:58 PDT 2009,NO_QUERY,itsM0RGAN,baccalaurate? that's a complete guess


In [6]:
# Run the rule-based sentiment pipeline over the sampled tweets:
# tokenize each tweet, flag subjectivity, extract (token, head) associations,
# and derive a predicted polarity from them (4 when the summed score is positive, otherwise 0).

# Preprocess tweets
# Keep only the label and the text. The explicit copy makes `data` an
# independent frame, so the column assignments below do not write into a
# slice of the previous frame (the SettingWithCopyWarning pattern).
data = data[["polarity", "text"]].copy()
data["tokens"] = data["text"].apply(preprocess_tweet)

# Apply custom sentiment system
data["subjective"] = data["tokens"].apply(is_subjective)
data["associations"] = data["tokens"].apply(semantic_association)
data["predicted_polarity"] = data["associations"].apply(polarity_classification)

In [12]:
# Inspect the (token, head) pairs stored in the "associations" column.
data["associations"]

1419463    [(hold, hold), (go, hold), (s, hold), (youtube...
908428     [(good, day), (day, happy), (happy, get), (get...
83816      [(feeling, ve), (heart, feeling), (different, ...
935282                                        [(fine, fine)]
1484105    [(quite, cool), (cool, info), (easy, visit), (...
                                 ...                        
147561     [(stop, stop), (asking, stop), (play, football...
47059      [(condom, aid), (aid, thing), (thing, m), (tru...
184243     [(there, nothing), (available, nothing), (phx,...
265997     [(seen, training), (training, starting), (star...
903268                                           [(sho, fo)]
Name: associations, Length: 50000, dtype: object

In [16]:
# Preview the first 10 rows of the frame, including the derived columns.
data.head(10)

Unnamed: 0,polarity,text,tokens,subjective,associations,predicted_polarity
1419463,4,"@jbell99 hey, don't hold back, go for it. That...","[hey, dont, hold, back, go, thats, youtube]",False,"[(hold, hold), (go, hold), (s, hold), (youtube...",0
908428,4,"It's a good day! The lake is a mirror, the kid...","[good, day, lake, mirror, kid, happy, get, cle...",True,"[(good, day), (day, happy), (happy, get), (get...",4
83816,0,Is feeling that my heart is in two different p...,"[feeling, heart, two, different, place, today,...",True,"[(feeling, ve), (heart, feeling), (different, ...",4
935282,4,@CindySJ We're fine.,[fine],True,"[(fine, fine)]",0
1484105,4,@fionajc3 It's quite cool.... and very easy. ...,"[quite, cool, easy, visit, info]",True,"[(quite, cool), (cool, info), (easy, visit), (...",0
1128598,4,@ReeceNoi aww :-| will you be coming back to w...,"[aww, coming, back, waterloo, road, x]",False,"[(aww, coming), (coming, coming), (back, comin...",0
1223253,4,"for once, im pretty glad its monday exactly t...","[im, pretty, glad, monday, exactly, two, month...",True,"[(m, m), (pretty, glad), (glad, m), (exactly, ...",4
1466672,4,"@kkfla737 Very good, thank you! That said, it...","[good, thank, said, totally, wrong, time, zone...",True,"[(thank, said), (said, said), (totally, wrong)...",0
512360,0,"@kittyy79 Sadly, my client had a different idea.","[sadly, client, different, idea]",True,"[(sadly, client), (client, client), (different...",4
1189098,4,baccalaurate? that's a complete guess,"[baccalaurate, thats, complete, guess]",True,"[(baccalaurate, baccalaurate), (s, baccalaurat...",0


In [7]:
# Evaluate Proposed System
# Compare the rule-based predictions with the gold labels; precision, recall
# and F1 are support-weighted averages over the classes.
y_true = data["polarity"]
y_pred = data["predicted_polarity"]

proposed_accuracy = accuracy_score(y_true, y_pred)
proposed_precision = precision_score(y_true, y_pred, average="weighted")
proposed_recall = recall_score(y_true, y_pred, average="weighted")
proposed_f1 = f1_score(y_true, y_pred, average="weighted")

report_lines = [
    "Proposed System:",
    f"Accuracy: {proposed_accuracy:.4f}",
    f"Precision: {proposed_precision:.4f}",
    f"Recall: {proposed_recall:.4f}",
    f"F1-Score: {proposed_f1:.4f}",
]
print("\n".join(report_lines))

Proposed System:
Accuracy: 0.5634
Precision: 0.5642
Recall: 0.5634
F1-Score: 0.5624


In [8]:
# Machine Learning benchmarking
y = data["polarity"]

# Split the token lists first so that the vocabulary below is learned from the
# training rows only; fitting the vectorizer on every row would let tokens
# from the test split shape the feature space (data leakage).
tokens_train, tokens_test, y_train, y_test = train_test_split(
    data["tokens"], y, test_size=0.2, random_state=42
)

# Tweets are already tokenized, so the analyzer passes each token list through
# unchanged and CountVectorizer only counts the tokens.
vectorizer = CountVectorizer(analyzer=lambda x: x)
X_train = vectorizer.fit_transform(tokens_train)
X_test = vectorizer.transform(tokens_test)

# Bag-of-words matrix for all rows under the train-fitted vocabulary, so the
# name `X` remains available.
X = vectorizer.transform(data["tokens"])

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)
print("Logistic Regression:\n", classification_report(y_test, lr_preds))

Logistic Regression:
               precision    recall  f1-score   support

           0       0.76      0.72      0.74      5015
           4       0.73      0.77      0.75      4985

    accuracy                           0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000



In [9]:
# Naive Bayes
# Train a multinomial Naive Bayes model on the token-count features and
# report per-class metrics on the held-out split.
nb = MultinomialNB().fit(X_train, y_train)
nb_preds = nb.predict(X_test)
nb_report = classification_report(y_test, nb_preds)
print("Naive Bayes:\n", nb_report)

Naive Bayes:
               precision    recall  f1-score   support

           0       0.73      0.75      0.74      5015
           4       0.74      0.72      0.73      4985

    accuracy                           0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000



In [10]:
# Support Vector Machine
# Train an SVC with the library's default settings on the same features and
# report per-class metrics on the held-out split.
svm = SVC().fit(X_train, y_train)
svm_preds = svm.predict(X_test)
svm_report = classification_report(y_test, svm_preds)
print("Support Vector Machine:\n", svm_report)

Support Vector Machine:
               precision    recall  f1-score   support

           0       0.77      0.71      0.74      5015
           4       0.73      0.78      0.76      4985

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000

