In [2]:
!pip install nltk gensim vaderSentiment numpy

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 27.9/27.9 MB 76.5 MB/s eta 0:00:00
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.0/126.0 kB 14.7 MB/s eta 0:00:00
Installing collected packages: vaderSentiment, gensim
Successfully installed gensim-4.4.0 vaderSentiment-3.3.2


In [7]:
# Install necessary libraries
!pip install nltk gensim vaderSentiment numpy

import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from gensim.models import Word2Vec, FastText, KeyedVectors
import numpy as np

# Fetch the NLTK resources used below, one download call per resource,
# in the same order as before.
for nltk_resource in (
    'punkt',
    'sentiwordnet',
    'wordnet',
    'averaged_perceptron_tagger_eng',  # switched to the '_eng' name as suggested by an NLTK error
    'punkt_tab',
):
    nltk.download(nltk_resource)

# ------------------------
# Lexicon-based Functions
# ------------------------

# Single shared VADER analyzer, built once and reused for every call.
vader_analyzer = SentimentIntensityAnalyzer()

def vader_sentiment(text):
    """Return VADER's polarity scores for ``text``.

    The value is whatever ``vader_analyzer.polarity_scores`` produces; in this
    notebook's output that is a dict with 'neg', 'neu', 'pos' and 'compound'.
    """
    polarity = vader_analyzer.polarity_scores(text)
    return polarity

# SentiWordNet analyzer
def get_wordnet_pos(treebank_tag):
    """Translate a treebank-style POS tag into a WordNet POS constant.

    Tags starting with J, V, N or R map to ADJ, VERB, NOUN and ADV
    respectively; any other tag yields None.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wn_pos
    return None

def sentiwordnet_sentiment(text):
    """Average SentiWordNet polarity (positive minus negative) over ``text``.

    Each token is POS-tagged; tokens whose tag maps to a WordNet POS and that
    have at least one synset contribute the score of their first synset.
    Returns 0 when no token contributes.
    """
    polarities = []
    for token, treebank_tag in pos_tag(word_tokenize(text)):
        wn_pos = get_wordnet_pos(treebank_tag)
        if not wn_pos:
            continue
        candidates = wordnet.synsets(token, pos=wn_pos)
        if not candidates:
            continue
        # Only the first listed synset is consulted.
        senti = swn.senti_synset(candidates[0].name())
        polarities.append(senti.pos_score() - senti.neg_score())
    if not polarities:
        return 0
    return sum(polarities) / len(polarities)

# ------------------------
# Traditional Embedding Models
# ------------------------

# Sample sentences for training Word2Vec & FastText
sample_texts = [
    "I love machine learning",
    "This product is terrible",
    "The movie was fantastic",
    "I'm not happy with this service",
    "Absolutely amazing experience"
]

tokenized_texts = [word_tokenize(sent.lower()) for sent in sample_texts]

# gensim documents that multi-threaded training is not reproducible from run
# to run, so this toy corpus is trained with one worker and an explicit seed
# (1 is gensim's default). Full reproducibility across interpreter launches
# may additionally need PYTHONHASHSEED to be fixed.
# Train Word2Vec
word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=50, window=5, min_count=1, workers=1, seed=1)

# Train FastText
fasttext_model = FastText(sentences=tokenized_texts, vector_size=50, window=5, min_count=1, workers=1, seed=1)

# Load GloVe embeddings
# Download glove.6B.50d.txt from https://nlp.stanford.edu/projects/glove/
glove_file = 'glove.6B.50d.txt'

# Download GloVe file if not present
import os
if not os.path.exists(glove_file):
    print(f"Downloading {glove_file}...")
    !wget -P . https://nlp.stanford.edu/data/glove.6B.zip
    !unzip -o glove.6B.zip

glove_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

# Function to compute sentence embedding
def sentence_embedding(tokens, model):
    """Mean of the vectors of the tokens that ``model`` knows.

    ``model`` must support ``token in model``, ``model[token]`` and expose
    ``vector_size``. When no token is known, a zero vector of length
    ``model.vector_size`` is returned instead.
    """
    known_vectors = []
    for token in tokens:
        if token in model:
            known_vectors.append(model[token])
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# ------------------------
# Full Pipeline Function
# ------------------------
def analyze_text(text):
    """Run every analyzer on ``text`` and collect the results in one dict.

    Keys: 'text', 'vader', 'sentiwordnet', 'word2vec_embedding',
    'fasttext_embedding', 'glove_embedding'. The lexicon analyzers receive the
    raw text; the embedding lookups use the lower-cased tokens.
    """
    lowered_tokens = word_tokenize(text.lower())

    report = {'text': text}
    report['vader'] = vader_sentiment(text)
    report['sentiwordnet'] = sentiwordnet_sentiment(text)

    embedding_sources = (
        ('word2vec_embedding', word2vec_model.wv),
        ('fasttext_embedding', fasttext_model.wv),
        ('glove_embedding', glove_model),
    )
    for key, vectors in embedding_sources:
        report[key] = sentence_embedding(lowered_tokens, vectors)
    return report

# ------------------------
# Example Usage
# ------------------------
# Sample inputs; the first one ends with an emoji (restored here from a
# mis-encoded byte sequence) so the analyzers see the intended character.
texts = [
    "I love this product! It's amazing 😊",
    "The service was terrible and I hate it",
    "I'm not sure how I feel about this"
]

# Print each analyzer's result; embeddings are summarised by shape only.
for t in texts:
    output = analyze_text(t)
    print(f"Text: {output['text']}")
    print(f"VADER: {output['vader']}")
    print(f"SentiWordNet: {output['sentiwordnet']}")
    print(f"Word2Vec Embedding Shape: {output['word2vec_embedding'].shape}")
    print(f"FastText Embedding Shape: {output['fasttext_embedding'].shape}")
    print(f"GloVe Embedding Shape: {output['glove_embedding'].shape}\n")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Text: I love this product! It's amazing 😊
VADER: {'neg': 0.0, 'neu': 0.329, 'pos': 0.671, 'compound': 0.9359}
SentiWordNet: 0.25
Word2Vec Embedding Shape: (50,)
FastText Embedding Shape: (50,)
GloVe Embedding Shape: (50,)

Text: The service was terrible and I hate it
VADER: {'neg': 0.531, 'neu': 0.469, 'pos': 0.0, 'compound': -0.7783}
SentiWordNet: -0.3125
Word2Vec Embedding Shape: (50,)
FastText Embedding Shape: (50,)
GloVe Embedding Shape: (50,)

Text: I'm not sure how I feel about this
VADER: {'neg': 0.219, 'neu': 0.781, 'pos': 0.0, 'compound': -0.2411}
SentiWordNet: -0.3333333333333333
Word2Vec Embedding Shape: (50,)
FastText Embedding Shape: (50,)
GloVe Embedding Shape: (50,)



In [8]:
# Download dataset from T4SA.
# Credentials come from the environment (T4SA_USER / T4SA_PASSWORD) so they
# are never stored in the notebook.
# NOTE(review): the credentials previously committed in this cell should be
# treated as exposed.
import os
import subprocess

T4SA_BASE_URL = 'http://www.t4sa.it/dataset'
T4SA_FILES = ('t4sa_text_sentiment.tsv', 'raw_tweets_text.csv')

# Only fetch what is not already on disk; a plain wget on re-run would save
# duplicates as '<name>.1' while the loader keeps reading the first copy.
t4sa_missing = [name for name in T4SA_FILES if not os.path.exists(name)]
if t4sa_missing:
    t4sa_user = os.environ.get('T4SA_USER')
    t4sa_password = os.environ.get('T4SA_PASSWORD')
    if not t4sa_user or not t4sa_password:
        raise RuntimeError(
            "Set the T4SA_USER and T4SA_PASSWORD environment variables "
            "to download the T4SA dataset."
        )
    for t4sa_name in t4sa_missing:
        # Download to a temporary name and rename on success, so an
        # interrupted transfer is never mistaken for a complete file.
        t4sa_partial = t4sa_name + '.part'
        subprocess.run(
            [
                'wget',
                f'--user={t4sa_user}',
                f'--password={t4sa_password}',
                '-O', t4sa_partial,
                f'{T4SA_BASE_URL}/{t4sa_name}',
            ],
            check=True,  # fail loudly instead of continuing with missing data
        )
        os.replace(t4sa_partial, t4sa_name)

import pandas as pd

# Read both files, storing the join keys as strings
sent_df = pd.read_csv('t4sa_text_sentiment.tsv', sep='\t').assign(
    TWID=lambda frame: frame['TWID'].astype(str)
)
text_df = pd.read_csv('raw_tweets_text.csv').assign(
    id=lambda frame: frame['id'].astype(str)
)

# Inner-join on the tweet id, drop the duplicate key, add the numeric label
# (NEG=0, NEU=1, POS=2, taken from the largest of the three score columns),
# then give the columns their final names.
label_codes = {'NEG': 0, 'NEU': 1, 'POS': 2}
merged_df = (
    sent_df
    .merge(text_df, left_on='TWID', right_on='id', how='inner')
    .drop(columns=['id'])
    .assign(
        label=lambda frame: frame[['NEG', 'NEU', 'POS']].idxmax(axis=1).map(label_codes)
    )
    .rename(columns={'TWID': 'twitter_id', 'text': 'content'})
)

# Columns kept in the exported dataset, in output order
final_columns = ['twitter_id', 'label', 'content', 'NEG', 'NEU', 'POS']
final_df = merged_df[final_columns]

# Quick look at the result
print("Dataset size:", len(final_df))
print("\nFirst 20 rows (twitter_id, label, content):")
preview_columns = ['twitter_id', 'label', 'content']
print(final_df[preview_columns].head(20))

print("\nSentiment label distribution:")
label_counts = final_df['label'].value_counts().sort_index()
print(label_counts)

# Persist the merged dataset for the later cells
final_df.to_csv('t4sa_merged_text_sentiment.csv', index=False)
print("\nMerged dataset saved as 't4sa_merged_text_sentiment.csv'")


--2025-12-14 07:15:12--  http://www.t4sa.it/dataset/t4sa_text_sentiment.tsv
Resolving www.t4sa.it (www.t4sa.it)... 146.48.85.151
Connecting to www.t4sa.it (www.t4sa.it)|146.48.85.151|:80... connected.
HTTP request sent, awaiting response... 401 Authorization Required
Authentication selected: Basic realm="T4SA Dataset"
Reusing existing connection to www.t4sa.it:80.
HTTP request sent, awaiting response... 200 OK
Length: 77271921 (74M) [text/tab-separated-values]
Saving to: ‘t4sa_text_sentiment.tsv’


2025-12-14 07:15:20 (9.95 MB/s) - ‘t4sa_text_sentiment.tsv’ saved [77271921/77271921]

--2025-12-14 07:15:20--  http://www.t4sa.it/dataset/raw_tweets_text.csv
Resolving www.t4sa.it (www.t4sa.it)... 146.48.85.151
Connecting to www.t4sa.it (www.t4sa.it)|146.48.85.151|:80... connected.
HTTP request sent, awaiting response... 401 Authorization Required
Authentication selected: Basic realm="T4SA Dataset"
Reusing existing connection to www.t4sa.it:80.
HTTP request sent, awaiting response..

In [9]:
# Install necessary libraries
!pip install nltk gensim vaderSentiment numpy pandas tabulate

import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from gensim.models import Word2Vec, FastText, KeyedVectors
import numpy as np
import pandas as pd
from tabulate import tabulate
import os

# Download NLTK resources.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are included because the
# earlier cell of this notebook had to switch to them after an NLTK error;
# without them this cell only works when that earlier cell has already run.
# The older resource names are kept as well.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# ------------------------
# Lexicon-based Functions
# ------------------------
vader_analyzer = SentimentIntensityAnalyzer()

def vader_sentiment(text, threshold=0.05):
    """Classify ``text`` into a discrete label from VADER's compound score.

    Args:
        text: The raw text to score.
        threshold: Magnitude of the compound score at which the text stops
            being neutral. Defaults to 0.05.

    Returns:
        2 (POS) when compound >= threshold, 0 (NEG) when compound <=
        -threshold, otherwise 1 (NEU).
    """
    compound = vader_analyzer.polarity_scores(text)['compound']
    # Bounds are inclusive, matching the thresholds given in VADER's README;
    # a score of exactly +/-threshold is therefore not neutral.
    if compound >= threshold:
        return 2
    if compound <= -threshold:
        return 0
    return 1

def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant for a treebank-style tag, else None.

    The first letter decides: J -> ADJ, V -> VERB, N -> NOUN, R -> ADV.
    """
    prefix_pairs = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    return next(
        (wn_pos for prefix, wn_pos in prefix_pairs if treebank_tag.startswith(prefix)),
        None,
    )

def sentiwordnet_sentiment(text):
    """Label ``text`` as 0 (NEG), 1 (NEU) or 2 (POS) using SentiWordNet.

    The score is the mean of (positive - negative) over the first synset of
    every token that has a WordNet POS and at least one synset; it is 0 when
    no token qualifies. Scores above 0.05 are POS, below -0.05 NEG, else NEU.
    """
    total, scored = 0, 0
    for token, treebank_tag in pos_tag(word_tokenize(text)):
        wn_pos = get_wordnet_pos(treebank_tag)
        matches = wordnet.synsets(token, pos=wn_pos) if wn_pos else []
        if matches:
            senti = swn.senti_synset(matches[0].name())
            total += senti.pos_score() - senti.neg_score()
            scored += 1

    mean_polarity = total / scored if scored != 0 else 0

    # Map the averaged score to a discrete label
    if mean_polarity > 0.05:
        return 2
    if mean_polarity < -0.05:
        return 0
    return 1

# ------------------------
# Traditional Embedding Models
# ------------------------
# Load merged dataset
df = pd.read_csv('t4sa_merged_text_sentiment.csv')
# read_csv parses empty or NA-like text cells as NaN (a float), which would
# make `.lower()` below raise. Drop such rows on the frame itself so texts
# and labels stay aligned for the evaluation further down.
df = df.dropna(subset=['content']).reset_index(drop=True)
sample_texts = df['content'].tolist()
tokenized_texts = [word_tokenize(sent.lower()) for sent in sample_texts]

# Word2Vec & FastText, trained on every tweet in the merged dataset
word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=50, window=5, min_count=1, workers=4)
fasttext_model = FastText(sentences=tokenized_texts, vector_size=50, window=5, min_count=1, workers=4)

# GloVe embeddings
glove_file = 'glove.6B.50d.txt'
if not os.path.exists(glove_file):
    print(f"Downloading {glove_file}...")
    !wget -P . https://nlp.stanford.edu/data/glove.6B.zip
    !unzip -o glove.6B.zip

glove_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

def sentence_embedding(tokens, model):
    """Average the vectors of all tokens present in ``model``.

    Falls back to a zero vector of length ``model.vector_size`` when none of
    the tokens is present.
    """
    hits = [model[tok] for tok in tokens if tok in model]
    return np.mean(hits, axis=0) if hits else np.zeros(model.vector_size)

def embedding_sentiment(tokens, model):
    """Heuristic label from the sum of the sentence embedding's components.

    Returns 2 (POS) when the sum exceeds 0.05, 0 (NEG) when it is below
    -0.05, and 1 (NEU) otherwise. This is a simple evaluation heuristic, not
    a trained classifier.
    """
    component_sum = sentence_embedding(tokens, model).sum()
    # sum(vector) > 0 -> POS, < 0 -> NEG, else NEU (with a 0.05 dead zone)
    if component_sum > 0.05:
        return 2
    return 0 if component_sum < -0.05 else 1

# ------------------------
# Evaluation Table
# ------------------------
results = []
# Only the first 50 rows are evaluated, to keep the cell fast
eval_texts = df['content'].iloc[:50]
eval_labels = df['label'].iloc[:50]
for text, true_label in zip(eval_texts, eval_labels):
    tokens = word_tokenize(text.lower())
    lexicon_labels = [vader_sentiment(text), sentiwordnet_sentiment(text)]
    embedding_labels = [
        embedding_sentiment(tokens, vectors)
        for vectors in (word2vec_model.wv, fasttext_model.wv, glove_model)
    ]
    results.append([text, true_label] + lexicon_labels + embedding_labels)

# Render one row per text: gold label followed by each method's prediction
headers = ['Text', 'True', 'VADER', 'SentiWordNet', 'Word2Vec', 'FastText', 'GloVe']
print(tabulate(results, headers=headers, tablefmt='grid'))




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


+-----------------------------------------------------------------------------------------------------------------------------------------------------------+--------+---------+----------------+------------+------------+---------+
| Text                                                                                                                                                      |   True |   VADER |   SentiWordNet |   Word2Vec |   FastText |   GloVe |
| #Incredible #India #Atulya #Bharat - Land of Seekers #BeProud üôè üáÆüá≥  :|: Plz RT https://t.co/vpghReZWsa                                                   |      1 |       2 |              1 |          2 |          0 |       2 |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+--------+---------+----------------+------------+------------+---------+
| RT @AlwaysTrustKay: Are you near a Western union &amp; want to make up