In [1]:
from google.colab import files
import pandas as pd

# Upload from device
uploaded = files.upload()

# Extract filename
filename = list(uploaded.keys())[0]

# Load CSV
df = pd.read_csv(filename)

# Sample 500 rows
df_sample = df.sample(n=500, random_state=42).copy()

# Preview
df_sample.head()

Saving tweets-data.csv to tweets-data.csv


Unnamed: 0.1,Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Tweets,hashtag
2899,897,2023-06-25 11:06:23+00:00,2,,Le #DessinDePresse de Sanaga : ls sont morts c...,titan
594,594,2023-06-25 18:23:19+00:00,0,,#Russia #Wagner #RussiaCivilWar https://t.co/P...,wagner
2870,868,2023-06-25 11:32:00+00:00,1,,Exclusive content -https://t.co/oEiSIIB2Z1\n.\...,titan
52,52,2023-06-25 19:11:12+00:00,21,,Auch heute geht die politische Nachricht des T...,wagner
1391,390,2023-06-25 16:21:52+00:00,1,,@crazyclipsonly Same type that would take a ho...,titanic


In [2]:
!pip install nltk

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
import string

# Initialize VADER
sia = SentimentIntensityAnalyzer()



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [3]:
def get_vader_sentiment(text):
    scores = sia.polarity_scores(text)
    compound = scores['compound']
    if compound >= 0.05:
        sentiment = 'positive'
    elif compound <= -0.05:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'
    return sentiment, compound

In [5]:
print(df_sample.columns)

Index(['Unnamed: 0', 'Date Created', 'Number of Likes', 'Source of Tweet',
       'Tweets', 'hashtag'],
      dtype='object')


**Replacement**

In [14]:
print(df_sample.columns)

Index(['Unnamed: 0', 'Date Created', 'Number of Likes', 'Source of Tweet',
       'Tweets', 'hashtag', 'cleaned_text'],
      dtype='object')


In [16]:
df_sample["cleaned_text"] = df_sample["Tweets"].apply(clean_text)


In [17]:
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"@\w+", "", text)     # remove mentions
    text = re.sub(r"#\w+", "", text)     # remove hashtags
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Check your columns first
print(df_sample.columns)

# Replace 'text' with your actual column name
df_sample["cleaned_text"] = df_sample["Tweets"].apply(clean_text)


Index(['Unnamed: 0', 'Date Created', 'Number of Likes', 'Source of Tweet',
       'Tweets', 'hashtag', 'cleaned_text'],
      dtype='object')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# Apply VADER sentiment function
df_sample["vader_sentiment"], df_sample["vader_score"] = zip(*df_sample["cleaned_text"].apply(get_vader_sentiment))

# Preview
df_sample[["Tweets", "cleaned_text", "vader_sentiment", "vader_score"]].head()


Unnamed: 0,Tweets,cleaned_text,vader_sentiment,vader_score
2899,Le #DessinDePresse de Sanaga : ls sont morts c...,le de sanaga ls sont morts comme ils ont vécu ...,neutral,0.0
594,#Russia #Wagner #RussiaCivilWar https://t.co/P...,,neutral,0.0
2870,Exclusive content -https://t.co/oEiSIIB2Z1\n.\...,exclusive content,positive,0.128
52,Auch heute geht die politische Nachricht des T...,auch heute geht die politische nachricht des t...,negative,-0.5994
1391,@crazyclipsonly Same type that would take a ho...,type would take homemade playstationcontrolled...,neutral,0.0


In [20]:
!pip install transformers

from transformers import pipeline



In [21]:
sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


In [27]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Choose the same model used in your pipeline
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Create pipeline with truncation enabled
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, truncation=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Device set to use cpu


In [28]:
def get_transformer_sentiment(text):
    inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
    outputs = model(**inputs)
    probs = outputs.logits.softmax(dim=1)
    score, predicted_class = probs.max(dim=1)
    label = model.config.id2label[predicted_class.item()]
    return label, score.item()

We apply safely over our dataframe

In [29]:
# Apply with progress bar for large datasets
from tqdm import tqdm
tqdm.pandas()

df_sample["transformer_sentiment"], df_sample["transformer_score"] = zip(*df_sample["cleaned_text"].progress_apply(get_transformer_sentiment))

100%|██████████| 500/500 [01:07<00:00,  7.40it/s]


In [30]:
# Select useful columns
final_df = df_sample[["cleaned_text", "transformer_sentiment", "transformer_score", "vader_sentiment", "vader_score"]]

# Preview
final_df.head()

Unnamed: 0,cleaned_text,transformer_sentiment,transformer_score,vader_sentiment,vader_score
2899,le de sanaga ls sont morts comme ils ont vécu ...,NEGATIVE,0.79976,neutral,0.0
594,,POSITIVE,0.748121,neutral,0.0
2870,exclusive content,POSITIVE,0.998467,positive,0.128
52,auch heute geht die politische nachricht des t...,NEGATIVE,0.977131,negative,-0.5994
1391,type would take homemade playstationcontrolled...,NEGATIVE,0.992334,neutral,0.0
