In [1]:
pip install kaggle

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

# Disable the symlink warning
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

# Ensure the Kaggle API credentials are set up
os.environ['KAGGLE_CONFIG_DIR'] = os.path.expanduser('~/.kaggle')

# Download a sample dataset
!kaggle datasets download -d kazanova/sentiment140

# Unzip the dataset -- but only when needed, so re-running the cell does not
# re-extract the whole archive every time.
import zipfile

ARCHIVE_PATH = "sentiment140.zip"
CSV_PATH = "training.1600000.processed.noemoticon.csv"

# Extract when the CSV is missing, or when the archive is newer than the CSV
# (i.e. a fresh download happened after the last extraction). Extraction
# writes the CSV with a current timestamp, so a just-extracted CSV is newer
# than the archive and the next run skips this step.
needs_extract = not os.path.exists(CSV_PATH) or (
    os.path.exists(ARCHIVE_PATH)
    and os.path.getmtime(ARCHIVE_PATH) > os.path.getmtime(CSV_PATH)
)

if needs_extract:
    # Raises FileNotFoundError if the download above did not produce the archive.
    with zipfile.ZipFile(ARCHIVE_PATH, "r") as zip_ref:
        zip_ref.extractall(".")

import pandas as pd
from transformers import pipeline

# Numeric sentiment codes in the CSV and the labels they are replaced with
sentiment_map = {0: 'NEGATIVE', 4: 'POSITIVE'}

# Read the headerless CSV, keeping only column 0 (sentiment code) and
# column 5 (tweet text), then swap the codes for readable labels.
# Codes that are not keys of sentiment_map become NaN.
tweets_df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    header=None,
    usecols=[0, 5],
    names=['sentiment', 'text'],
    encoding='latin1',
).assign(sentiment=lambda frame: frame['sentiment'].map(sentiment_map))

# Independent copy of the first 200 rows, in file order.
# NOTE(review): this is not a random sample; if the file is grouped by label
# the subset may contain a single class -- confirm before drawing conclusions.
sample_tweets_df = tweets_df.iloc[:200].copy()

# Build the sentiment-analysis pipeline with an explicitly named model and
# revision instead of relying on the library's defaults.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

def analyze_sentiment(texts):
    """Run ``texts`` through the module-level ``sentiment_pipeline``.

    Args:
        texts: Input forwarded unchanged to ``sentiment_pipeline``; this
            notebook passes a list of tweet strings.

    Returns:
        Whatever ``sentiment_pipeline`` returns for ``texts``. The calling
        code in this notebook reads a ``'label'`` and a ``'score'`` entry
        from each returned item.
    """
    return sentiment_pipeline(texts)

# Classify every tweet in the 200-row sample
tweet_texts = sample_tweets_df['text'].to_list()
sentiments = analyze_sentiment(tweet_texts)

# Attach the predicted label and its score as two new columns
# (one prediction per row, in the same order as the texts were passed in)
sample_tweets_df = sample_tweets_df.assign(
    bert_sentiment=[prediction['label'] for prediction in sentiments],
    bert_sentiment_score=[prediction['score'] for prediction in sentiments],
)

# Persist the annotated sample, without the DataFrame index
sample_tweets_df.to_csv('tweets_with_bert_sentiments.csv', index=False)


Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)
