In [1]:
import pandas as pd
from os import listdir
from os.path import isfile, join
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')


# Read in data

In [2]:
mypath = 'tweets_first_run'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
len(onlyfiles)

1511

In [3]:
mypath = 'tweets_second_run'
onlyfiles2 = [f for f in listdir(mypath) if isfile(join(mypath, f))]
len(onlyfiles2)

1819

In [13]:
tweets = pd.DataFrame()

for f in tqdm(onlyfiles):
    try:
        tweet = pd.read_csv('tweets_first_run/{}'.format(f))
        tweets = tweets.append(tweet)
    except:
        print(f)

for f in tqdm(onlyfiles2):
    try:
        tweet = pd.read_csv('tweets_second_run/{}'.format(f))
        tweets = tweets.append(tweet)
    except:
        print(f)

100%|█████████████████████████████████████████████████████| 1511/1511 [02:19<00:00, 10.84it/s]
 14%|███████▊                                              | 262/1819 [00:50<03:36,  7.19it/s]

.DS_Store


100%|█████████████████████████████████████████████████████| 1819/1819 [09:48<00:00,  3.09it/s]


In [14]:
tweets.head()

Unnamed: 0.1,Username,User handle,Tweet,Date of posting,Text,Retweet count,Like count,Unnamed: 0
0,Getgems TON NFT Marketplace,getgemsdotio,https://twitter.com/getgemsdotio/status/163025...,2023-02-27 17:19:55+00:00,@Tleubayev @LostDogsCo Who? Who? Who? 👀,0,2,
1,Getgems TON NFT Marketplace,getgemsdotio,https://twitter.com/getgemsdotio/status/163022...,2023-02-27 15:21:22+00:00,More details about the new collection here⬇️\n...,0,1,
2,Getgems TON NFT Marketplace,getgemsdotio,https://twitter.com/getgemsdotio/status/163022...,2023-02-27 15:21:20+00:00,Remember we asked you to guess who's in the pi...,3,7,
3,Getgems TON NFT Marketplace,getgemsdotio,https://twitter.com/getgemsdotio/status/163021...,2023-02-27 14:35:23+00:00,🗞 https://t.co/frQpwixZ5s @ston_fi had a prod...,0,2,
4,Getgems TON NFT Marketplace,getgemsdotio,https://twitter.com/getgemsdotio/status/163021...,2023-02-27 14:35:23+00:00,🗞 Fanzee @fanzeelabs token got listed on @Coin...,0,1,


In [20]:
select_columns = ['Username', 'User handle', 'Tweet', 'Date of posting',
       'Text', 'Retweet count', 'Like count']

In [21]:
tweets = tweets[select_columns]

In [22]:
tweets = tweets.drop_duplicates()
tweets.head()

Unnamed: 0,Username,User handle,Tweet,Date of posting,Text,Retweet count,Like count
0,Getgems TON NFT Marketplace,getgemsdotio,https://twitter.com/getgemsdotio/status/163025...,2023-02-27 17:19:55+00:00,@Tleubayev @LostDogsCo Who? Who? Who? 👀,0,2
1,Getgems TON NFT Marketplace,getgemsdotio,https://twitter.com/getgemsdotio/status/163022...,2023-02-27 15:21:22+00:00,More details about the new collection here⬇️\n...,0,1
2,Getgems TON NFT Marketplace,getgemsdotio,https://twitter.com/getgemsdotio/status/163022...,2023-02-27 15:21:20+00:00,Remember we asked you to guess who's in the pi...,3,7
3,Getgems TON NFT Marketplace,getgemsdotio,https://twitter.com/getgemsdotio/status/163021...,2023-02-27 14:35:23+00:00,🗞 https://t.co/frQpwixZ5s @ston_fi had a prod...,0,2
4,Getgems TON NFT Marketplace,getgemsdotio,https://twitter.com/getgemsdotio/status/163021...,2023-02-27 14:35:23+00:00,🗞 Fanzee @fanzeelabs token got listed on @Coin...,0,1


In [17]:
tweets.shape

(2280945, 7)

In [23]:
tweets.to_csv('data/all_tweets.csv', index = 0)

# Text preprocessing

In [4]:
import re
import spacy
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/luobingqiao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
tweets = pd.read_csv('data/all_tweets.csv')
tweets = tweets.head(100)
tweets.shape

(100, 7)

In [6]:
# clean tweets
def clean_tweet(tweet):
    # Convert to string if not a string
    tweet = str(tweet)
    
    # Remove URLs
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet, flags=re.MULTILINE)

    # Remove user mentions
    tweet = re.sub(r"@\w+", "", tweet)

    # Remove special characters
    tweet = re.sub(r"\W", " ", tweet)

    # Remove extra spaces
    tweet = re.sub(r"\s+", " ", tweet)

    return tweet.strip()


In [7]:
tweets['Text']

0               @Tleubayev @LostDogsCo Who? Who? Who? 👀
1     More details about the new collection here⬇️\n...
2     Remember we asked you to guess who's in the pi...
3     🗞 https://t.co/frQpwixZ5s  @ston_fi had a prod...
4     🗞 Fanzee @fanzeelabs token got listed on @Coin...
                            ...                        
95                          @0xBrigandine @loomdart lol
96              @DanishCryptoDK @MORBS15 this is sick!🤩
97                @Tleubayev you can't be too careful 😄
98    gm frens! now that Jan activity with eggs is f...
99           @Tleubayev 🫡🥰 thanks for the support, fam!
Name: Text, Length: 100, dtype: object

In [9]:
tweets['clean_text'] = tweets['Text'].apply(clean_tweet)
tweets['clean_text']

0                                           Who Who Who
1            More details about the new collection here
2     Remember we asked you to guess who s in the pi...
3     had a product update The profitability of liqu...
4     Fanzee token got listed on After joining the p...
                            ...                        
95                                                  lol
96                                         this is sick
97                             you can t be too careful
98    gm frens now that Jan activity with eggs is fi...
99                           thanks for the support fam
Name: clean_text, Length: 100, dtype: object

In [10]:
# preprocess tweets
def preprocess_tweet(tweet):

    # Tokenize the tweet
    tokenizer = TweetTokenizer(preserve_case=False)
    tokens = tokenizer.tokenize(tweet)

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    lemmatized_tokens = []
    for token in tokens:
        doc = nlp(token)
        lemmatized_tokens.append([t.lemma_ for t in doc][0])

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in lemmatized_tokens]

    return stemmed_tokens


In [11]:
tweets['tokenize_text'] = tweets['clean_text'].apply(preprocess_tweet)
tweets['tokenize_text']

0                                                    []
1                                [detail, new, collect]
2     [rememb, ask, guess, pictur, yep, adam, eve, d...
3     [product, updat, profit, liquid, pool, import,...
4     [fanze, token, get, list, join, project, team,...
                            ...                        
95                                                [lol]
96                                               [sick]
97                                               [care]
98    [gm, fren, jan, activ, egg, finish, suggest, i...
99                                [thank, support, fam]
Name: tokenize_text, Length: 100, dtype: object

# Sentiment analysis

In [12]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from nltk.sentiment import SentimentIntensityAnalyzer

import nltk
nltk.download('vader_lexicon')

# VADER
def analyze_sentiment_vader(tweet):
    sia = SentimentIntensityAnalyzer()
    sentiment_score = sia.polarity_scores(' '.join(tweet))
    return sentiment_score

There was a problem when trying to write in your cache folder (/Users/luobingqiao/.cache/huggingface/hub). You should set the environment variable TRANSFORMERS_CACHE to a writable directory.
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/luobingqiao/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [13]:
tweets['vader'] = tweets['tokenize_text'].apply(analyze_sentiment_vader)
tweets['vader']

0     {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound...
1     {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...
2     {'neg': 0.087, 'neu': 0.841, 'pos': 0.071, 'co...
3     {'neg': 0.0, 'neu': 0.734, 'pos': 0.266, 'comp...
4     {'neg': 0.0, 'neu': 0.657, 'pos': 0.343, 'comp...
                            ...                        
95    {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound...
96    {'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound...
97    {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound...
98    {'neg': 0.0, 'neu': 0.679, 'pos': 0.321, 'comp...
99    {'neg': 0.0, 'neu': 0.161, 'pos': 0.839, 'comp...
Name: vader, Length: 100, dtype: object

In [45]:
import os
import tempfile
from transformers import BertTokenizer, BertForSequenceClassification

local_model_path = "./bert_sst2_model"
tokenizer = BertTokenizer.from_pretrained(local_model_path)
model = BertForSequenceClassification.from_pretrained(local_model_path)

In [50]:
def predict_sentiment(tweet):
    inputs = tokenizer(tweet, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    logits = outputs.logits
    sentiment = torch.argmax(logits, dim=1).item()

    return "positive" if sentiment == 1 else "negative"

tweets['bert'] = tweets['clean_text'].apply(predict_sentiment)
tweets['bert']


0     negative
1     positive
2     positive
3     negative
4     negative
        ...   
95    negative
96    negative
97    positive
98    negative
99    positive
Name: bert, Length: 100, dtype: object

In [61]:
# SentiWSP
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

local_model_path = "./senti_WSP"

def analyze_sentiment_sentiwsp(tweet):
    tokenizer = BertTokenizer.from_pretrained(local_model_path)
    model = BertForSequenceClassification.from_pretrained(local_model_path)

    inputs = tokenizer(tweet, return_tensors="pt")
    output = model(**inputs)
    logits = output.logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    sentiment_class = torch.argmax(probabilities).item()

    sentiment_mapping = {0: "negative", 1: "neutral", 2: "positive"}
    sentiment = sentiment_mapping[sentiment_class]

    return sentiment


This function, analyze_sentiment_sentiwsp, takes a preprocessed tweet as input, tokenizes it using the SentiWSP tokenizer, and performs sentiment analysis using the pre-trained SentiWSP model from Hugging Face. The function returns the sentiment class (negative, neutral, or positive) based on the highest probability.

In the example usage section, the function is applied to a sample preprocessed tweet, and the sentiment result from SentiWSP is printed.

In [64]:
tweets['sentiwsp'] = tweets['clean_text'].apply(analyze_sentiment_sentiwsp)
tweets['sentiwsp']

0      neutral
1      neutral
2     negative
3     negative
4     negative
        ...   
95     neutral
96    negative
97    negative
98     neutral
99     neutral
Name: sentiwsp, Length: 100, dtype: object