In [1]:
import pandas as pd
from numpy import NaN
import spacy
from transformers import pipeline
import re
import nltk
from textblob import TextBlob
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
# Preprocessing function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


In [3]:
def preprocess_text(text):
    if isinstance(text, str):
        # remove punctuation and special characters
        text = re.sub(r'[^\w\s]', '', text)
        # convert to lowercase
        text = text.lower()
        # tokenize text
        tokens = nltk.word_tokenize(text)
        # remove stop words
        tokens = [token for token in tokens if token not in stop_words]
        # lemmatize text
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        # join tokens back into text
        text = ' '.join(tokens)
    return text


In [4]:
# Load the Excel file into a DataFrame
df = pd.read_csv(r"C:\Users\Lenovo\Desktop\dataset_52-person-from-2021-02-05_2023-06-12_21-34-17-266_with_sentiment.csv",encoding='ISO-8859-1')

# Remove rows with "na" values
df = df.dropna(subset=['full_text'])

# Fill missing values in 'full_text' column with an empty string
df['full_text'] = df['full_text'].fillna('')
# to lower text
df['full_text'] = df['full_text'].str.lower()
# Preprocess the 'full_text' column
df['clean_text'] = df['full_text'].apply(preprocess_text)
# Filter the DataFrame to keep rows where "created_at" is greater than or equal to 2021-01-01
df = df[df['created_at'] >= '2021-01-01']
import datetime
# Convert the "created_at" column to datetime format
df['created_at'] = pd.to_datetime(df['created_at'])

# Extract the date part from the datetime and convert it to the desired format
df['created_at'] = df['created_at'].dt.strftime("%Y-%m-%d")
##add importance_coefficient per tweets
df['importance_coefficient'] = df['retweet_count'] + 2 * df['favorite_count'] + 0.5 * df['reply_count']
# Find the minimum and maximum values of the importance coefficient
min_value = df['importance_coefficient'].min()
max_value = df['importance_coefficient'].max()

# Normalize the importance coefficient
df['importance_coefficient_normalized'] = (df['importance_coefficient'] - min_value) / (max_value - min_value)
# Sort the DataFrame based on the "created_at" column in ascending order
df = df.sort_values('created_at', ascending=True)

# Print the sorted DataFrame
df.head()


Unnamed: 0.1,Unnamed: 0,created_at,favorite_count,full_text,reply_count,retweet_count,user_id,clean_text,importance_coefficient,importance_coefficient_normalized,new_coins
176,25551,2021-03-01,11,weã¢ââre hosting our first live webinar of t...,1,6,CoinDeskData,weãââre hosting first live webinar year wednes...,28.5,6.6e-05,"(eth,btc)"
177,24772,2021-03-01,39,dreams do come true! ã¢â¡los sueã£â±os se hace...,3,7,aantonop,dream come true ãâlos sueãâos se hacen realida...,86.5,0.000199,(bitcoin)
178,23426,2021-03-01,68,presenting at @harvardlawbfi's third annual co...,6,16,aantonop,presenting harvardlawbfis third annual confere...,155.0,0.000357,(amp)
179,24776,2021-03-01,121,"free, free, free #bitcoin workshop! did we men...",4,26,aantonop,free free free bitcoin workshop mention itãââs...,270.0,0.000621,(bitcoin)
180,25554,2021-03-01,9,in a month marked by the outstanding performan...,1,2,CoinDeskData,month marked outstanding performance cardanoãâ...,20.5,4.7e-05,"(eth,btc,ada)"


## Model 1: Aspect based sentiment analysis (RoBERTa)

In [6]:
# access_token = "Your_Token_in_hugging_face"
# # Function to extract aspects and sentiments
# def extract_aspects_sentiments(text):
#     # Load the spaCy English model for aspect extraction
#     nlp = spacy.load("en_core_web_sm")

#     # Extract aspects from the text
#     aspects = []
#     doc = nlp(text)
#     for token in doc:
#         if token.pos_ in ["NOUN", "PROPN"]:
#             aspects.append(token.text)

#     # Load the sentiment analysis model with your Hugging Face API token
#     sentiment_model = pipeline(
#         "sentiment-analysis",
#         model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
#         tokenizer="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
#         use_auth_token=access_token
#     )

#     # Extract sentiment for the entire text
#     sentiment_result = sentiment_model(text)[0]
#     overall_sentiment = sentiment_result["label"]
#     overall_score = sentiment_result["score"]

#     # Extract sentiment for each aspect
#     aspect_sentiments = []
#     for aspect in aspects:
#         aspect_text = text.replace(aspect, "<aspect>")
#         aspect_sentiment_result = sentiment_model(aspect_text)[0]
#         aspect_sentiment = aspect_sentiment_result["label"]
#         aspect_score = aspect_sentiment_result["score"]
#         aspect_sentiments.append((aspect, aspect_sentiment, aspect_score))

#     return overall_sentiment, overall_score, aspect_sentiments

# # Truncate the text to a maximum sequence length of 512 tokens
# df['truncated_text'] = df['clean_text'].str[:512]

# # Apply the extraction function to each row
# df[['overall_sentiment', 'overall_score', 'aspect_sentiments']] = df['truncated_text'].apply(extract_aspects_sentiments).apply(pd.Series)
 


## Model 2: Vader Sentiment analysis

In [7]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
df_selected =df.copy()
df_selected['scores'] = df_selected['full_text'].apply(lambda Description: sid.polarity_scores(Description))
df_selected.head()

Unnamed: 0.1,Unnamed: 0,created_at,favorite_count,full_text,reply_count,retweet_count,user_id,clean_text,importance_coefficient,importance_coefficient_normalized,new_coins,truncated_text,scores
176,25551,2021-03-01,11,weã¢ââre hosting our first live webinar of t...,1,6,CoinDeskData,weãââre hosting first live webinar year wednes...,28.5,6.6e-05,"(eth,btc)",weãââre hosting first live webinar year wednes...,"{'neg': 0.0, 'neu': 0.881, 'pos': 0.119, 'comp..."
177,24772,2021-03-01,39,dreams do come true! ã¢â¡los sueã£â±os se hace...,3,7,aantonop,dream come true ãâlos sueãâos se hacen realida...,86.5,0.000199,(bitcoin),dream come true ãâlos sueãâos se hacen realida...,"{'neg': 0.0, 'neu': 0.791, 'pos': 0.209, 'comp..."
178,23426,2021-03-01,68,presenting at @harvardlawbfi's third annual co...,6,16,aantonop,presenting harvardlawbfis third annual confere...,155.0,0.000357,(amp),presenting harvardlawbfis third annual confere...,"{'neg': 0.0, 'neu': 0.846, 'pos': 0.154, 'comp..."
179,24776,2021-03-01,121,"free, free, free #bitcoin workshop! did we men...",4,26,aantonop,free free free bitcoin workshop mention itãââs...,270.0,0.000621,(bitcoin),free free free bitcoin workshop mention itãââs...,"{'neg': 0.0, 'neu': 0.708, 'pos': 0.292, 'comp..."
180,25554,2021-03-01,9,in a month marked by the outstanding performan...,1,2,CoinDeskData,month marked outstanding performance cardanoãâ...,20.5,4.7e-05,"(eth,btc,ada)",month marked outstanding performance cardanoãâ...,"{'neg': 0.0, 'neu': 0.753, 'pos': 0.247, 'comp..."


In [11]:
df_selected['compound'] = df_selected['scores'].apply(lambda score_dict: score_dict['compound'])
df_selected['sentiment_type']=''
df_selected.loc[df_selected.compound>0,'sentiment_type']='POSITIVE'
df_selected.loc[df_selected.compound==0,'sentiment_type']='NEUTRAL'
df_selected.loc[df_selected.compound<0,'sentiment_type']='NEGATIVE'

In [12]:
df_selected

Unnamed: 0.1,Unnamed: 0,created_at,favorite_count,full_text,reply_count,retweet_count,user_id,clean_text,importance_coefficient,importance_coefficient_normalized,new_coins,truncated_text,scores,compound,sentiment_type
176,25551,2021-03-01,11,weã¢ââre hosting our first live webinar of t...,1,6,CoinDeskData,weãââre hosting first live webinar year wednes...,28.5,0.000066,"(eth,btc)",weãââre hosting first live webinar year wednes...,"{'neg': 0.0, 'neu': 0.881, 'pos': 0.119, 'comp...",0.6124,POSITIVE
177,24772,2021-03-01,39,dreams do come true! ã¢â¡los sueã£â±os se hace...,3,7,aantonop,dream come true ãâlos sueãâos se hacen realida...,86.5,0.000199,(bitcoin),dream come true ãâlos sueãâos se hacen realida...,"{'neg': 0.0, 'neu': 0.791, 'pos': 0.209, 'comp...",0.7256,POSITIVE
178,23426,2021-03-01,68,presenting at @harvardlawbfi's third annual co...,6,16,aantonop,presenting harvardlawbfis third annual confere...,155.0,0.000357,(amp),presenting harvardlawbfis third annual confere...,"{'neg': 0.0, 'neu': 0.846, 'pos': 0.154, 'comp...",0.7840,POSITIVE
179,24776,2021-03-01,121,"free, free, free #bitcoin workshop! did we men...",4,26,aantonop,free free free bitcoin workshop mention itãââs...,270.0,0.000621,(bitcoin),free free free bitcoin workshop mention itãââs...,"{'neg': 0.0, 'neu': 0.708, 'pos': 0.292, 'comp...",0.9150,POSITIVE
180,25554,2021-03-01,9,in a month marked by the outstanding performan...,1,2,CoinDeskData,month marked outstanding performance cardanoãâ...,20.5,0.000047,"(eth,btc,ada)",month marked outstanding performance cardanoãâ...,"{'neg': 0.0, 'neu': 0.753, 'pos': 0.247, 'comp...",0.7845,POSITIVE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16417,1468,2023-06-12,173,ã°âââ¨the #litecoin halving is in 50 days ã°...,98,31,CryptoTony__,ãâââthe litecoin halving 50 day ãâââ httpstco4...,426.0,0.000980,(litecoin),ãâââthe litecoin halving 50 day ãâââ httpstco4...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,NEUTRAL
16418,883,2023-06-12,41,still not personally bidding any alts right no...,13,2,AltcoinSherpa,still personally bidding alt right dont know g...,90.5,0.000208,(btc),still personally bidding alt right dont know g...,"{'neg': 0.094, 'neu': 0.78, 'pos': 0.126, 'com...",0.4295,POSITIVE
16420,826,2023-06-12,193,latest: @sturdyfinance has just been exploited...,116,46,coingecko,latest sturdyfinance exploited 4426 eth httpst...,490.0,0.001127,(eth),latest sturdyfinance exploited 4426 eth httpst...,"{'neg': 0.25, 'neu': 0.75, 'pos': 0.0, 'compou...",-0.4588,NEGATIVE
16413,1753,2023-06-12,307,i guarantee you the conflicts of interests and...,8,53,JohnEDeaton1,guarantee conflict interest gross appearance i...,671.0,0.001544,"(ethereum,ripple)",guarantee conflict interest gross appearance i...,"{'neg': 0.171, 'neu': 0.737, 'pos': 0.092, 'co...",-0.5267,NEGATIVE
