# Import and load CSV

In [1]:
# Mount Google Drive inside the Colab VM at /content/drive (Colab-only API).
# NOTE(review): the visible cells read and write files by relative path, not under
# /content/drive — confirm the mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import warnings

import pandas as pd
from tqdm.auto import tqdm
from transformers import pipeline

In [3]:
# Load the filtered tweet export; the path is relative to the current working directory.
# NOTE(review): the preview shows `Unnamed: 0.1` / `Unnamed: 0` columns, which look like
# index columns written by earlier to_csv calls — confirm whether they should be dropped.
tweet_df = pd.read_csv("filtered_tweet_data.csv")
tweet_df.head()  # bare last expression so the notebook renders the first five rows

Unnamed: 0.1,Unnamed: 0,post_date,body,like_num,ticker_symbol
0,0,2015-01-01,Jeff Bezos lost $7.4 billion in Amazon's worst...,57,AMZN
1,1,2015-01-01,"Earlier this month, a mysterious glitch caused...",17,AAPL
2,2,2015-01-01,Jeff Bezos lost $7.4 billion in Amazon's worst...,57,AMZN
3,3,2015-01-01,"New Post - ""Apple Stock Pullback: Price Target...",7,AAPL
4,4,2015-01-01,2015 technology forecasts: Wearable technology...,11,AAPL


# Classifier

In [4]:
# Build a Hugging Face sentiment-analysis pipeline backed by the ProsusAI/finbert checkpoint.
# The first run downloads the model and tokenizer files; no device is passed, so the
# pipeline picks one itself (this run's log reports cuda:0).
classifier = pipeline("sentiment-analysis", model="ProsusAI/finbert")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


In [5]:
def get_finbert_sentiment(text):
    """Classify one text with the module-level ``classifier`` and relabel the result.

    Args:
        text: Input passed straight to ``classifier`` (with truncation and
            padding enabled).

    Returns:
        pandas.Series: two elements, ``[sentiment, confidence]``. ``sentiment``
        is ``"bullish"`` for a positive label, ``"bearish"`` for a negative
        label and ``"neutral"`` for anything else (label match is
        case-insensitive); ``confidence`` is the score of the first prediction.
    """
    top_prediction = classifier(text, truncation=True, padding=True)[0]
    raw_label, score = top_prediction["label"], top_prediction["score"]

    # Lookup table instead of an if/elif chain; unknown labels fall back to neutral.
    market_view = {"positive": "bullish", "negative": "bearish"}.get(
        raw_label.lower(), "neutral"
    )
    return pd.Series([market_view, score])

In [6]:
# Restrict to rows tagged AAPL, then keep the first 50 of them as a small sample.
apple_df = tweet_df.loc[tweet_df["ticker_symbol"].eq("AAPL")]
apple_sample_df = apple_df.iloc[:50]
# NOTE(review): no visible later cell reads apple_sample_df — the scoring cell runs on
# all of tweet_df; confirm this sample is still needed.
apple_sample_df.head(5)

Unnamed: 0.1,Unnamed: 0,post_date,body,like_num,ticker_symbol
1,1,2015-01-01,"Earlier this month, a mysterious glitch caused...",17,AAPL
3,3,2015-01-01,"New Post - ""Apple Stock Pullback: Price Target...",7,AAPL
4,4,2015-01-01,2015 technology forecasts: Wearable technology...,11,AAPL
9,9,2015-01-01,"We're shocked, shocked! that Samsung might try...",9,AAPL
10,10,2015-01-01,"New hed, same story: The Apple press goes off ...",8,AAPL


In [7]:
tqdm.pandas()  # registers Series.progress_apply; tqdm comes from tqdm.auto in the import cell

SENTIMENT_COLUMNS = ["sentiment_category", "confidence_score"]


def safe_get_sentiment(text):
    """Score one tweet body, falling back to ``("neutral", 0)`` instead of raising.

    Args:
        text: Tweet body. Non-string values (e.g. NaN for a missing body) are
            not sent to the model.

    Returns:
        tuple: ``(sentiment_category, confidence_score)``. The fallback
        ``("neutral", 0)`` is returned for non-string input and whenever
        ``get_finbert_sentiment`` raises.
    """
    if not isinstance(text, str):
        return "neutral", 0
    try:
        sentiment_category, confidence_score = get_finbert_sentiment(text)
    except Exception as exc:
        # Best-effort by design: one bad row must not abort a long run, but the
        # failure is surfaced rather than silently recorded as "neutral".
        warnings.warn(
            f"FinBERT scoring failed ({type(exc).__name__}: {exc}); using neutral/0"
        )
        return "neutral", 0
    return sentiment_category, confidence_score


# Each row yields a plain (label, score) tuple; the two columns are built once from
# the collected tuples instead of wrapping every row in its own pd.Series.
sentiment_pairs = tweet_df["body"].progress_apply(safe_get_sentiment)
tweet_df[SENTIMENT_COLUMNS] = pd.DataFrame(
    sentiment_pairs.tolist(), index=tweet_df.index, columns=SENTIMENT_COLUMNS
)

# Report how many rows carry the fallback confidence of 0 so failures are visible.
fallback_count = int((tweet_df["confidence_score"] == 0).sum())
if fallback_count:
    print(f"{fallback_count} of {len(tweet_df)} rows used the neutral/0 fallback")

tweet_df.to_csv("processed_tweets.csv", index=False)
tweet_df.head()  # last expression so the preview is actually rendered


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 79%|███████▊  | 214456/272531 [32:48<09:09, 105.67it/s][A
 79%|███████▊  | 214467/272531 [32:48<09:09, 105.69it/s][A
 79%|███████▊  | 214479/272531 [32:48<09:00, 107.37it/s][A
 79%|███████▊  | 214490/272531 [32:48<09:03, 106.85it/s][A
 79%|███████▊  | 214502/272531 [32:48<08:54, 108.48it/s][A
 79%|███████▊  | 214513/272531 [32:48<08:57, 107.93it/s][A
 79%|███████▊  | 214525/272531 [32:48<08:49, 109.59it/s][A
 79%|███████▊  | 214537/272531 [32:48<08:47, 110.02it/s][A
 79%|███████▊  | 214549/272531 [32:48<08:45, 110.26it/s][A
 79%|███████▊  | 214561/272531 [32:49<08:46, 110.18it/s][A
 79%|███████▊  | 214573/272531 [32:49<08:41, 111.12it/s][A
 79%|███████▊  | 214585/272531 [32:49<08:36, 112.20it/s][A
 79%|███████▊  | 214597/272531 [32:49<08:41, 111.02it/s][A
 79%|███████▊  | 214609/272531 [32:49<08:37, 111.85it/s][A
 79%|███████▉  | 214621/272531 [32:49<08:39, 111.37it/s][A
 79%|███████▉  | 214633/272531 [32:

In [8]:
# Send processed_tweets.csv (written by the scoring cell) from the Colab VM to the
# user's browser as a download; google.colab.files is a Colab-only helper.
from google.colab import files
files.download("processed_tweets.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>