In [2]:
!pip install tweepy

Collecting tweepy
  Downloading tweepy-4.13.0-py3-none-any.whl (102 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.8/102.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tweepy
Successfully installed tweepy-4.13.0
[0m

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('dark_background')

import re
import tweepy
from tqdm.notebook import tqdm

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from scipy.special import softmax

import warnings
warnings.filterwarnings('ignore')

---
### HELPER FUNCS
---

In [4]:
# get Twitter API keys from .txt
def get_keys(path):
    with open(path, 'r') as api_keys:
        data = api_keys.read().replace('\n', ' ')
        keys = data.split(' ')
        keys.pop()
    return keys

# Shaping the dataflow
def get_topic(topic, api_key_list):
    authenticate = tweepy.OAuthHandler(api_key_list[0], api_key_list[1])
    authenticate.set_access_token(api_key_list[2], api_key_list[3])
    api = tweepy.API(authenticate, wait_on_rate_limit=True)

    raw_data = api.user_timeline(screen_name=topic,
                                 count=100, lang='en',
                                 tweet_mode='extended')
    return raw_data

# tendency: if retweets > likes, people are retweeting to counter the person diffusing misinformation 
def clean_fake_news(df):
    df['boolean_indexed'] = df['retweets'] < df['likes']
    df = df[df['boolean_indexed'] == True]
    df = df.drop('boolean_indexed', axis=1)
    return df

# ___________________________________________________________________________________________

# cleaning the text
def clean_txt(texte):
    texte = str(texte)
    texte = re.sub('@[A-Za-z0-9]+', '', texte)      # remove @mentions
    texte = re.sub('#', '', texte)                  # removes #hashtags
    texte = re.sub('RT[\s]+', '', texte)            # removes retweets
    texte = re.sub('rt :', '', texte)               # removes retweets
    texte = re.sub('https?:\/\/\S+', '', texte)     # removs hyperlinks https
    texte = re.sub('\n', '', texte)
    return texte

# from dict to dataframe
def to_dataframe(raw_data):
    df = pd.DataFrame([tweet.full_text for tweet in raw_data], columns=['tweets'])
    df['tweets'] = df['tweets'].str.lower()
    df['tweets'] = df['tweets'].apply(clean_txt)
    df['id'] = np.array([tweet.id for tweet in raw_data])
    df['len'] = np.array([len(tweet.full_text) for tweet in raw_data])
    df['date'] = np.array([tweet.created_at for tweet in raw_data])
    df['source'] = np.array([tweet.source for tweet in raw_data])
    df['likes'] = np.array([tweet.favorite_count for tweet in raw_data])
    df['retweets'] = np.array([tweet.retweet_count for tweet in raw_data])
    return df

# ___________________________________________________________________________________________

# from raw sentiment analysis to readable 
def roberta_polarity_scores(text):
    encoded_text = tokenizer(text, return_tensors='pt')
    output = model(**encoded_text)
    scoring = output[0][0].detach().numpy()
    scoring = softmax(scoring)
    scoring_dict = {
        'rob_neg': scoring[0],
        'rob_neu': scoring[1],
        'rob_pos': scoring[2],
    }
    return scoring_dict


---
### CODE INIT
---

In [5]:
keys = get_keys('/kaggle/input/apikeys/api_keys.txt')
df_raw = get_topic('Tesla', keys)
tdf = to_dataframe(df_raw)
tdf = tdf.sort_values(by='likes', ascending=False)
tdf = tdf.reset_index()
tdf = tdf.drop('index', axis=1)
tdf

Unnamed: 0,tweets,id,len,date,source,likes,retweets
0,"our next gigafactory will be in mexico, manufa...",1631087437723172868,98,2023-03-02 00:22:03+00:00,Twitter Web App,77643,11751
1,ultra red,1636446199296270336,33,2023-03-16 19:15:52+00:00,Twitter for iPhone,60457,4458
2,cybertruck crash test,1642162058258001920,45,2023-04-01 13:48:39+00:00,Twitter for iPhone,56903,5919
3,we produced our 4 millionth vehicle at giga te...,1631075083018989568,81,2023-03-01 23:32:58+00:00,Twitter for iPhone,50522,4814
4,megapack factory,1621627317301776387,40,2023-02-03 21:50:55+00:00,Twitter Web App,45801,5144
...,...,...,...,...,...,...,...
94,tesla powerwall enables your home to operate ...,1638419628220362752,113,2023-03-22 05:57:34+00:00,Twitter for iPhone,0,9663
95,rt _europe: 🛝 at giga berlin 😎,1637040338543017991,60,2023-03-18 10:36:46+00:00,Twitter for iPhone,0,2388
96,rt _europe: regenerative braking turns our mot...,1635377476087197701,140,2023-03-13 20:29:08+00:00,Twitter for iPhone,0,2038
97,rt _europe: 4k model y built at giga berlin th...,1630061580929966080,114,2023-02-27 04:25:40+00:00,Twitter for iPhone,0,1566


---
# Training models
> ## Roberta


In [6]:
preMODEL = f'cardiffnlp/twitter-roberta-base-sentiment'
tokenizer = AutoTokenizer.from_pretrained(preMODEL)
model = AutoModelForSequenceClassification.from_pretrained(preMODEL)

Downloading (…)lve/main/config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [7]:
example = tdf.loc[0, 'tweets']
example

'our next gigafactory will be in mexico, manufacturing our next-gen vehicle '

In [8]:
encoded = tokenizer(example, return_tensors='pt')
output = model(**encoded)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

print(example)
print(f"Roberta scores:\n negative => {scores[0]}\n neutral => {scores[1]}\n positive => {scores[2]}")

our next gigafactory will be in mexico, manufacturing our next-gen vehicle 
Roberta scores:
 negative => 0.003147152718156576
 neutral => 0.6670892834663391
 positive => 0.3297637104988098


In [9]:
success_example = roberta_polarity_scores(tdf.loc[10, 'tweets'])
print(tdf.loc[33, 'tweets'])
success_example

over the last 5 years, we've unlocked 30% faster charge times through a combination of hardware, software &amp; customer education 


{'rob_neg': 0.0031614022, 'rob_neu': 0.22306776, 'rob_pos': 0.7737708}

In [10]:
resp = {}
for i, row in tqdm(tdf.iterrows(), total=len(tdf)):
    try: 
        text = row['tweets']
        ids = row['id']
        res_rob = roberta_polarity_scores(text)
        resp[ids] = res_rob
    except RuntimeError:
        print(f"Comment with id {ids} can't be treated: sentence too long to handle")

  0%|          | 0/99 [00:00<?, ?it/s]

In [11]:
resp_df = pd.DataFrame(resp)
resp_df = resp_df.T
resp_df = resp_df.reset_index()
resp_df = resp_df.rename(columns={'index': 'id'})
resp_df

Unnamed: 0,id,rob_neg,rob_neu,rob_pos
0,1631087437723172868,0.003147,0.667089,0.329764
1,1636446199296270336,0.088210,0.700722,0.211068
2,1642162058258001920,0.197426,0.739896,0.062678
3,1631075083018989568,0.001966,0.342386,0.655649
4,1621627317301776387,0.159332,0.727013,0.113656
...,...,...,...,...
94,1638419628220362752,0.024268,0.444094,0.531637
95,1637040338543017991,0.007926,0.724404,0.267670
96,1635377476087197701,0.014097,0.695834,0.290068
97,1630061580929966080,0.000773,0.022188,0.977039


In [12]:
fdf = tdf.merge(resp_df, how='left')
fdf

Unnamed: 0,tweets,id,len,date,source,likes,retweets,rob_neg,rob_neu,rob_pos
0,"our next gigafactory will be in mexico, manufa...",1631087437723172868,98,2023-03-02 00:22:03+00:00,Twitter Web App,77643,11751,0.003147,0.667089,0.329764
1,ultra red,1636446199296270336,33,2023-03-16 19:15:52+00:00,Twitter for iPhone,60457,4458,0.088210,0.700722,0.211068
2,cybertruck crash test,1642162058258001920,45,2023-04-01 13:48:39+00:00,Twitter for iPhone,56903,5919,0.197426,0.739896,0.062678
3,we produced our 4 millionth vehicle at giga te...,1631075083018989568,81,2023-03-01 23:32:58+00:00,Twitter for iPhone,50522,4814,0.001966,0.342386,0.655649
4,megapack factory,1621627317301776387,40,2023-02-03 21:50:55+00:00,Twitter Web App,45801,5144,0.159332,0.727013,0.113656
...,...,...,...,...,...,...,...,...,...,...
94,tesla powerwall enables your home to operate ...,1638419628220362752,113,2023-03-22 05:57:34+00:00,Twitter for iPhone,0,9663,0.024268,0.444094,0.531637
95,rt _europe: 🛝 at giga berlin 😎,1637040338543017991,60,2023-03-18 10:36:46+00:00,Twitter for iPhone,0,2388,0.007926,0.724404,0.267670
96,rt _europe: regenerative braking turns our mot...,1635377476087197701,140,2023-03-13 20:29:08+00:00,Twitter for iPhone,0,2038,0.014097,0.695834,0.290068
97,rt _europe: 4k model y built at giga berlin th...,1630061580929966080,114,2023-02-27 04:25:40+00:00,Twitter for iPhone,0,1566,0.000773,0.022188,0.977039


# Training models
> ## Modern transformer

In [13]:
modelli = "nlptown/bert-base-multilingual-uncased-sentiment"

sent_pipeline = pipeline("sentiment-analysis", model=modelli)
sent_pipeline = pipeline("sentiment-analysis")
example_phrase = fdf.loc[0, 'tweets']

print(example_phrase)
res = sent_pipeline(example_phrase)


Downloading (…)lve/main/config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

our next gigafactory will be in mexico, manufacturing our next-gen vehicle 


In [14]:
for i in range(len(fdf['tweets'])):
    fdf.loc[i, 'trans_ccl'] = sent_pipeline(fdf.loc[i, 'tweets'])[0]['label']
    fdf.loc[i, 'trans_score'] = sent_pipeline(fdf.loc[i, 'tweets'])[0]['score'] 
fdf    

Unnamed: 0,tweets,id,len,date,source,likes,retweets,rob_neg,rob_neu,rob_pos,trans_ccl,trans_score
0,"our next gigafactory will be in mexico, manufa...",1631087437723172868,98,2023-03-02 00:22:03+00:00,Twitter Web App,77643,11751,0.003147,0.667089,0.329764,NEGATIVE,0.977939
1,ultra red,1636446199296270336,33,2023-03-16 19:15:52+00:00,Twitter for iPhone,60457,4458,0.088210,0.700722,0.211068,POSITIVE,0.895116
2,cybertruck crash test,1642162058258001920,45,2023-04-01 13:48:39+00:00,Twitter for iPhone,56903,5919,0.197426,0.739896,0.062678,NEGATIVE,0.996901
3,we produced our 4 millionth vehicle at giga te...,1631075083018989568,81,2023-03-01 23:32:58+00:00,Twitter for iPhone,50522,4814,0.001966,0.342386,0.655649,POSITIVE,0.982531
4,megapack factory,1621627317301776387,40,2023-02-03 21:50:55+00:00,Twitter Web App,45801,5144,0.159332,0.727013,0.113656,NEGATIVE,0.991843
...,...,...,...,...,...,...,...,...,...,...,...,...
94,tesla powerwall enables your home to operate ...,1638419628220362752,113,2023-03-22 05:57:34+00:00,Twitter for iPhone,0,9663,0.024268,0.444094,0.531637,POSITIVE,0.773378
95,rt _europe: 🛝 at giga berlin 😎,1637040338543017991,60,2023-03-18 10:36:46+00:00,Twitter for iPhone,0,2388,0.007926,0.724404,0.267670,POSITIVE,0.910959
96,rt _europe: regenerative braking turns our mot...,1635377476087197701,140,2023-03-13 20:29:08+00:00,Twitter for iPhone,0,2038,0.014097,0.695834,0.290068,POSITIVE,0.893449
97,rt _europe: 4k model y built at giga berlin th...,1630061580929966080,114,2023-02-27 04:25:40+00:00,Twitter for iPhone,0,1566,0.000773,0.022188,0.977039,POSITIVE,0.540109


In [15]:
fdf.loc[:, 'trans_ccl'].value_counts()

POSITIVE    58
NEGATIVE    41
Name: trans_ccl, dtype: int64

---
## Conclusion
> ### The problem 
The transformer provided by Hugging Face is powerful enough to give rightful scores and give conclusions about the tweets, but it still doesn't understand 2nd degree phrases and sometimes it may cause a problem for twitter data mining. However, in other sources like Bloomberg, yfinance or Quandl/Nasdaq, since they are source providers, there is less chance to get 2nd degree expressions, and that's why the coef of Twitter will be significantly lower than our other Sentiment Analysis bots: SAFQTF, SAFBTF...
Another point is that The final transformer can't give other outputs than Positive or Negative: there is no neutral form, which can also drive to confusion. 
    
> ### Further steps
There is eventually certain ways to engineer a neutral form of sentiment: it is indeed possible to put a threshold (e.g. neg < 0.15 or pos < 0.15) and consider all values in that interval neutral. 
On the other hand, fundamental sources (Reuters, BBLP, YF) provide data and explanation of phenomena that will give more based opinions about market movements compared to twitter, which is more like an alert or group power revandciation. Next step will eventually be to work with Reuters & YF APIs.