In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import pandas as pd

### Apply pre-trained model

In [127]:
# download pytorch model
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [10]:
df = pd.read_csv('../sources/data.csv')

In [5]:
df

Unnamed: 0,Datetime,Symbol,Text,Retweeted,price,30_days_price,90_days_price,180_days_price,sentiment,30_days_return,90_days_return,180_days_return
0,2023-01-25 03:05:45+00:00,MMM,To me the worst quarter of the season so far i...,,111.454300,107.800003,,,1,-0.032787,,
1,2021-11-30 14:22:02+00:00,MMM,"$MRNA, $SQ, $F, $MMM &amp; more… all covered i...",,160.991837,168.187424,142.100510,144.154526,1,0.044695,-0.117343,-0.104585
2,2020-09-24 08:36:51+00:00,MMM,If you look at the https://t.co/NS3syOn64o you...,,145.948441,151.170639,159.663681,174.247040,5,0.035781,0.093973,0.193894
3,2020-08-27 08:46:42+00:00,MMM,thank you to Mike Roman and the great folks at...,,149.278290,147.076584,162.535919,162.996353,5,-0.014749,0.088812,0.091896
4,2020-01-28 17:44:08+00:00,MMM,the stock needs to see that return.. as this i...,,146.491455,134.052170,137.167786,147.174454,1,-0.084915,-0.063647,0.004662
...,...,...,...,...,...,...,...,...,...,...,...,...
12228,2015-05-05 09:52:02+00:00,ZTS,"When you have $DIS, $CSCO and $EOG, $ZTS you a...",,43.352589,46.408531,46.009922,42.058086,1,0.070490,0.061296,-0.029860
12229,2015-05-05 07:49:28+00:00,ZTS,"Going into overdrive for $CSCO, $DIS, $ZTS, an...",,43.352577,46.408527,46.009914,42.058098,4,0.070491,0.061296,-0.029859
12230,2014-06-09 19:57:42+00:00,ZTS,Is $ATHN a $1000 stock? Is $ZTS the enemy of $...,,30.144522,30.568308,34.238846,41.699692,1,0.014058,0.135823,0.383326
12231,2013-12-03 01:45:23+00:00,ZTS,"Must Buy, $ZTS RT @codybarbo: ""Americans spend...",,29.139360,30.329683,29.112482,29.118053,5,0.040849,-0.000922,-0.000731


In [142]:
# get sentiment of each tweet
def get_sentiment(row):
    
    tokens = tokenizer.encode(row['Text'], return_tensors='pt')
    result = model(tokens)

    return int(torch.argmax(result.logits))+1

In [None]:
df['sentiment'] = df.apply(lambda row: get_sentiment(row), axis=1)

### Compute the correlation

In [151]:
# get the 30 days, 90 days, 180 days return
df['30_days_return'] =	(df['30_days_price'] - df['price']) / df['price']
df['90_days_return'] =	(df['90_days_price'] - df['price']) / df['price']
df['180_days_return'] =	(df['180_days_price'] - df['price']) / df['price']

In [3]:
# drop null data
df_dropna_30 = df.dropna(subset=['30_days_return'])
df_dropna_90 = df.dropna(subset=['90_days_return'])
df_dropna_180 = df.dropna(subset=['180_days_return'])

# compute correlation
corr1 = np.corrcoef(df_dropna_30['30_days_return'], df_dropna_30['sentiment'])
corr2 = np.corrcoef(df_dropna_90['90_days_return'], df_dropna_90['sentiment'])
corr3 = np.corrcoef(df_dropna_180['180_days_return'], df_dropna_180['sentiment'])

print(corr1)
print(corr2)
print(corr3)

[[1.         0.00231467]
 [0.00231467 1.        ]]
[[1.         0.00625331]
 [0.00625331 1.        ]]
[[ 1.         -0.00338364]
 [-0.00338364  1.        ]]


In [31]:
# calculate accuracy in terms of 30 days performance

from scipy.stats import t

# Input variables
x_bar = 0.0993	     # 20-year average annual return
s = 0.1467         # Standard deviation
n = 20           # Sample size
confidence_level = 0.95

# Compute t-value for given confidence level and degrees of freedom
d = n - 1
t_critical = abs(t.ppf((1 - confidence_level) / 2, d))

# Compute range
lower = x_bar - (t_critical * (s / np.sqrt(n)))
upper = x_bar + (t_critical * (s / np.sqrt(n)))
print([lower, upper])
true_negative = len(df[(df['sentiment'] < 3) & (df['30_days_return'] < lower/12)])
true_positive = len(df[(df['sentiment'] > 3) & (df['30_days_return'] > upper/12)])
true_neutral = len(df[(df['sentiment'] == 3) & (df['30_days_return'] <= upper/12) & (df['30_days_return'] >= lower/12)])
print((true_negative + true_positive + true_neutral) / len(df))

[0.030642286578199285, 0.16795771342180071]
0.4136352489168642


In [21]:
# calculate accuracy in terms of 90 days performance
true_negative = len(df[(df['sentiment'] < 3) & (df['90_days_return'] < lower/4)])
true_positive = len(df[(df['sentiment'] > 3) & (df['90_days_return'] > upper/4)])
true_neutral = len(df[(df['sentiment'] == 3) & (df['90_days_return'] <= upper/4) & (df['90_days_return'] >= lower/4)])
print((true_negative + true_positive + true_neutral) / len(df))

0.3898471347993133


In [22]:
# calculate accuracy in terms of 180 days performance
true_negative = len(df[(df['sentiment'] < 3) & (df['180_days_return'] < lower/2)])
true_positive = len(df[(df['sentiment'] > 3) & (df['180_days_return'] > upper/2)])
true_neutral = len(df[(df['sentiment'] == 3) & (df['180_days_return'] <= upper/2) & (df['180_days_return'] >= lower/2)])
print((true_negative + true_positive + true_neutral) / len(df))

0.36548679800539524


In [23]:
# get relevant sectors
df2 = pd.read_csv('../sources/s&p500.csv')
df2['Sector'].value_counts()

Health Care Equipment            18
Semiconductors                   15
Industrial Machinery             14
Multi-Utilities                  13
Application Software             13
                                 ..
Consumer Electronics              1
Health Care Technology            1
Leisure Products                  1
Agricultural & Farm Machinery     1
Household Appliances              1
Name: Sector, Length: 123, dtype: int64

In [24]:
# merge the original dataset and the sectors
new_df = pd.merge(df, df2, on=['Symbol','Symbol'], how='inner')

In [25]:
new_df

Unnamed: 0,Datetime,Symbol,Text,Retweeted,price,30_days_price,90_days_price,180_days_price,sentiment,30_days_return,90_days_return,180_days_return,Name,Sector
0,2023-01-25 03:05:45+00:00,MMM,To me the worst quarter of the season so far i...,,111.454300,107.800003,,,1,-0.032787,,,3M,Industrial Conglomerates
1,2021-11-30 14:22:02+00:00,MMM,"$MRNA, $SQ, $F, $MMM &amp; more… all covered i...",,160.991837,168.187424,142.100510,144.154526,1,0.044695,-0.117343,-0.104585,3M,Industrial Conglomerates
2,2020-09-24 08:36:51+00:00,MMM,If you look at the https://t.co/NS3syOn64o you...,,145.948441,151.170639,159.663681,174.247040,5,0.035781,0.093973,0.193894,3M,Industrial Conglomerates
3,2020-08-27 08:46:42+00:00,MMM,thank you to Mike Roman and the great folks at...,,149.278290,147.076584,162.535919,162.996353,5,-0.014749,0.088812,0.091896,3M,Industrial Conglomerates
4,2020-01-28 17:44:08+00:00,MMM,the stock needs to see that return.. as this i...,,146.491455,134.052170,137.167786,147.174454,1,-0.084915,-0.063647,0.004662,3M,Industrial Conglomerates
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12228,2015-05-05 09:52:02+00:00,ZTS,"When you have $DIS, $CSCO and $EOG, $ZTS you a...",,43.352589,46.408531,46.009922,42.058086,1,0.070490,0.061296,-0.029860,Zoetis,Pharmaceuticals
12229,2015-05-05 07:49:28+00:00,ZTS,"Going into overdrive for $CSCO, $DIS, $ZTS, an...",,43.352577,46.408527,46.009914,42.058098,4,0.070491,0.061296,-0.029859,Zoetis,Pharmaceuticals
12230,2014-06-09 19:57:42+00:00,ZTS,Is $ATHN a $1000 stock? Is $ZTS the enemy of $...,,30.144522,30.568308,34.238846,41.699692,1,0.014058,0.135823,0.383326,Zoetis,Pharmaceuticals
12231,2013-12-03 01:45:23+00:00,ZTS,"Must Buy, $ZTS RT @codybarbo: ""Americans spend...",,29.139360,30.329683,29.112482,29.118053,5,0.040849,-0.000922,-0.000731,Zoetis,Pharmaceuticals


In [26]:
# evaluate the accuracy sector by sector
def evaluate(month, ret, sector):
    df_sector = new_df[new_df['Sector'] == sector]
    true_negative = len(df_sector[(df_sector['sentiment'] < 3) & (df_sector[ret] < lower/month)])
    true_positive = len(df_sector[(df_sector['sentiment'] > 3) & (df_sector[ret] > upper/month)])
    true_neutral = len(df_sector[(df_sector['sentiment'] == 3) & (df_sector[ret] <= upper/month) & (df_sector[ret] >= lower/month)])
    return (true_negative + true_positive + true_neutral) / len(df_sector)

In [27]:
new_df['Sector'].value_counts().index.tolist()

['Technology Hardware, Storage & Peripherals',
 'Semiconductors',
 'Restaurants',
 'Diversified Banks',
 'Movies & Entertainment',
 'Automobile Manufacturers',
 'Application Software',
 'Internet & Direct Marketing Retail',
 'Interactive Media & Services',
 'Pharmaceuticals',
 'Oil & Gas Exploration & Production',
 'Investment Banking & Brokerage',
 'Industrial Conglomerates',
 'Hypermarkets & Super Centers',
 'Communications Equipment',
 'Biotechnology',
 'Soft Drinks',
 'Systems Software',
 'Integrated Telecommunication Services',
 'Home Improvement Retail',
 'IT Consulting & Other Services',
 'Aerospace & Defense',
 'Data Processing & Outsourced Services',
 'Apparel, Accessories & Luxury Goods',
 'Packaged Foods & Meats',
 'Airlines',
 'Health Care Equipment',
 'Construction Machinery & Heavy Trucks',
 'General Merchandise Stores',
 'Specialty Chemicals',
 'Integrated Oil & Gas',
 'Oil & Gas Equipment & Services',
 'Specialty Stores',
 'Industrial Machinery',
 'Personal Products',
 

In [28]:
for sec in new_df['Sector'].value_counts().index.tolist():
    print(sec, evaluate(12, '30_days_return', sec))

Technology Hardware, Storage & Peripherals 0.4286632390745501
Semiconductors 0.39325842696629215
Restaurants 0.44110854503464203
Diversified Banks 0.38491048593350385
Movies & Entertainment 0.44468085106382976
Automobile Manufacturers 0.45751633986928103
Application Software 0.42923433874709976
Internet & Direct Marketing Retail 0.4127906976744186
Interactive Media & Services 0.24269005847953215
Pharmaceuticals 0.4045307443365696
Oil & Gas Exploration & Production 0.43359375
Investment Banking & Brokerage 0.42063492063492064
Industrial Conglomerates 0.45454545454545453
Hypermarkets & Super Centers 0.42792792792792794
Communications Equipment 0.47572815533980584
Biotechnology 0.4682926829268293
Soft Drinks 0.4411764705882353
Systems Software 0.35294117647058826
Integrated Telecommunication Services 0.37948717948717947
Home Improvement Retail 0.43859649122807015
IT Consulting & Other Services 0.3803680981595092
Aerospace & Defense 0.391025641025641
Data Processing & Outsourced Services 0

In [29]:
df.to_csv('../sources/data.csv', index=False)

In [30]:
df = pd.read_csv('../sources/data.csv')
df

Unnamed: 0,Datetime,Symbol,Text,Retweeted,price,30_days_price,90_days_price,180_days_price,sentiment,30_days_return,90_days_return,180_days_return
0,2023-01-25 03:05:45+00:00,MMM,To me the worst quarter of the season so far i...,,111.454300,107.800003,,,1,-0.032787,,
1,2021-11-30 14:22:02+00:00,MMM,"$MRNA, $SQ, $F, $MMM &amp; more… all covered i...",,160.991837,168.187424,142.100510,144.154526,1,0.044695,-0.117343,-0.104585
2,2020-09-24 08:36:51+00:00,MMM,If you look at the https://t.co/NS3syOn64o you...,,145.948441,151.170639,159.663681,174.247040,5,0.035781,0.093973,0.193894
3,2020-08-27 08:46:42+00:00,MMM,thank you to Mike Roman and the great folks at...,,149.278290,147.076584,162.535919,162.996353,5,-0.014749,0.088812,0.091896
4,2020-01-28 17:44:08+00:00,MMM,the stock needs to see that return.. as this i...,,146.491455,134.052170,137.167786,147.174454,1,-0.084915,-0.063647,0.004662
...,...,...,...,...,...,...,...,...,...,...,...,...
12228,2015-05-05 09:52:02+00:00,ZTS,"When you have $DIS, $CSCO and $EOG, $ZTS you a...",,43.352589,46.408531,46.009922,42.058086,1,0.070490,0.061296,-0.029860
12229,2015-05-05 07:49:28+00:00,ZTS,"Going into overdrive for $CSCO, $DIS, $ZTS, an...",,43.352577,46.408527,46.009914,42.058098,4,0.070491,0.061296,-0.029859
12230,2014-06-09 19:57:42+00:00,ZTS,Is $ATHN a $1000 stock? Is $ZTS the enemy of $...,,30.144522,30.568308,34.238846,41.699692,1,0.014058,0.135823,0.383326
12231,2013-12-03 01:45:23+00:00,ZTS,"Must Buy, $ZTS RT @codybarbo: ""Americans spend...",,29.139360,30.329683,29.112482,29.118053,5,0.040849,-0.000922,-0.000731
