In [80]:
# Initial Imports

import datetime as dt
import json
import os

import numpy as np
import pandas as pd
import regex as re
import requests
# Import flair Sentence to process input text
from flair.data import Sentence
# Import flair pre-trained sentiment model
from flair.models import TextClassifier
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

classifier = TextClassifier.load('en-sentiment')


pd.set_option('display.max_colwidth', 1000)

2022-01-11 23:04:04,884 loading file C:\Users\sjufa\.flair\models\sentiment-en-mix-distillbert_4.pt


In [74]:


# Making the API call to retrieve tweets

url = "https://twitter32.p.rapidapi.com/getSearch"

# These parameters need to be set before running the API call... could in the
# future receive inputs from the user.
## TODO -- if input is received, datetime will need to be formatted correctly for API call

hashtag = '$TSLA'
start_date = '2022-01-01'
end_date = '2022-01-09'

querystring = {"hashtag": hashtag, "start_date": start_date, "end_date": end_date, "lang": "en"}

# SECURITY: the RapidAPI key must never be stored in the notebook. It is read from
# the RAPIDAPI_KEY environment variable; any key that was previously committed
# should be revoked and rotated.
rapidapi_key = os.environ.get("RAPIDAPI_KEY")
if not rapidapi_key:
    raise RuntimeError("Set the RAPIDAPI_KEY environment variable before running this cell.")

headers = {
    'x-rapidapi-host': "twitter32.p.rapidapi.com",
    'x-rapidapi-key': rapidapi_key,
}

# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# surfaces HTTP errors (bad key, exhausted quota, ...) here instead of as a
# confusing KeyError when the payload is indexed below.
http_response = requests.get(url, headers=headers, params=querystring, timeout=30)
http_response.raise_for_status()
response = http_response.json()

# The tweets live under response['data']['tweets']; the frame built from them is
# transposed (presumably the payload is keyed per tweet, so that each row becomes
# one tweet -- confirm against the API response).
df = pd.DataFrame(response['data']['tweets']).T

In [75]:
# Keep only the tweet timestamp and text, then index the frame by the parsed
# creation time (index named 'Date') so the tweets can be resampled by day later.
df_filtered = df[["created_at", "full_text"]].copy()
df_filtered.index = pd.to_datetime(df_filtered["created_at"])
df_filtered = df_filtered.drop(columns="created_at").rename_axis("Date")

In [76]:
# We need to clean up our tweets so they can be analyzed in string format

# Pieces of a tweet that are blanked out before scoring: @mentions, any character
# that is not an ASCII letter, digit, space or tab, and URLs (scheme://...).
# Written as a raw string so the regex escapes are not parsed as (invalid) Python
# string escapes, and compiled once instead of on every call.
_TWEET_NOISE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")


# Create a function to clean the tweets
def clean_tweet(tweet_text):
    """Return `tweet_text` with mentions, URLs and punctuation removed.

    Every match of the noise pattern is replaced by a single space, after which
    the text is split on whitespace and re-joined with single spaces, so the
    result has no leading, trailing or repeated whitespace.

    Parameters
    ----------
    tweet_text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing survives the cleaning.
    """
    return ' '.join(_TWEET_NOISE.sub(" ", tweet_text).split())


# Run every tweet's text through clean_tweet, overwriting the 'full_text' column
df_filtered["full_text"] = df_filtered["full_text"].map(clean_tweet)

In [77]:
# Define a function to get Flair sentiment prediction score
def score_flair(text):
    """Classify `text` with the module-level flair `classifier`.

    Wraps the text in a flair `Sentence`, runs the classifier on it, and reads
    the first label attached to the sentence.

    Returns
    -------
    tuple
        `(score, value)` of that first label.
    """
    sentence = Sentence(text)
    classifier.predict(sentence)
    top_label = sentence.labels[0]
    return top_label.score, top_label.value

# Score each tweet exactly once. Calling score_flair separately for the score and
# for the label would run the (expensive) flair model twice per tweet.
flair_results = [score_flair(text) for text in df_filtered['full_text']]

# Sentiment confidence score for each tweet
df_filtered['scores_flair'] = [score for score, _ in flair_results]
# Predicted sentiment label for each tweet
df_filtered['pred_flair'] = [label for _, label in flair_results]

# To check the distribution of the scores, evaluate
# df_filtered['scores_flair'].describe() as the last expression of its own cell;
# in the middle of a cell its result is discarded.

# Change the label of flair prediction to 0 if negative and 1 if positive
mapping = {'NEGATIVE': 0, 'POSITIVE': 1}
df_filtered['pred_flair_numerical'] = df_filtered['pred_flair'].map(mapping)

In [78]:
# Minimum flair confidence a tweet needs in order to be counted in the daily signal
FLAIR_MIN_CONFIDENCE = 0.75

# One-hot encode the predicted label. The columns are built from explicit
# comparisons rather than pd.get_dummies so that both always exist, even when
# every tweet in the window received the same label.
df_filtered['flair_negative'] = (df_filtered['pred_flair'] == 'NEGATIVE').astype(int)
df_filtered['flair_positive'] = (df_filtered['pred_flair'] == 'POSITIVE').astype(int)

# Discard rows whose scores_flair confidence is below the threshold. A boolean
# mask is used instead of dropping by index label: the index is the tweet
# timestamp, which is not guaranteed to be unique, so a label-based drop would
# also remove confident tweets that share a timestamp with a low-confidence one.
# The copy keeps later column assignments from hitting a view of the old frame.
low_confidence = df_filtered['scores_flair'] < FLAIR_MIN_CONFIDENCE
df_filtered = df_filtered.loc[~low_confidence].copy()

# Creating the 'df_signals' dataframe that we will append to our primary dataset:
# the per-day number of negative and positive tweets. The two indicator columns
# are selected explicitly so the text columns are never fed to sum().
df_signals = df_filtered[['flair_negative', 'flair_positive']].resample('1D').sum()

Unnamed: 0_level_0,flair_negative,flair_positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-01-03 00:00:00+00:00,2,1
2022-01-04 00:00:00+00:00,2,1
2022-01-05 00:00:00+00:00,1,0
2022-01-06 00:00:00+00:00,0,2
2022-01-07 00:00:00+00:00,3,1
2022-01-08 00:00:00+00:00,3,4


### After pulling tweet data, running sentiment analysis with flair, and then filtering and resampling the dataframe, we get a dataframe indexed by date with two columns -- flair_negative, the number of negative tweets for the day, and flair_positive, the number of positive tweets for the day. At some point we could think about factoring in the number of retweets (or something along those lines) to weigh the tweets by their impact, but the results from the API call are already filtered by this metric, so that could be superfluous.

In [79]:
# Display the resampled daily flair_negative / flair_positive sums
df_signals

Unnamed: 0_level_0,flair_negative,flair_positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-01-03 00:00:00+00:00,2,1
2022-01-04 00:00:00+00:00,2,1
2022-01-05 00:00:00+00:00,1,0
2022-01-06 00:00:00+00:00,0,2
2022-01-07 00:00:00+00:00,3,1
2022-01-08 00:00:00+00:00,3,4
