# Import Statements
---

In [1]:

import pandas as pd                 # Pandas dataframe library
import pandas_datareader as pdr     # Pandas datareader that allows me to lookup & store live crypto prices from yahoo finance.
import numpy as np                  # Numpy
import matplotlib.pyplot as pypl    # Pyplot used to create visuals/graphics based on data 
from alpha_vantage.timeseries import TimeSeries     # Library used for pulling live price data from alphavantage api

from datetime import datetime, timedelta, timezone             # Datetime library.
import warnings
warnings.simplefilter(action='ignore', category=ResourceWarning)    # Suppresses warnings to limit size of output cells.
warnings.filterwarnings('ignore')

import glob                         # For changing/finding proper directory
import os                           # For changing/finding proper directory (when opening files)
import requests                     # For sending HTTP requests in order to hit necessary API endpoints.
import twint                        # Twitter web scraping tool with more features than the regular twitter API
import nest_asyncio                 # Import required for twint usage, allows for the use of asynchronous computing
nest_asyncio.apply()                

import re                           # Regex for string cleaning (used for Textblob Sentiment Analysis)
from textblob import TextBlob       # Textblob used for sentiment analysis of cleaned data.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer    # Sentiment analysis tool that works great on determining social media sentiment.

from sklearn.metrics import accuracy_score, classification_report       
from sklearn.model_selection import train_test_split                    # Used for splitting data
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis    # Used for implementing LDA

os.chdir(r'C:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\archive')    # Directory where the stopwords.txt file is located. 
stopwords_file = open("stopwords.txt", "r+")                                    # This file is used for sifting the tweets fetched before feeding them into Textblob for Sentiment Analysis
stopwords = list(stopwords_file.read().split('\n'))

## Reading in crypto price dataset
---
Section below reads csv files into pandas dataframes for interacting with. Also compiles list of coin names for twitter searching.

In [3]:
path = r'c:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\prices\HourlyPrices'
extension = 'csv'
os.chdir(path)
hourly_csv_files = glob.glob('*.{}'.format(extension))

# Compile list of all coin names for searching on twitter later
hourly_coins = []

for coin in hourly_csv_files:
    vals = coin.split("_")
    coin_name = vals[0]
    hourly_coins.append(coin_name)

# compile list of pandas dataframes for use later.
hourly_coin_data = []

for file in hourly_csv_files:
    df = pd.read_csv(file)
    hourly_coin_data.append(df)


### Just to give you an idea of what this looks like...

In [4]:
hourly_coin_data[4].head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume ETH,Volume USD
0,1652313600,2022-05-12 00:00:00,ETH/USD,2082.43,2140.29,2069.94,2132.3,289.403694,617095.5
1,1652310000,2022-05-11 23:00:00,ETH/USD,2075.36,2096.75,2044.29,2075.77,386.826725,802963.3
2,1652306400,2022-05-11 22:00:00,ETH/USD,2088.41,2113.99,2037.41,2075.96,1234.586719,2562953.0
3,1652302800,2022-05-11 21:00:00,ETH/USD,2039.12,2099.83,1997.73,2088.98,4759.196164,9941866.0
4,1652299200,2022-05-11 20:00:00,ETH/USD,2113.18,2147.74,2001.0,2034.05,3188.877422,6486336.0


## Scrape Twitter for data on all coins supplied by dataset
---
Below section of code searches through Twint tweet database for any tweets associated 
with the each of the provided cryptocurrency acronyms. 

In [9]:
# Function for iterating through coins list and storing findings in .csv files
def search_coins(coins):
    
    for coin in coins:
        path = r'c:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\search_results'
        os.chdir(path)
        os.chdir(coin)
        
        print('performing twitter search for coin:', coin)
        
        from_date = '2022-05-08'
        to_date = '2022-05-10'
        print(f'searching {from_date} to {to_date}')
        
        c = twint.Config()
        c.Limit = 100
        c.Lang = "en"
        c.Pandas = True
        c.Search = coin
        c.Hide_output = True
        c.Since = from_date
        c.Until = to_date
        c.Store_csv = True
        c.Output = coin + '_' + from_date + '_' + to_date + '_search_result.csv'
        twint.run.Search(c)
search_coins(hourly_coins)

performing twitter search for coin: AAVE
searching 2022-05-08 to 2022-05-10
performing twitter search for coin: AVAX
searching 2022-05-08 to 2022-05-10
performing twitter search for coin: BCH
searching 2022-05-08 to 2022-05-10
performing twitter search for coin: BTC
searching 2022-05-08 to 2022-05-10
performing twitter search for coin: ETH
searching 2022-05-08 to 2022-05-10


### Below chunk is where the data pre-processing begins.
---
**Purpose:**
* I need to modify the dataframe so that it contains both the price information, as well as all of the tweets so I can easily perform sentiment analysis on them using VADER Sentiment Analysis & Textblob.

The two functions below are used for the following: 
* The *sift_tweet* function the tweets for textblob, as that tool will be providing us with the **subjectivity** and **polarity** of the tweets we've scraped, but requires there to be no unnecessary characters (emojis, hashtags, links, etc.). 
* The *get_sentiment* function will run the base tweet through the Vader Sentiment Intensity Analyzer to derive the **compound, positive, negative**, and **neutral** values for the text provided. 
* *NOTE:* VADER is deisgned to be able to accept and analyze text taken from an online space, meaning it knows how to interpret emojis, hashtags and slang to an extent. As a result, the tweets fed in here are **not** sifted like the above tweets.

In [2]:
# Need to create function for cleaning the tweets so we can derive the subjectivity and polarity using textblob.
def sift_tweet(tweet, stop_words):
    cleaned_tweet = tweet
    cleaned_tweet = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",tweet) # regex to remove all @userame, emojis, and links from tweets.
    for word in cleaned_tweet:
        if word in stop_words: cleaned_tweet.replace(word, '')
    return cleaned_tweet

# Function for allowing me to generate the sentiment intensity of the text passed in.
def get_sentiment(text):
    sia = SentimentIntensityAnalyzer()
    sentiment = sia.polarity_scores(text)
    return sentiment

In the cell below there are a few important things that happen:

* I iterate through the list of coins we have hourly price data for and comprise dataframes for each coin with its respective tweets from the search results.
* From there the I go coin by coin, creating the desired shape of the dataframe, then filling in all appropriate values. These include:
  * breaking the tweets up by hour and sorting them with their appropriate time window
  * cleaning the tweets using the sift tweet function
  * Running the sentiment analysis on the tweets and storing those values in their corresponding cells.


In [94]:
print(hourly_coin_data[4]['date'])

0      2022-05-12 00:00:00
1      2022-05-11 23:00:00
2      2022-05-11 22:00:00
3      2022-05-11 21:00:00
4      2022-05-11 20:00:00
              ...         
980    2022-04-01 04:00:00
981    2022-04-01 03:00:00
982    2022-04-01 02:00:00
983    2022-04-01 01:00:00
984    2022-04-01 00:00:00
Name: date, Length: 985, dtype: object


In [95]:
os.chdir(r'C:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\search_results')
tweet_dfs = []
grouped_tweets = []

# Read Tweets into a DF from the CSVs
for coin in hourly_coins:
    
    os.chdir(r'C:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\search_results')
    os.chdir(coin)
    csv_names = glob.glob('*.{}'.format(extension))
    coin_pds = []
    for file in csv_names:
        tweet_pd = pd.read_csv(file)
        tweet_pd.sort_values(by='created_at')
        coin_pds.append(tweet_pd)
    tweet_dfs.append(coin_pds)


# This is just so i can the data i need to train a model for aave and avax. I'll do all 5 when i want to showcase something but for now i only need these 2.
for i in range(len(tweet_dfs)):
    print('lookin at coin number:', i)
    hourly_coin_data[i]['date'] = pd.to_datetime(hourly_coin_data[i]['date'])
    hourly_coin_data[i]['joined_tweets'] = ""
    hourly_coin_data[i]['compound'] = 0.0
    hourly_coin_data[i]['positive'] = 0.0
    hourly_coin_data[i]['negative'] = 0.0
    hourly_coin_data[i]['neutral'] = 0.0

    #print(hourly_coin_data[i])
    for j in range(len(tweet_dfs[i])):
        tweet_dfs[i][j]['created_at'] = tweet_dfs[i][j]['created_at'].str.replace(" Pacific Daylight Time","").str.strip()
        tweet_dfs[i][j]['created_at'] = pd.to_datetime(tweet_dfs[i][j]['created_at'])

        for day in range(1,31):
            #print('checking day:', day)
            for hour in range(24):
                tweet_time_mask = (tweet_dfs[i][j]['created_at'].dt.hour >= hour) & (tweet_dfs[i][j]['created_at'].dt.hour < hour + 1) & \
                            (tweet_dfs[i][j]['created_at'].dt.day >= day ) & (tweet_dfs[i][j]['created_at'].dt.day < day + 1)
                price_time_mask = (hourly_coin_data[i]['date'].dt.hour >= hour) & (hourly_coin_data[i]['date'].dt.hour < hour + 1) & \
                            (hourly_coin_data[i]['date'].dt.day >= day ) & (hourly_coin_data[i]['date'].dt.day < day + 1)

                hour_view = tweet_dfs[i][j][tweet_time_mask]
                if hour_view.empty:
                    continue
                
                hour_view['cleaned_tweet'] = hour_view['tweet'].apply(lambda x: sift_tweet(str(x).lower(), stopwords))

                joined_tweets = ' '.join(hour_view['tweet'])
                joined_clean_tweets = ' '.join(hour_view['cleaned_tweet'])

                SIA = get_sentiment(joined_tweets)
                compound = SIA['compound']                    # Score representing sum(lexicon ratings)
                pos = SIA['pos']
                neg = SIA['neg']
                neu = SIA['neu']

                index = hourly_coin_data[i][price_time_mask].index
                for ind in index:
                    hourly_coin_data[i].at[ind,'joined_tweets'] = joined_tweets
                    hourly_coin_data[i].at[ind,'polarity'] = TextBlob(joined_clean_tweets).sentiment[0]                # Analyze and store Polarity value in Coin Dataframe using Textblob
                    hourly_coin_data[i].at[ind,'subjectivity'] = TextBlob(joined_clean_tweets).sentiment[1]            # Analyze and store Subjectivity value in Coin Dataframe using Textblob
                    hourly_coin_data[i].at[ind,'compound'] = compound
                    hourly_coin_data[i].at[ind,'positive'] = pos
                    hourly_coin_data[i].at[ind,'negative'] = neg
                    hourly_coin_data[i].at[ind,'neutral'] = neu


lookin at coin number: 0
lookin at coin number: 1
lookin at coin number: 2
lookin at coin number: 3


KeyboardInterrupt: 

Dropping the rows that don't contain a polarity score. The only reason they wouldn't have this would be because they didn't have any tweets stored in their row for that hour.

In [96]:
for i in range(len(hourly_coin_data)):
    hourly_coin_data[i] = hourly_coin_data[i][hourly_coin_data[i]['polarity'].notna()]

KeyError: 'polarity'

Now, I have to iterate through remaining rows and append a price change label. This label signifies whether or not in that hour the price of the coin went up or down. This is what the model is going to be responsible for predicting.

In [97]:
for i in range(len(hourly_coin_data)):
    hourly_coin_data[i].reset_index()
    hourly_coin_data[i]['price_change'] = np.nan
    for index, row in hourly_coin_data[i].iterrows():
        if row.open > row.close:
            hourly_coin_data[i].at[index, 'price_change'] = 0
        else:
            hourly_coin_data[i].at[index, 'price_change'] = 1

In [98]:
hourly_coin_data[0].head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume AAVE,Volume USD,joined_tweets,compound,positive,negative,neutral,polarity,subjectivity,price_change
32,1652198400,2022-05-10 16:00:00,AAVE/USD,110.46,110.52,108.8,109.42,189.899713,20778.826552,@CUPIDJJK7 she uses aave &amp; she isn’t black...,-0.9995,0.074,0.099,0.827,-0.022206,0.438997,0.0
33,1652194800,2022-05-10 15:00:00,AAVE/USD,109.72,110.44,107.95,110.44,169.152478,18681.199703,Everyone unfollow @parkjiminswhore she misuses...,-0.9927,0.075,0.087,0.838,0.035348,0.456644,1.0
34,1652191200,2022-05-10 14:00:00,AAVE/USD,113.36,113.36,110.25,110.25,123.361198,13600.572061,oh well aave stands for african american verna...,0.9949,0.084,0.064,0.853,0.019751,0.444341,0.0
35,1652187600,2022-05-10 13:00:00,AAVE/USD,117.0,117.0,112.54,112.8,61.590583,6947.417713,@SouISon 😩they don’t gotta try so hard. aave b...,0.9998,0.09,0.05,0.86,0.06037,0.452902,0.0
36,1652184000,2022-05-10 12:00:00,AAVE/USD,114.04,116.67,114.04,115.19,341.841648,39376.739394,AAVE / USDT - #aaveusdt #aave AAVE şu anda t...,0.9984,0.111,0.046,0.843,0.081142,0.42304,1.0


In [42]:
hourly_coin_data[2].shape

(601, 17)

## Prediction Implementation: LDA With Sentiment Analysis
---
Why I'm using LDA:
* I figured instead of using an LSTMRNN (which had very poor performance) I could try twisting the problem and using a classification model instead.
* The goal now is to form the data into a format which can allow the model to make a prediction based on a label describing whether or not it believes the price will increase/decrease over the next hour.
* Uses Naive Bayes to determine what it should be classified as (increasing/decreasing), then we can display that probability in our front-end.

# IMPORTANT:
---
* If you're running each cell in the jupyter notebook, you only need to run the below code cell. 

* If you're going to try to use the exported model_df_#.csv files that are saved in the hourly_coin_data directory, you need to run the 2nd cell below.

In [99]:
# These are all the columns we actually want to keep for the purposes of training & using the model.
model_cols = ['open', 'high', 'low', 'Volume USD', 'compound', 'positive', 'negative', 'neutral', 'polarity', 'subjectivity', 'price_change']
os.chdir(r'C:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\hourly_coin_data')

#for i in range(1):
for i in range(3):
    model_df = hourly_coin_data[i][model_cols]
    model_df.to_csv(f'model_df_{i}.csv')

    # Feature Dataset
    x = model_df
    # Target Dataset
    y = np.array(model_df['price_change'])
    x.drop(['price_change'], axis=1, inplace=True)
    np.asarray(x)
    
    # split into test & train
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

    # Create LDA model
    model = LinearDiscriminantAnalysis().fit(x_train, y_train)
    predictions = model.predict(x_test)
    print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.88      0.39      0.54        18
         1.0       0.77      0.97      0.86        37

    accuracy                           0.78        55
   macro avg       0.82      0.68      0.70        55
weighted avg       0.80      0.78      0.75        55

              precision    recall  f1-score   support

         0.0       0.83      0.91      0.87        11
         1.0       0.86      0.75      0.80         8

    accuracy                           0.84        19
   macro avg       0.85      0.83      0.83        19
weighted avg       0.84      0.84      0.84        19

              precision    recall  f1-score   support

         0.0       0.89      0.81      0.85        31
         1.0       0.70      0.82      0.76        17

    accuracy                           0.81        48
   macro avg       0.80      0.81      0.80        48
weighted avg       0.82      0.81      0.82        48



*NOTE:* As stated above the below codeblock is mainly for testing purposes. It allows me to read in the previously pre-processed and formatted data for ease of use and reduces the wait time required for sentiment analysis TREMENDOUSLY.

In [3]:
# These are all the columns we actually want to keep for the purposes of training & using the model.
model_cols = ['open', 'high', 'low', 'Volume USD', 'compound', 'positive', 'negative', 'neutral', 'polarity', 'subjectivity', 'price_change']
os.chdir(r'C:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\hourly_coin_data')

model_df = pd.read_csv('model_df_2.csv')
model_df = model_df.iloc[: , 1:]                # Drops first column in the dataframe as we don't want/need it.

# Feature Dataset
x = model_df
# Target Dataset
y = np.array(model_df['price_change'])
x.drop(['price_change'], axis=1, inplace=True)
np.asarray(x)


# split into test & train
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)


# Create LDA model
model = LinearDiscriminantAnalysis().fit(x_train, y_train)
predictions = model.predict(x_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.89      0.81      0.85        31
         1.0       0.70      0.82      0.76        17

    accuracy                           0.81        48
   macro avg       0.80      0.81      0.80        48
weighted avg       0.82      0.81      0.82        48



# Pull data from the last hour to make prediction
---
## Data I need:
* Price by hour data for the currencies the model was trained on
* Tweets for the last hour about that currency


Below functions are responsible for:
* Sending URL Request/hitting endpoint for alphavantage (pulls live crypto price data)
* Hitting Twitter API endpoint for pulling tweets.


In [4]:
def send_request(url, headers, params, next_token=None):
    params['next_token'] = next_token
    response = requests.request('GET', url, headers=headers, params=params)
    print('Endpoint response code:' + str(response.status_code))
    if (response.status_code != 200):
        raise Exception(response.status_code, response.text)
    return response.json()

def pull_live_tweets(coin):

    # Pull tweets from the last hour
    path = r'c:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\predicted_trends'
    os.chdir(path)

    print('performing twitter search for coin:', coin)

    # 1 hour ago
    from_date = datetime.now(timezone.utc) - timedelta(hours = 1)
    to_date = datetime.now(timezone.utc) + timedelta(seconds=-30)
    
    iso_from_date = from_date.isoformat()
    iso_to_date = to_date.isoformat()

    from_date = from_date.strftime('%Y-%m-%d %H:%M:%S')
    to_date = to_date.strftime('%Y-%m-%d %H:%M:%S')

    print(f'searching {from_date} to {to_date}')
    
    bearer_token = 'AAAAAAAAAAAAAAAAAAAAAJwBbgEAAAAAyi3tWb4jDN72EZqz6dcWgOIizuc%3DsC3xrWGrxPCwiKwqy2fINUgJDs2qKaZNlITIIy75Pss1oiMeTN'

    headers = {
        "Authorization": "Bearer {}".format(bearer_token)
    }

    url = 'https://api.twitter.com/2/tweets/search/recent'

    params = {
        'query': coin,
        'start_time': iso_from_date,
        'end_time': iso_to_date,
        'max_results': 100,
        'next_token':{}
    }

    json_response = send_request(url, headers, params)
    return json_response

# Pull tweets on topic from last 30 minutes
fetched_tweets = pull_live_tweets('AVAX lang:en')
fetched_tweets_df = pd.DataFrame(fetched_tweets['data'])
fetched_tweets_df.to_csv('recently_fetched_tweets.csv')

performing twitter search for coin: AVAX lang:en
searching 2022-05-13 03:42:05 to 2022-05-13 04:41:35
Endpoint response code:200


In [5]:
# Uses AlphaVantage API with their CRYPTO_INTRADAY endpoint.

av_api_key = 'GD982KLZ6PZ69GQ0'
path = r'c:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\prices\LivePrices'
os.chdir(path)

def get_prices(coin):
    url = f'https://www.alphavantage.co/query?function=CRYPTO_INTRADAY&symbol={coin}&market=USD&interval=1min&apikey={av_api_key}&datatype=csv'
    req = requests.get(url)
    data = req.content
    csv_file = open(f'{coin}_prices.csv','wb')
    csv_file.write(data)
    csv_file.close()
    return

get_prices('AVAX')  # Get the prices from the specified coin

# format that data into a dataframe
live_prices = pd.read_csv('AVAX_prices.csv')    # read in live prices csv
kept_prices = live_prices.head(60)              # keep only the last 60 minutes.
high = kept_prices['high'].max(axis=0)       # Find the max value in the last 60 minutes
low = kept_prices['low'].min(axis=0)        # find the lowesst value in the last 60 minutes
open_price = kept_prices['open'].values[59]                 # Price from 60 minutes ago. (opening price of the last hour) 
volume = kept_prices['volume'].sum(axis=0)      # summate the total volume traded from the last hour

live_coin_data = pd.DataFrame([[open_price, high, low, volume]], columns =['open', 'high', 'low', 'volume'])

# Run textblob on tweets for polarity & subjectivity
combined_tweets = ' '.join(fetched_tweets_df['text'])

# Clean tweet so we can use textblob on it.
fetched_tweets_df['cleaned_tweet'] = fetched_tweets_df['text'].apply(lambda x: sift_tweet(str(x).lower(), stopwords))
combined_cleaned_tweets = ' '.join(fetched_tweets_df['cleaned_tweet'])

# Get sentiment values on tweets using VADER sentiment analyzer
sia = get_sentiment(combined_tweets)
compound = sia['compound']                    # Score representing sum(lexicon ratings)
pos = sia['pos']
neg = sia['neg']
neu = sia['neu']

live_coin_data.loc[live_coin_data.index[0],'compound'] = compound
live_coin_data.loc[live_coin_data.index[0],'pos'] = pos
live_coin_data.loc[live_coin_data.index[0],'neg'] = neg
live_coin_data.loc[live_coin_data.index[0],'neu'] = neu
live_coin_data.loc[live_coin_data.index[0],'polarity'] = TextBlob(combined_cleaned_tweets).sentiment[0]            
live_coin_data.loc[live_coin_data.index[0],'subjectivity'] = TextBlob(combined_cleaned_tweets).sentiment[1]

# make the prediction
model.predict(live_coin_data)
prob = model.predict_proba(live_coin_data)

print(prob[0])

[0.84372502 0.15627498]


In [29]:
print(prob[0])

[0.84209867 0.15790133]


And that's it! The above cell outputs the prediction from the model. 

* First # signifies probability of a decrease in price
* Second # signifiese probability of an increase in price