# Import Statements
---
**Important note:**
The installed TensorFlow and NumPy versions currently have conflicting dependency requirements. TODO: find (and pin) a pair of versions that is stable for both so they work together.

In [2]:

import pandas as pd                 # Pandas dataframe library
import pandas_datareader as pdr     # Pandas datareader that allows me to lookup & store live crypto prices from yahoo finance.
import numpy as np                  # Numpy
import matplotlib.pyplot as pypl    # Pyplot used to create visuals/graphics based on data 
from alpha_vantage.timeseries import TimeSeries     # Library used for pulling live price data from alphavantage api

from datetime import datetime, timedelta, timezone             # Datetime library.
import pytz
import json
import csv
import warnings
warnings.simplefilter(action='ignore', category=ResourceWarning)
warnings.filterwarnings('ignore')

import glob                         # For changing/finding proper directory
import os                           # For changing/finding proper directory (when opening files)
import requests
import twint                        # Twitter web scraping tool with more features than the regular twitter API
import nest_asyncio                 # Import required for twint usage.
nest_asyncio.apply()                

import re                           # Regex for string cleaning (used for Textblob Sentiment Analysis)
from textblob import TextBlob       # Textblob used for sentiment analysis of cleaned data.

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer    # Sentiment analysis tool that works great on determining social media sentiment.
from newsapi import NewsApiClient   # NewsApiClient lets me look up/pull news articles relating to specified topics.
import requests                     # Used for sending get requests to the NewsAPI client.

from sklearn.preprocessing import MinMaxScaler                          # Scaler used for scaling data (LSTMRNN Implementation)
from sklearn.metrics import accuracy_score, classification_report       
from sklearn.model_selection import train_test_split                    # Used for splitting data
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis    # Used for implementing SVM
import tensorflow as tf                                                 # TF used for LSTMRNN Implmentation
from keras.layers import Dense, Dropout, LSTM                           # Dense, dropout & lstm used for creating LSTMRNN 
from keras.models import Sequential                                     # Important because we're working with Sequential data.

# Load the stop-word list (one word per line) that sift_tweet() receives later.
os.chdir(r'C:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\archive')
# Read-only access is enough, and the context manager closes the handle
# instead of leaving it open for the lifetime of the kernel.
with open("stopwords.txt", "r") as stopwords_file:
    stopwords = stopwords_file.read().split('\n')

## Reading in crypto price dataset
---
The section below reads the price CSV files into pandas DataFrames and compiles the list of coin names that is used later for searching Twitter.

### What to do next:
* Retrieve Token labels from CSV file for searching by Cashtag on twitter.

In [107]:
# glob patterns are resolved against the working directory, so each price
# folder is entered before it is scanned for CSV files.
extension = 'csv'
csv_pattern = '*.{}'.format(extension)

path = r'c:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\prices\DailyPrices'
os.chdir(path)
daily_csv_files = glob.glob(csv_pattern)

path = r'c:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\prices\HourlyPrices'
os.chdir(path)
hourly_csv_files = glob.glob(csv_pattern)

# Coin names used for the Twitter search later on.
# Daily files: second "_"-separated field with its last four characters removed.
daily_coins = [file_name.split("_")[1][:-4] for file_name in daily_csv_files]
# Hourly files: first "_"-separated field.
hourly_coins = [file_name.split("_")[0] for file_name in hourly_csv_files]

# One DataFrame per hourly CSV, in the same order as hourly_coins
# (the working directory is still the HourlyPrices folder here).
hourly_coin_data = [pd.read_csv(file_name) for file_name in hourly_csv_files]


In [73]:
# Print the first hourly price DataFrame that was loaded above.
print(hourly_coin_data[0])

           unix                 date    symbol    open    high     low  \
0    1650931200  2022-04-26 00:00:00  AAVE/USD  169.23  170.68  168.38   
1    1650927600  2022-04-25 23:00:00  AAVE/USD  169.68  170.57  169.68   
2    1650924000  2022-04-25 22:00:00  AAVE/USD  168.21  169.51  168.21   
3    1650920400  2022-04-25 21:00:00  AAVE/USD  168.46  168.46  168.46   
4    1650916800  2022-04-25 20:00:00  AAVE/USD  169.83  169.90  169.83   
..          ...                  ...       ...     ...     ...     ...   
575  1648861200  2022-04-02 01:00:00  AAVE/USD  247.31  247.31  245.64   
576  1648857600  2022-04-02 00:00:00  AAVE/USD  242.29  256.74  242.29   
577  1648854000  2022-04-01 23:00:00  AAVE/USD  246.88  246.88  244.55   
578  1648850400  2022-04-01 22:00:00  AAVE/USD  250.60  250.60  246.39   
579  1648846800  2022-04-01 21:00:00  AAVE/USD  252.84  254.52  249.91   

      close  Volume AAVE    Volume USD  
0    169.48   174.216470  29526.207266  
1    170.10     1.814157    3

*NOTE:* The cell below is for reading in the Bitcoin tweets dataset from Kaggle. (https://www.kaggle.com/datasets/kaushiksuresh147/bitcoin-tweets)
This dataset has several problems, though:
* First, its tweets span 1.5 years but come from only 43 distinct days, which makes it inconsistent to use with sequential data such as the price history.
* Second, it has some values in improper columns (namely tag values in the date column), which have to be removed manually.
* Lastly, it's huge: 280k tweets. At first that seems great, but because the sample is so sparse in terms of date span, it leads to problems with implementation.

I'll leave it here in a cell in case I decide to use it later, but for now, it doesn't apply to this project.

In [None]:
#!!! BELOW IS THE LOGIC FOR READING IN THE TWEETS FROM THE BITCOIN TWEET KAGGLE DATASET !!!
# Note: This dataset kinda sucks. It has some values in the wrong columns (e.g. tag values in the date column), so a few rows are dropped by hand below.

# Logic for reading in Bitcoin tweets dataset.
# btc_tweets = pd.read_csv('../bitcoin_tweets/Bitcoin_tweets.csv')
# btc_tweets.drop([64943], axis=0, inplace=True)
# btc_tweets.drop([137068], axis=0, inplace=True)
# btc_tweets.drop([180575], axis=0, inplace=True)

# btc_tweets.drop(btc_tweets.index[100000:len(btc_tweets)], inplace=True)
# btc_tweets.drop(columns=['user_name', 'user_location', 'user_description', 'user_created', 'user_followers', 'user_friends', 'user_favourites', 'user_verified', 'source', 'is_retweet'], inplace=True)

## Scrape Twitter for data on all coins supplied by dataset
---
Below section of code searches through twitter using keywords. Uses sift_tweet() function to remove all unnecessary characters, links, emojis & words from tweets. Also uses Textblob to append polarity column to pandas df for tracking sentiment of tweets.

### What to do next:
* Search twitter based on Cashtags & Hashtags
* Configure Twint with Google Translate so that tweets in non-English languages can be translated to English. (Need to create a ticket for this in GitHub.)

In [5]:
# Function for iterating through coins list and storing findings in .csv files
def search_coins(coins, from_date='2022-04-24', to_date='2022-04-26'):
    """Scrape Twitter with twint for each coin and store the hits as CSV.

    For every entry in ``coins`` the working directory is changed to
    ``search_results/<coin>`` (created if it does not exist yet) and a twint
    search is run whose results are written to
    ``<coin>_<from_date>_<to_date>_search_result.csv`` in that folder.

    Parameters
    ----------
    coins : iterable of str
        Search terms; each one is also used as the name of its result folder.
    from_date, to_date : str
        'YYYY-MM-DD' bounds handed to twint as ``Since`` / ``Until``.
        The defaults reproduce the window that used to be hard-coded.

    Notes
    -----
    Side effect: the process working directory is left inside the folder of
    the last coin that was searched.
    """
    results_root = r'c:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\search_results'

    for coin in coins:
        coin_dir = os.path.join(results_root, coin)
        # exist_ok avoids both failure modes of the old code: chdir into a
        # missing folder, and mkdir on a folder that is already there.
        os.makedirs(coin_dir, exist_ok=True)
        os.chdir(coin_dir)

        print('performing twitter search for coin:', coin)
        print(f'searching {from_date} to {to_date}')

        c = twint.Config()
        c.Limit = 3000
        c.Lang = "en"
        c.Pandas = True
        c.Search = coin
        c.Hide_output = True
        c.Since = from_date
        c.Until = to_date
        c.Store_csv = True
        # Relative file name, so the CSV lands in the coin folder entered above.
        c.Output = coin + '_' + from_date + '_' + to_date + '_search_result.csv'
        twint.run.Search(c)


# btc_tweets.text=btc_tweets.text.astype(str)
# btc_tweets['Processed Tweet'] = btc_tweets['text'].apply(lambda x: sift_tweet(x.lower(), stopwords)) 
# btc_tweets['Polarity/Subjectivity'] = btc_tweets['Processed Tweet'].apply(lambda x: TextBlob(x).sentiment)            

# btc_tweets

# Run the Twitter search for every coin name taken from the hourly price files.
search_coins(hourly_coins)

performing twitter search for coin: AAVE
searching 2022-04-24 to 2022-04-26


CRITICAL:root:twint.run:Twint:Feed:noData'globalObjects'
sleeping for 1.0 secs


[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
performing twitter search for coin: AVAX
searching 2022-04-24 to 2022-04-26
performing twitter search for coin: BCH
searching 2022-04-24 to 2022-04-26
performing twitter search for coin: BTC
searching 2022-04-24 to 2022-04-26
performing twitter search for coin: ETH
searching 2022-04-24 to 2022-04-26


Below chunk is more data pre-processing. 
I need to modify the dataframe so that it contains both the price information, as well as all of the tweets so I can easily perform sentiment analysis on them using VADER.

The code below reads all CSV files: the price files from the hourly prices directory (already loaded earlier) as well as the tweets that were searched for and stored in the search results folder of each currency.

In [49]:
# Need to create function for cleaning the tweets so we can derive the subjectivity and polarity using textblob.
def sift_tweet(tweet, stop_words):
    """Clean a tweet so TextBlob can score its polarity and subjectivity.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Matching against ``stop_words`` is case-sensitive, so
        lower-case the tweet first if the stop-word list is lower-case.
    stop_words : iterable of str
        Words to remove from the cleaned text.

    Returns
    -------
    str
        The tweet with @usernames, links and every character that is not a
        letter, digit, space or tab replaced by a space, and with every
        whitespace-delimited word found in ``stop_words`` removed. Whitespace
        is left as-is, so the result may contain runs of spaces.
    """
    # Alternatives are tried left to right at each position: @username,
    # then any single non-alphanumeric character (emojis, punctuation),
    # then scheme://link. Every match becomes one space.
    cleaned_tweet = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet)

    # The previous loop walked the string character by character and threw
    # away the result of str.replace, so no stop word was ever removed.
    # Whole words are matched here, which also keeps "the" from eating the
    # start of "theory".
    blocked = set(stop_words)
    return re.sub(
        r"\S+",
        lambda match: "" if match.group(0) in blocked else match.group(0),
        cleaned_tweet,
    )

def get_sentiment(text):
    """Return the VADER polarity scores for ``text``.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    dict
        Whatever ``SentimentIntensityAnalyzer.polarity_scores`` returns; the
        callers in this notebook read the 'compound', 'pos', 'neg' and 'neu'
        entries.
    """
    # This function is called once per hour bucket inside nested loops, so the
    # analyzer is built on first use and reused instead of being constructed
    # again on every call.
    analyzer = getattr(get_sentiment, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        get_sentiment._analyzer = analyzer
    return analyzer.polarity_scores(text)

In [108]:
# Read every scraped tweet CSV per coin, then attach per-hour tweet text and
# sentiment scores to the matching rows of that coin's hourly price frame.
search_results_dir = r'C:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\search_results'
os.chdir(search_results_dir)
tweet_pds = []
grouped_tweets = []

# Read Tweets into a DF from the CSVs
for coin in hourly_coins:

    # Go back to the root first: the chdir into the coin folder is relative.
    os.chdir(search_results_dir)
    os.chdir(coin)
    csv_names = glob.glob('*.{}'.format(extension))
    coin_pds = []
    for file in csv_names:
        tweet_pd = pd.read_csv(file)
        # sort_values returns a new frame; the result used to be discarded.
        # A stable sort keeps the file order among rows with the same 'date'.
        tweet_pd = tweet_pd.sort_values(by='date', kind='stable')
        coin_pds.append(tweet_pd)
    tweet_pds.append(coin_pds)


#for i in range(len(tweet_pds)):
# Only the first coin is processed for now; switch to the loop above to
# process every coin.
for i in range(1):
    print('lookin at coin number:', i)
    prices = hourly_coin_data[i]          # same object, so it is updated in place
    prices['date'] = pd.to_datetime(prices['date'])
    prices['joined_tweets'] = ""
    prices['compound'] = 0.0
    prices['positive'] = 0.0
    prices['negative'] = 0.0
    prices['neutral'] = 0.0
    # Created up front (as NaN) so the later "drop rows without polarity" step
    # also works when not a single tweet matched a price row.
    prices['polarity'] = np.nan
    prices['subjectivity'] = np.nan

    price_hours = prices['date'].dt.hour
    price_days = prices['date'].dt.day

    for j in range(len(tweet_pds[i])):
        tweets = tweet_pds[i][j]
        tweets['created_at'] = pd.to_datetime(tweets['created_at'])
        tweet_hours = tweets['created_at'].dt.hour
        tweet_days = tweets['created_at'].dt.day

        # NOTE(review): buckets are keyed on day-of-month and hour only, so
        # data from different months would be merged together, and a later
        # tweet file overwrites the values an earlier file wrote for the same
        # hour -- confirm the inputs never span more than one month.
        for day in range(1, 32):          # 1..31; the old range(1, 31) skipped day 31
            for hour in range(24):
                tweet_time_mask = (tweet_hours == hour) & (tweet_days == day)

                # .copy() so the new column below is added to an independent
                # frame rather than to a view of the tweet data.
                hour_view = tweets[tweet_time_mask].copy()
                if hour_view.empty:
                    continue

                price_time_mask = (price_hours == hour) & (price_days == day)

                hour_view['cleaned_tweet'] = hour_view['tweet'].apply(lambda x: sift_tweet(str(x).lower(), stopwords))

                joined_tweets = ' '.join(hour_view['tweet'])
                joined_clean_tweets = ' '.join(hour_view['cleaned_tweet'])

                # VADER runs on the raw text, TextBlob on the cleaned text.
                SIA = get_sentiment(joined_tweets)
                compound = SIA['compound']                    # Score representing sum(lexicon ratings)
                pos = SIA['pos']
                neg = SIA['neg']
                neu = SIA['neu']

                # Scored once per bucket instead of twice per matching price row.
                blob_sentiment = TextBlob(joined_clean_tweets).sentiment
                polarity = blob_sentiment[0]
                subjectivity = blob_sentiment[1]

                index = prices[price_time_mask].index
                for ind in index:
                    prices.at[ind, 'joined_tweets'] = joined_tweets
                    prices.at[ind, 'polarity'] = polarity
                    prices.at[ind, 'subjectivity'] = subjectivity
                    prices.at[ind, 'compound'] = compound
                    prices.at[ind, 'positive'] = pos
                    prices.at[ind, 'negative'] = neg
                    prices.at[ind, 'neutral'] = neu


lookin at coin number: 1


Dropping the rows that don't contain a polarity score. The only reason they wouldn't have this would be because they didn't have any tweets stored in their row for that hour.

In [111]:
# Keep only the rows whose 'polarity' value is not NaN and store the result in slot 0.
# NOTE(review): the rows are taken from hourly_coin_data[1], but the merge cell
# above, as written, only fills hourly_coin_data[0] (range(1)). On a fresh
# Restart & Run All, slot 1 would have no 'polarity' column -- confirm which
# coin is intended here.
hourly_coin_data[0] = hourly_coin_data[1][hourly_coin_data[1]['polarity'].notna()]

(61, 16)

Iterating through remaining rows and appending a price change label. This label signifies whether or not in that hour the price of the coin went up or down. This is what the model is going to be responsible for predicting.

In [112]:
# Add the target label to every coin frame:
#   0.0 -> the hour closed below its open (open > close)
#   1.0 -> otherwise (price rose or was unchanged)
for i in range(len(hourly_coin_data)):
    # reset_index returns a new frame; the old call discarded it, so the index
    # was never reset. drop=True avoids adding the old index as a column, and
    # working on the returned copy keeps filtered slices from being mutated.
    coin_df = hourly_coin_data[i].reset_index(drop=True)
    # Vectorised replacement for the row-by-row iterrows loop. Float labels
    # match the dtype the old NaN-initialised column ended up with.
    coin_df['price_change'] = np.where(coin_df['open'] > coin_df['close'], 0.0, 1.0)
    hourly_coin_data[i] = coin_df


In [131]:
# Replace slot 0 with the frame stored in model_df_0.csv and show its shape.
# NOTE(review): the path is relative, so this depends on the working directory
# left by an earlier os.chdir -- confirm it is the folder that holds the file.
hourly_coin_data[0] = pd.read_csv('model_df_0.csv')
hourly_coin_data[0].shape

(205, 12)

## Create and Train Neural Net on Dataset (Attempt 1)
---


### What to do next:
* Get more data and keep training.
* Try SVM implementation when enough data is gathered.

## Proper Implementation: LDA With Sentiment Analysis
---
Yeah the last one wasn't good. This one is ight.

# IMPORTANT:
---
* If you're running each cell in the jupyter notebook, you only need to run the below code cell. 

* If you're going to try to use the exported model_df_#.csv files that are saved in the hourly_coin_data directory, you need to run the 2nd cell below.

In [133]:
# These are all the columns we actually want to keep for the purposes of training & using the model.
# 'price_change' is the target; everything else is a feature.
model_cols = ['open', 'high', 'low', 'Volume USD', 'compound', 'positive', 'negative', 'neutral', 'polarity', 'subjectivity', 'price_change']
os.chdir(r'C:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\hourly_coin_data')

for i in range(1):
# for i in range(len(hourly_coin_data)):

    # Independent copy: nothing below mutates the frame in hourly_coin_data.
    model_df = hourly_coin_data[i][model_cols].copy()
    # The index column is written under the header 'drop_this', which is the
    # column the "load from CSV" cell removes after reading this file back.
    model_df.to_csv(f'model_df_{i}.csv', index_label='drop_this')

    # Feature Dataset (a new frame, so model_df keeps its target column)
    x = model_df.drop(columns=['price_change'])
    # Target Dataset
    y = np.array(model_df['price_change'])

    print(x)

    # split into test & train (fixed seed so the split is reproducible)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

    # Fit the Linear Discriminant Analysis classifier and report on the held-out 20%
    model = LinearDiscriminantAnalysis().fit(x_train, y_train)
    predictions = model.predict(x_test)
    print(classification_report(y_test, predictions))



       open    high     low     Volume USD  compound  positive  negative  \
0    169.31  169.78  166.84    4483.910281    0.9920     0.107     0.068   
1    170.40  170.70  168.83    3966.333567    0.9164     0.098     0.080   
2    169.27  170.87  167.84   14345.829974    0.9834     0.069     0.034   
3    171.35  171.35  171.35       0.000000   -0.9548     0.080     0.084   
4    171.56  172.03  171.35    2540.947375    0.9974     0.110     0.058   
..      ...     ...     ...            ...       ...       ...       ...   
200  199.76  199.97  196.41  103620.568202    0.0000     0.000     0.000   
201  196.94  197.98  193.62   86537.507634    0.0000     0.000     0.000   
202  193.67  193.83  193.21   11150.428138    0.0000     0.000     0.000   
203  226.39  229.02  226.14     574.700572   -0.8137     0.061     0.069   
204  236.79  236.79  235.36     419.296800    0.0000     0.000     0.000   

     neutral  polarity  subjectivity  
0      0.825 -0.067892      0.365060  
1      0.

In [138]:
# These are all the columns we actually want to keep for the purposes of training & using the model.
model_cols = ['open', 'high', 'low', 'Volume USD', 'compound', 'positive', 'negative', 'neutral', 'polarity', 'subjectivity', 'price_change']
os.chdir(r'C:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\hourly_coin_data')

# Load the exported frame and discard the saved index column.
model_df = pd.read_csv('model_df_0.csv').drop(columns=['drop_this'])

# Feature Dataset. drop() returns a new frame; previously x was just another
# name for model_df, so dropping the target in place removed it from model_df too.
x = model_df.drop(columns=['price_change'])
# Target Dataset
y = np.array(model_df['price_change'])


# split into test & train (fixed seed so the split is reproducible)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)


# Fit the Linear Discriminant Analysis classifier and report on the held-out 20%
model = LinearDiscriminantAnalysis().fit(x_train, y_train)
predictions = model.predict(x_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       1.00      0.64      0.78        11
         1.0       0.88      1.00      0.94        30

    accuracy                           0.90        41
   macro avg       0.94      0.82      0.86        41
weighted avg       0.91      0.90      0.89        41



# Pull data from the last hour to make prediction


In [139]:
def send_request(url, headers, params, next_token=None):
    """Send a GET request and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Endpoint to call.
    headers : dict
        HTTP headers (e.g. the Authorization header).
    params : dict
        Query parameters. The dict is not modified.
    next_token : str, optional
        Value sent as the 'next_token' query parameter; it replaces any
        'next_token' entry already present in ``params``.

    Returns
    -------
    The result of ``response.json()``.

    Raises
    ------
    Exception
        With ``(status_code, response_text)`` as args when the response
        status is not 200.
    """
    # Work on a copy: the old code wrote next_token into the caller's dict,
    # so a token from one call leaked into whatever the caller did next.
    query = dict(params)
    query['next_token'] = next_token
    response = requests.request('GET', url, headers=headers, params=query)
    print('Endpoint response code:' + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def pull_live_tweets(coin):
    """Fetch up to 100 tweets from roughly the last hour for a search query.

    Calls the Twitter API v2 recent-search endpoint through send_request().

    Parameters
    ----------
    coin : str
        Query string sent to the endpoint (e.g. 'AVAX lang:en').

    Returns
    -------
    The decoded JSON response returned by send_request().

    Raises
    ------
    RuntimeError
        If the TWITTER_BEARER_TOKEN environment variable is not set.
    Exception
        Propagated from send_request() when the endpoint does not answer 200.

    Notes
    -----
    Side effect: the working directory is changed to the predicted_trends
    folder.
    """
    path = r'c:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\predicted_trends'
    os.chdir(path)
    #os.chdir(coin)

    print('performing twitter search for coin:', coin)

    # Sample the clock once so both bounds are derived from the same instant.
    # The window ends 30 seconds in the past rather than at "now".
    now = datetime.now(timezone.utc)
    from_date = now - timedelta(hours=1)
    to_date = now - timedelta(seconds=30)

    iso_from_date = from_date.isoformat()
    iso_to_date = to_date.isoformat()

    print(f"searching {from_date.strftime('%Y-%m-%d %H:%M:%S')} to {to_date.strftime('%Y-%m-%d %H:%M:%S')}")

    # Credentials must not live in the notebook: read the token from the
    # environment. A token that was ever committed should be revoked.
    bearer_token = os.environ.get('TWITTER_BEARER_TOKEN')
    if not bearer_token:
        raise RuntimeError('Set the TWITTER_BEARER_TOKEN environment variable before pulling tweets.')

    headers = {
        "Authorization": "Bearer {}".format(bearer_token)
    }

    url = 'https://api.twitter.com/2/tweets/search/recent'

    # 'next_token' is not listed here: send_request() sets it itself.
    params = {
        'query': coin,
        'start_time': iso_from_date,
        'end_time': iso_to_date,
        'max_results': 100,
    }

    json_response = send_request(url, headers, params)
    return json_response

# Pull English-language AVAX tweets from the last hour (the window used by
# pull_live_tweets) and save them as CSV in the current working directory.
# fetched_tweets['data'] is the part of the JSON response turned into a frame.
fetched_tweets = pull_live_tweets('AVAX lang:en')
fetched_tweets_df = pd.DataFrame(fetched_tweets['data'])
fetched_tweets_df.to_csv('recently_fetched_tweets.csv')





performing twitter search for coin: AVAX lang:en
searching 2022-04-26 22:52:52 to 2022-04-26 23:52:22
Endpoint response code:200


In [144]:
# Pull minute-level price data for the current hour.
# Uses AlphaVantage API with their CRYPTO_INTRADAY endpoint.

# The API key is read from the environment so it is never stored in the
# notebook; a key that was ever committed should be rotated.
av_api_key = os.environ['ALPHAVANTAGE_API_KEY']
path = r'c:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\prices\LivePrices'
os.chdir(path)

def get_prices(coin):
    """Download 1-minute USD price data for ``coin`` and save it as CSV.

    Requests the AlphaVantage CRYPTO_INTRADAY endpoint (CSV datatype) using
    the module-level ``av_api_key`` and writes the response body to
    ``<coin>_prices.csv`` in the current working directory.

    Parameters
    ----------
    coin : str
        Symbol placed in the request URL and in the output file name.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status; no file is written
        in that case.
    """
    url = f'https://www.alphavantage.co/query?function=CRYPTO_INTRADAY&symbol={coin}&market=USD&interval=1min&apikey={av_api_key}&datatype=csv'
    req = requests.get(url)
    # Fail loudly instead of saving an error page as if it were price data.
    req.raise_for_status()
    # The context manager closes the file even if the write fails.
    with open(f'{coin}_prices.csv', 'wb') as csv_file:
        csv_file.write(req.content)
    return

get_prices('AVAX')  # Download the latest minute bars into AVAX_prices.csv

# Collapse the minute bars into a single one-hour row.
live_prices = pd.read_csv('AVAX_prices.csv')    # read in live prices csv
# NOTE(review): head(60) is the last hour only if the file is ordered
# newest-first -- confirm against the downloaded CSV.
kept_prices = live_prices.head(60)
high = kept_prices['high'].max()                # highest value among the kept rows
low = kept_prices['low'].min()                  # lowest value among the kept rows
# Last kept row = opening price of the hour. iloc[-1] equals values[59] when
# 60 rows are present and no longer raises IndexError when fewer came back.
open_price = kept_prices['open'].iloc[-1]
volume = kept_prices['volume'].sum()            # total volume over the kept rows

# VADER is run on the raw tweets, TextBlob on the cleaned tweets.
combined_tweets = ' '.join(fetched_tweets_df['text'])

# Clean tweet so we can use textblob on it.
fetched_tweets_df['cleaned_tweet'] = fetched_tweets_df['text'].apply(lambda x: sift_tweet(str(x).lower(), stopwords))
combined_cleaned_tweets = ' '.join(fetched_tweets_df['cleaned_tweet'])

# Get sentiment values on tweets using VADER sentiment analyzer
sia = get_sentiment(combined_tweets)
compound = sia['compound']                    # Score representing sum(lexicon ratings)
pos = sia['pos']
neg = sia['neg']
neu = sia['neu']

# Score the cleaned text once and reuse both components.
blob_sentiment = TextBlob(combined_cleaned_tweets).sentiment

# Column names and order must match the features the model was fitted on
# ('Volume USD', 'positive', 'negative', 'neutral'); the previous names
# ('volume', 'pos', 'neg', 'neu') did not.
# NOTE(review): the training 'Volume USD' column is USD-denominated; confirm
# that the AlphaVantage 'volume' column uses the same unit.
live_coin_data = pd.DataFrame([{
    'open': open_price,
    'high': high,
    'low': low,
    'Volume USD': volume,
    'compound': compound,
    'positive': pos,
    'negative': neg,
    'neutral': neu,
    'polarity': blob_sentiment[0],
    'subjectivity': blob_sentiment[1],
}])

# make the prediction
prediction = model.predict(live_coin_data)      # predicted class label for the hour
prob = model.predict_proba(live_coin_data)      # class probabilities, one row per sample

print(prob[0])

[0.00834971 0.99165029]


And that's it! The above cell outputs the prediction from the model. 

* The first number is the probability of a decrease in price (label 0).
* The second number is the probability of an increase in price (label 1, which also covers an unchanged price).