# Import Statements
---
**Important note:**
The installed TensorFlow and NumPy versions have conflicting dependency requirements. TODO: find and pin a pair of versions that work together.

In [1]:

import pandas as pd                 # Pandas dataframe library
import pandas_datareader as pdr     # Pandas datareader that allows me to lookup & store live crypto prices from yahoo finance.
import numpy as np                  # Numpy
import matplotlib.pyplot as pypl    # Pyplot used to create visuals/graphics based on data 
import datetime as dt               # Datetime library.
import time

import glob                         # For changing/finding proper directory
import os                           # For changing/finding proper directory (when opening files)

import twint                        # Twitter web scraping tool with more features than the regular twitter API
import nest_asyncio                 # Import required for twint usage.
nest_asyncio.apply()                

import re                           # Regex for string cleaning (used for Textblob Sentiment Analysis)
from textblob import TextBlob       # Textblob used for sentiment analysis of cleaned data.

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer    # Sentiment analysis tool that works great on determining social media sentiment.
from newsapi import NewsApiClient   # NewsApiClient lets me look up/pull news articles relating to specified topics.
import requests                     # Used for sending get requests to the NewsAPI client.

from sklearn.preprocessing import MinMaxScaler                          # Scaler used for scaling data (LSTM RNN implementation)
from sklearn.metrics import accuracy_score, classification_report       # Classification metrics for evaluating model predictions
from sklearn.model_selection import train_test_split                    # Used for splitting data
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis    # Linear discriminant analysis classifier. NOTE(review): this is LDA, not an SVM (SVMs live in sklearn.svm) -- confirm which model is intended.
import tensorflow as tf                                                 # TF used for the LSTM RNN implementation
from keras.layers import Dense, Dropout, LSTM                           # Dense, Dropout & LSTM layers used for building the LSTM RNN
from keras.models import Sequential                                     # Keras model built as a linear stack of layers (the name refers to the layer stack, not to sequential data).

## Reading in crypto price dataset
---
Section below reads csv files into pandas dataframes for interacting with. Also compiles list of coin names for twitter searching.

### What to do next:
* Retrieve Token labels from CSV file for searching by Cashtag on twitter.

In [133]:
# Folders holding the price CSVs (hard-coded, machine-specific locations).
daily_dir = r'c:\Users\Brand\OneDrive\Documents\GitHub\CryptoPredictionTool\prices\DailyPrices'
hourly_dir = r'c:\Users\Brand\OneDrive\Documents\GitHub\CryptoPredictionTool\prices\HourlyPrices'
extension = 'csv'


def _list_csv_names(folder, ext):
    """Return the sorted bare file names of every `*.ext` file directly inside `folder`."""
    pattern = os.path.join(folder, '*.{}'.format(ext))
    # Sorting makes the coin order (and therefore every positional index used
    # later, e.g. hourly_coin_data[0]) deterministic; glob order is not guaranteed.
    return sorted(os.path.basename(csv_path) for csv_path in glob.glob(pattern))


# Files are located through absolute patterns instead of os.chdir(), so this cell
# no longer depends on -- or changes -- the kernel's working directory.
daily_csv_files = _list_csv_names(daily_dir, extension)
hourly_csv_files = _list_csv_names(hourly_dir, extension)

# Compile list of all coin names for searching on twitter later.
# Daily files carry the coin name as the second "_"-separated field of the file
# stem; hourly files carry it as the first field.
daily_coins = [os.path.splitext(name)[0].split("_")[1] for name in daily_csv_files]
hourly_coins = [name.split("_")[0] for name in hourly_csv_files]

# Compile list of pandas dataframes for use later; hourly_coin_data[i] holds the
# prices for hourly_coins[i].
hourly_coin_data = [pd.read_csv(os.path.join(hourly_dir, name)) for name in hourly_csv_files]

hourly_coin_data[0].head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume AAVE,Volume USD
0,1649721600,2022-04-12 00:00:00,AAVE/USD,157.76,159.28,157.76,159.14,73.351666,11673.184137
1,1649718000,2022-04-11 23:00:00,AAVE/USD,160.55,160.55,157.5,157.68,112.105089,17676.73043
2,1649714400,2022-04-11 22:00:00,AAVE/USD,156.71,160.42,155.77,159.11,24.787057,3943.86867
3,1649710800,2022-04-11 21:00:00,AAVE/USD,160.7,160.84,157.81,157.83,310.394127,48989.50499
4,1649707200,2022-04-11 20:00:00,AAVE/USD,162.31,162.34,160.04,162.34,272.049191,44164.465641


*NOTE:* The cell below is for reading in the Bitcoin tweets dataset from Kaggle. (https://www.kaggle.com/datasets/kaushiksuresh147/bitcoin-tweets)
This dataset is a poor fit for this project, for a few reasons:
* Firstly, its tweets span 1.5 years but come from only 43 distinct days, which makes it inconsistent to use alongside sequential data like the price history.
* Secondly, it has some values in improper columns (namely tag values in the date column), which have to be removed manually.
* Lastly, it's huge: 280k tweets. At first that seems great, but because the sample is so sparse in terms of date span, it leads to problems with implementation.

I'll leave it here in a cell in case I decide to use it later, but for now, it doesn't apply to this project.

In [None]:
#!!! BELOW IS THE LOGIC FOR READING IN THE TWEETS FROM THE BITCOIN TWEET KAGGLE DATASET !!!
# Note: This dataset kinda sucks. It has some values in improper columns (e.g. tag values in the date column).

# Logic for reading in Bitcoin tweets dataset.
# btc_tweets = pd.read_csv('../bitcoin_tweets/Bitcoin_tweets.csv')
# btc_tweets.drop([64943], axis=0, inplace=True)
# btc_tweets.drop([137068], axis=0, inplace=True)
# btc_tweets.drop([180575], axis=0, inplace=True)

# btc_tweets.drop(btc_tweets.index[100000:len(btc_tweets)], inplace=True)
# btc_tweets.drop(columns=['user_name', 'user_location', 'user_description', 'user_created', 'user_followers', 'user_friends', 'user_favourites', 'user_verified', 'source', 'is_retweet'], inplace=True)

In [81]:
# Create one sub-folder per coin under the search-results root.
# NOTE(review): this root sits under a different user profile than the
# search_results path used by the other cells -- confirm which one is intended.
path = r'c:\Users\WaKaBurd\Documents\GitHub\CryptoPredictionTool\search_results'
for coin in hourly_coins:
    # makedirs(exist_ok=True) keeps the cell re-runnable (os.mkdir raised
    # FileExistsError on a second run) and also creates the root if it is missing.
    # Building the full path avoids changing the working directory on every pass.
    os.makedirs(os.path.join(path, coin), exist_ok=True)

## Scrape Twitter for data on all coins supplied by dataset
---
Below section of code searches through twitter using keywords. Uses sift_tweet() function to remove all unnecessary characters, links, emojis & words from tweets. Also uses Textblob to append polarity column to pandas df for tracking sentiment of tweets.

### What to do next:
* Search twitter based on Cashtags & Hashtags
* Configure Twint with Google Translate so I can translate tweets from non-English languages to English. (Need to create a ticket for this in GitHub.)

In [58]:
# Move the kernel's working directory to the project's archive folder.
# NOTE(review): hard-coded, machine-specific path; search_coins() below switches
# to its own absolute folder before writing, so confirm this chdir is still needed.
os.chdir(r'C:\Users\Brand\OneDrive\Documents\GitHub\CryptoPredictionTool\archive')

# Need to create function for cleaning the tweets.
def sift_tweet(tweet, stop_words):
    """Strip noise from a tweet and drop stop words.

    Every @mention, URL, and character that is not an ASCII letter, digit,
    space or tab is replaced with a space; the text is then split on whitespace
    and every word found in `stop_words` is removed.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Matching against `stop_words` is case-sensitive, so
        lower-case the tweet first if the stop words are lower-case.
    stop_words : iterable of str or None
        Words to remove. None or an empty iterable removes nothing.

    Returns
    -------
    str
        The remaining words joined by single spaces (whitespace is normalised).
    """
    # Raw string: the pattern is unchanged, but "\w", "\/" and "\S" are no longer
    # invalid escape sequences in a normal string literal.
    cleaned_tweet = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet)

    # The previous loop walked the string character by character and discarded
    # the result of str.replace(), so no stop word was ever removed.
    stop_set = set(stop_words) if stop_words else set()
    return " ".join(word for word in cleaned_tweet.split() if word not in stop_set)

# Function for iterating through coins list and storing findings in .csv files
def search_coins(coins):
    """Run a Twint search for each coin and store the results as CSV.

    For every coin, the working directory is switched to
    `<search_results>/<coin>` (created if missing) and Twint writes
    `<coin>_<from>_<to>_search_result.csv` there.

    Parameters
    ----------
    coins : list of str
        Search terms, one per coin. Must be aligned index-for-index with the
        module-level `hourly_coin_data` list (as `hourly_coins` is).

    Notes
    -----
    The search window is currently fixed to 2022-04-10 .. 2022-04-12.
    TODO: search per hourly window taken from the price data
    (from_date = hours[i + 1], to_date = hours[i]), skip coins for which Twint
    returns no tweets, then add the 'Processed Tweet' (sift_tweet) and
    'Polarity' (TextBlob) columns before saving.
    """
    results_root = r'c:\Users\Brand\OneDrive\Documents\GitHub\CryptoPredictionTool\search_results'
    from_date = '2022-04-10'
    to_date = '2022-04-12'

    # enumerate() keeps the price frame in step with the coin; the previous
    # manual counter was never incremented, so every coin read frame 0.
    for coin_index, coin in enumerate(coins):
        coin_dir = os.path.join(results_root, coin)
        # Create the folder on demand instead of failing when it is missing.
        os.makedirs(coin_dir, exist_ok=True)
        os.chdir(coin_dir)

        print('performing twitter search for coin:', coin)

        # Hours covered by this coin's price data. As before, a search is only
        # run when at least two timestamps (one from/to window) are available.
        hours = list(hourly_coin_data[coin_index]['date'])
        if len(hours) < 2:
            continue

        print(f'searching {from_date} to {to_date}')

        c = twint.Config()
        c.Limit = 3000
        c.Lang = "en"
        c.Pandas = True
        c.Search = coin
        c.Hide_output = True
        c.Since = from_date
        c.Until = to_date
        c.Store_csv = True
        c.Output = coin + '_' + from_date + '_' + to_date + '_search_result.csv'
        twint.run.Search(c)

# btc_tweets.text=btc_tweets.text.astype(str)
# btc_tweets['Processed Tweet'] = btc_tweets['text'].apply(lambda x: sift_tweet(x.lower(), stopwords)) 
# btc_tweets['Polarity/Subjectivity'] = btc_tweets['Processed Tweet'].apply(lambda x: TextBlob(x).sentiment)            

# btc_tweets

# Run the Twitter search for every coin that has an hourly price file.
search_coins(hourly_coins)

performing twitter search for coin: AAVE
searching 2022-04-10 to 2022-04-12
performing twitter search for coin: AVAX
searching 2022-04-10 to 2022-04-12
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
performing twitter search for coin: BCH
searching 2022-04-10 to 2022-04-12
performing twitter search for coin: BTC
searching 2022-04-10 to 2022-04-12
performing twitter search for coin: ETH
searching 2022-04-10 to 2022-04-12


In [148]:

results_root = r'C:\Users\Brand\OneDrive\Documents\GitHub\CryptoPredictionTool\search_results'
tweet_pds = []
# Kept defined for compatibility with other cells; nothing in this cell fills them.
grouped_tweets = []
time_and_tweets = []

# Read every saved search-result CSV: tweet_pds[i] is the list of tweet frames
# for hourly_coins[i]. Absolute paths are used so the working directory is untouched.
for coin in hourly_coins:
    csv_pattern = os.path.join(results_root, coin, '*.{}'.format(extension))
    # sort_values() returns a new frame; the previous code discarded that result,
    # so the frames were never actually sorted.
    coin_pds = [
        pd.read_csv(csv_path).sort_values(by='date')
        for csv_path in sorted(glob.glob(csv_pattern))
    ]
    tweet_pds.append(coin_pds)

# Only the first result file of the first coin is joined to its price frame for now.
tweets = tweet_pds[0][0]
prices = hourly_coin_data[0]
tweets['date'] = pd.to_datetime(tweets['date'])
prices['date'] = pd.to_datetime(prices['date'])

# Concatenate all tweets that fall in the same clock hour, keyed by the full
# timestamp floored to the hour. The previous nested day/hour loops compared only
# day-of-month and hour-of-day (so different months/years were conflated) and
# only looked at days 1-14.
# NOTE(review): assumes both 'date' columns are timezone-naive (or share a
# timezone); otherwise the keys will not line up -- confirm against the CSVs.
tweets_by_hour = tweets.groupby(tweets['date'].dt.floor('60min'))['tweet'].agg(' '.join)
prices['joined_tweets'] = prices['date'].dt.floor('60min').map(tweets_by_hour)

# Keep only the price rows that have at least one tweet in their hour.
hourly_coin_data[0] = prices[prices['joined_tweets'].notna()]

print(len(hourly_coin_data[0]))

48


In [101]:
# Preview the first rows of the first coin's hourly price frame.
# NOTE(review): the saved output has no joined_tweets column, so it appears to
# predate the tweet-joining cell -- re-run to refresh it.
hourly_coin_data[0].head()

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume AAVE,Volume USD
0,1649721600,2022-04-12 00:00:00,AAVE/USD,157.76,159.28,157.76,159.14,73.351666,11673.184137
1,1649718000,2022-04-11 23:00:00,AAVE/USD,160.55,160.55,157.5,157.68,112.105089,17676.73043
2,1649714400,2022-04-11 22:00:00,AAVE/USD,156.71,160.42,155.77,159.11,24.787057,3943.86867
3,1649710800,2022-04-11 21:00:00,AAVE/USD,160.7,160.84,157.81,157.83,310.394127,48989.50499
4,1649707200,2022-04-11 20:00:00,AAVE/USD,162.31,162.34,160.04,162.34,272.049191,44164.465641


## Create and Train Neural Net on Dataset (Attempt 1)
---


### What to do next:
* Probably attempt it differently. Outcomes are horrid.

## Proper Implementation: SVM With Sentiment Analysis
---
Yeah the last one wasn't good.

In [133]:
# btc_tweets['Compound'] = compound
# btc_tweets['Positive'] = pos
# btc_tweets['Negative'] = neg
# btc_tweets['Neutral'] = neu

# These are all the columns we actually want to keep for the purposes of training & using the model.
# NOTE(review): model_cols is not referenced again in the cells shown here.
model_cols = ['Label','Date', 'High', 'Low', 'Open', 'Close', 'Volume', 'Compound', 'Positive', 'Negative', 'Neutral']

# Parse the date columns into pandas datetimes.
# NOTE(review): neither btc_prices nor btc_tweets is defined by any active cell
# shown here (the Kaggle loading cell is commented out) -- presumably they come
# from an earlier kernel session; confirm before relying on Restart & Run All.
btc_prices['Date'] = pd.to_datetime(btc_prices['Date'])
#btc_prices['Date'] = btc_prices['Date'].dt.date
btc_tweets['date'] = pd.to_datetime(btc_tweets['date'])
#btc_tweets['date'] = btc_tweets['date'].dt.date

#Create function to get Sentiment Scores
# Shared VADER analyzer, created on first use so it is built once rather than
# once per tweet.
_sia_analyzer = None


def getSIA(text):
    """Return the VADER polarity scores for `text`.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    dict
        The result of `SentimentIntensityAnalyzer.polarity_scores`; callers read
        its 'compound', 'pos', 'neg' and 'neu' entries.
    """
    global _sia_analyzer
    if _sia_analyzer is None:
        _sia_analyzer = SentimentIntensityAnalyzer()
    return _sia_analyzer.polarity_scores(text)

# Get sentiment scores for each tweet.
# Iterate over the column's values rather than btc_tweets['text'][i]: that form
# is a label lookup and raises KeyError as soon as the index has gaps (e.g.
# after rows were dropped by label).
sentiment_scores = [getSIA(text) for text in btc_tweets['text']]

compound = [scores['compound'] for scores in sentiment_scores]   # Score representing sum(lexicon ratings)
pos = [scores['pos'] for scores in sentiment_scores]
neg = [scores['neg'] for scores in sentiment_scores]
neu = [scores['neu'] for scores in sentiment_scores]

# Lists are assigned positionally, so each row gets the scores of its own tweet.
btc_tweets['Compound'] = compound
btc_tweets['Positive'] = pos
btc_tweets['Negative'] = neg
btc_tweets['Neutral'] = neu

In [231]:
# Show any rows whose 'date' field holds this stray hashtag list instead of a
# date (one of the malformed values found in the Kaggle dataset).
print(btc_tweets.loc[btc_tweets['date'] == '[\'YieldFarming\', \'Airdrop\', \'Binance\', \'Bitcoin\', \'pancakeswap\', \'BNB\', \'cryptocurrency\', \'DeFi\', \'BTC\', \'BinanceSmartChain\', \'BSC\', \'pufferswap\', \'DeFi\', \'bsc\', \'bnb\', \'bitcoin\', \'cryto\', \'Airdrop\', \'Airdrop\']'])
#btc_tweets.drop([64943], axis=0, inplace=True)
#btc_tweets.drop([137068], axis=0, inplace=True)
#btc_tweets.drop([180575], axis=0, inplace=True)

# Reduce the timestamps to calendar dates and order the tweets chronologically.
btc_tweets['date'] = pd.to_datetime(btc_tweets['date']).dt.date
btc_tweets = btc_tweets.sort_values(by='date', ascending=True)

# List every distinct date in the dataset, then how many there are.
# Note: an unparseable/missing date shows up as NaT and is included in the count.
dataset_dates = btc_tweets['date'].unique()
for date in dataset_dates:
    print(date)

num_days = len(dataset_dates)
print(num_days)

# The datetime module is imported as `dt` in this notebook; the bare name
# `datetime` is not defined and raised NameError here.
split_date = dt.date(2021, 2, 10)

# train = btc_tweets.loc[btc_tweets['date'] == split_date]
# test = btc_tweets.loc[btc_tweets['date'] >= split_date]


#print(train)

Empty DataFrame
Columns: [date, text, hashtags]
Index: []
2021-02-05
2021-02-06
2021-02-07
2021-02-08
2021-02-09
2021-02-10
2021-02-13
2021-02-14
2021-02-15
2021-02-18
2021-02-19
2021-02-22
2021-02-28
2021-03-11
2021-03-12
2021-04-05
2021-04-06
2021-04-07
2021-04-08
2021-04-09
2021-04-10
2021-04-11
2021-04-12
2021-04-17
2021-04-18
2021-04-19
2021-04-20
2021-04-21
2021-04-22
2021-04-23
2021-04-24
2021-05-25
2021-05-26
2021-05-27
2021-05-28
2021-05-29
2021-06-20
2021-06-21
2021-06-22
2021-06-23
2021-07-04
2021-07-05
NaT
43


# Newsapi example function call
---
* This will be used in the case that I decide to try to integrate news posts for sentiment analysis.

In [255]:
# Example request against NewsAPI's "everything" endpoint.
# The API key is read from the environment so it is never stored in the notebook.
# NOTE(review): the key that was previously committed here should be revoked.
newsapi_key = os.environ.get('NEWSAPI_KEY')
if not newsapi_key:
    raise RuntimeError('Set the NEWSAPI_KEY environment variable before running this cell.')

url = 'https://newsapi.org/v2/everything?'
parameters = {
    'q': 'bitcoin',
    'from': '2022-03-12',
    'to': '2022-04-11',
    'sortBy': 'popularity',
    'apiKey': newsapi_key
}

# A timeout stops the cell from hanging forever; raise_for_status() surfaces
# HTTP errors (bad key, rate limit) instead of building a frame from an error payload.
response = requests.get(url, params=parameters, timeout=30)
response.raise_for_status()
headlines = pd.DataFrame(response.json())

headlines.head(3)


Unnamed: 0,status,totalResults,articles
0,ok,11728,"{'source': {'id': 'engadget', 'name': 'Engadge..."
1,ok,11728,"{'source': {'id': 'engadget', 'name': 'Engadge..."
2,ok,11728,"{'source': {'id': 'wired', 'name': 'Wired'}, '..."
