## Fetch Twitter data from APIs

### Attempt twitter fetch APIs

In [9]:
# Data fetcher APIs
import tweepy
import GetOldTweets3 as got

import pandas as pd
import dask.dataframe as dd
import dask

import numpy as np
import matplotlib.pyplot as plt

import re

### Attempt Twitter API

In [31]:
twitter_credentials = pd.read_csv('twitter_keys.csv')
consumer_key = twitter_credentials['API_KEY'][0]
consumer_secret = twitter_credentials['API_KEY_SECRET'][0]
bearer_token = twitter_credentials['BEARER_TOKEN'][0]
access_token = twitter_credentials['ACCESS_TOKEN'][0]
access_token_secret = twitter_credentials['ACCESS_TOKEN_SECRET'][0]

In [32]:
auth = tweepy.OAuth2BearerHandler(bearer_token)
api = tweepy.API(auth)

In [41]:
# Filter out retweets
search_term = '#bitcoin -filter:retweets'

tweets = tweepy.Cursor(
    api.search_tweets,
    q=search_term,
    lang='en',
#     since='2018-01-01',
    tweet_mode='extended'
).items(2000)

<tweepy.cursor.ItemIterator at 0x1f53eddfc08>

In [42]:
all_tweets = [tweet.full_text for tweet in tweets]

In [44]:
len(all_tweets)

2000

**Cannot use Twitter API as the tweets are limited to only the past 7 days**

### Attempt GOT package

In [72]:
tweetCriteria = got.manager.TweetCriteria().setQuerySearch(search_term)\
                                           .setSince("2019-03-29")\

tweet = got.manager.TweetManager.getTweets(tweetCriteria)[0]

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



An error occured during an HTTP request: HTTP Error 403: Forbidden
Try to open in browser: https://twitter.com/search?q=%23bitcoin%20-filter%3Aretweets%20since%3A2019-03-29&src=typd
Traceback (most recent call last):
  File "c:\users\acer\appdata\local\programs\python\python37\lib\site-packages\GetOldTweets3\manager\TweetManager.py", line 343, in getJsonResponse
    response = opener.open(url)
  File "c:\users\acer\appdata\local\programs\python\python37\lib\urllib\request.py", line 531, in open
    response = meth(req, response)
  File "c:\users\acer\appdata\local\programs\python\python37\lib\urllib\request.py", line 641, in http_response
    'http', request, response, code, msg, hdrs)
  File "c:\users\acer\appdata\local\programs\python\python37\lib\urllib\request.py", line 569, in error
    return self._call_chain(*args)
  File "c:\users\acer\appdata\local\programs\python\python37\lib\urllib\request.py", line 503, in _call_chain
    result = func(*args)
  File "c:\users\acer\appdata\l

TypeError: object of type 'NoneType' has no len()

Also rendered useless due to the Twitter restrictions

### Kaggle dataset

* [data small](https://www.kaggle.com/datasets/kaushiksuresh147/bitcoin-tweets)
  * Tweets of feb 5th 2021 - Oct 15th 2022 (inclusive, but not all included)
* [data large](https://www.kaggle.com/datasets/alaix14/bitcoin-tweets-20160101-to-20190329)
  * Tweets of jan 2nd 2014 - nov 23rd 2019 (inclusive)

*Attempt pandas*

In [2]:
df_small = pd.read_csv('../../data/Bitcoin_tweets.csv')

KeyboardInterrupt: 

Pandas takes so long and errors out

*Attempt dask*

In [14]:
df_small = dd.read_csv(
    '../../data/Bitcoin_tweets.csv',
    usecols=['user_name', 'user_description', 'date', 'text', 'hashtags']
)

In [15]:
df_small.head()

Unnamed: 0,user_name,user_description,date,text,hashtags
0,DeSota Wilson,"Biz Consultant, real estate, fintech, startups...",2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after #b...,['bitcoin']
1,CryptoND,😎 BITCOINLIVE is a Dutch platform aimed at inf...,2021-02-10 23:58:48,"😎 Today, that's this #Thursday, we will do a ""...","['Thursday', 'Btc', 'wallet', 'security']"
2,Tdlmatias,"IM Academy : The best #forex, #SelfEducation, ...",2021-02-10 23:54:48,"Guys evening, I have read this article about B...",
3,Crypto is the future,I will post a lot of buying signals for BTC tr...,2021-02-10 23:54:33,$BTC A big chance in a billion! Price: \487264...,"['Bitcoin', 'FX', 'BTC', 'crypto']"
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Co-founder @RENJERJerky | Forbes 30Under30 | I...,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...,['BTC']


Was able to read the small dataset, attempt large

In [55]:
df_large = dd.read_csv(
    '../../data/Bitcoin_tweets_large.csv',
    delimiter=';',
    usecols=['user', 'fullname', 'timestamp', 'text'],
    engine='python',
    on_bad_lines='skip'
)

In [56]:
df_large.head()

Unnamed: 0,user,fullname,timestamp,text
0,KamdemAbdiel,Abdiel kamdem,2019-05-27 11:49:14+00,È appena uscito un nuovo video! LES CRYPTOMONN...
1,bitcointe,Bitcointe,2019-05-27 11:49:18+00,Cardano: Digitize Currencies; EOS https://t.co...
2,3eyedbran,Bran - 3 Eyed Raven,2019-05-27 11:49:06+00,Another Test tweet that wasn't caught in the s...
3,DetroitCrypto,J. Scardina,2019-05-27 11:49:22+00,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...
4,mmursaleen72,Muhammad Mursaleen,2019-05-27 11:49:23+00,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...


In [61]:
df_large['timestamp'] = dd.to_datetime(df_large['timestamp'])

In [64]:
df_large.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 4 entries, user to text
dtypes: datetime64[ns](1), object(3)

In [71]:
# Check if any null timestamps
df_large['timestamp'].isnull().sum().compute()

823

In [77]:
df_large.size.compute()

67562152

In [80]:
# Remove empty timestamp values to facilitate sorting
df_large_cleaned = df_large.dropna(subset=['timestamp'])

In [84]:
df_large_cleaned.sort_values('timestamp', inplace=True).compute()

Unnamed: 0,user,fullname,timestamp,text
114517,chrispychong,chrispy,2007-04-19 07:14:38+00:00,is happily mugging at BTC where she will hook ...
21,halfin,halfin,2009-01-11 03:33:52+00:00,Running bitcoin
45666,halfin,halfin,2009-01-21 17:29:40+00:00,Looking at ways to add more anonymity to bitcoin
91597,halfin,halfin,2009-01-27 20:14:10+00:00,Thinking about how to reduce CO2 emissions fro...
18020,fafcffacfff,GoldLover,2009-01-29 13:37:53+00:00,From: Satoshi Nakamoto - 2009-01-11 22:32 Bitc...
...,...,...,...,...
139141,theautomatski,Automatski,2019-11-23 15:45:53+00:00,The Worlds First Millennium Firm https://t.co/...
139143,digital_mine_,digital mine ⚡🤖👨‍💻🇭🇰🇭🇰🇭🇰,2019-11-23 15:45:55+00:00,Daily profit for HODLING BTC since 2013 Data t...
139145,Vizique,Vizique,2019-11-23 15:45:55+00:00,Bitcoin Suisse Certificates :) https://t.co/nd...
139146,torusJKL,Gal Buki ($torusJKL),2019-11-23 15:45:56+00:00,Register now for the early access of the Codug...


In [83]:
df_large_cleaned.tail()

Unnamed: 0,user,fullname,timestamp,text
139144,JacobCanfield,Jacob Canfield,2019-11-23 15:28:50+00:00,Happy #FibonacciDay \n\nA while back I created...
139145,Vizique,Vizique,2019-11-23 15:45:55+00:00,Bitcoin Suisse Certificates :) https://t.co/nd...
139146,torusJKL,Gal Buki ($torusJKL),2019-11-23 15:45:56+00:00,Register now for the early access of the Codug...
139147,Adekunl95628158,Adekunle Daniel,2019-11-23 15:45:57+00:00,@btc \n@btc \nDo you know that BTC Baskets isn...
139148,HaraldoXRP,Harry,2019-11-23 15:45:06+00:00,$BTC - an update on the longer term view for B...


In [98]:
# Remove unneeded time section
df_large_cleaned['timestamp'] = df_2_cleaned['timestamp'].dt.strftime('%m/%d/%Y')

In [134]:
df_large_cleaned.head()

Unnamed: 0,user,fullname,timestamp,text
0,KamdemAbdiel,Abdiel kamdem,05/27/2019,È appena uscito un nuovo video! LES CRYPTOMONN...
1,bitcointe,Bitcointe,05/27/2019,Cardano: Digitize Currencies; EOS https://t.co...
2,3eyedbran,Bran - 3 Eyed Raven,05/27/2019,Another Test tweet that wasn't caught in the s...
3,DetroitCrypto,J. Scardina,05/27/2019,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...
4,mmursaleen72,Muhammad Mursaleen,05/27/2019,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...


In [149]:
df_large_cleaned.tail()

Unnamed: 0,user,fullname,timestamp,text
139144,JacobCanfield,Jacob Canfield,11/23/2019,Happy #FibonacciDay \n\nA while back I created...
139145,Vizique,Vizique,11/23/2019,Bitcoin Suisse Certificates :) https://t.co/nd...
139146,torusJKL,Gal Buki ($torusJKL),11/23/2019,Register now for the early access of the Codug...
139147,Adekunl95628158,Adekunle Daniel,11/23/2019,@btc \n@btc \nDo you know that BTC Baskets isn...
139148,HaraldoXRP,Harry,11/23/2019,$BTC - an update on the longer term view for B...


In [150]:
df_large_cleaned.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 4 entries, user to text
dtypes: object(4)

In [136]:
# Filter old data
df_large_filtered = df_large_cleaned[df_large_cleaned['timestamp'] > '01/01/2014']

In [151]:
df_large_filtered.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 4 entries, user to text
dtypes: object(4)

In [137]:
df_large_filtered.size.compute()

67554192

In [None]:
# Set date as index to repartition
df_large_filtered_indexed = df_large_filtered.set_index('timestamp')

In [144]:
df_large_filtered_indexed.head()

Unnamed: 0_level_0,user,fullname,text
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01/01/2015,BTCticker,bitcointicker.co,One Bitcoin now worth $315.63@bitstamp. High $...
01/01/2015,ProjectCoin,Project Coin,LIVE: Profit = $712.36 (27.66 %). BUY B8.16 @ ...
01/01/2015,airdroplite,Airdrop Lite,BTCTurk 762.55 TL Koinim 770 TL CampBx 330.00 ...
01/01/2015,BTCticker,bitcointicker.co,One Bitcoin now worth $314.75@bitstamp. High $...
01/01/2015,BitcoinSpreads,Bitcoin Spreads,1 #BTC (#Bitcoin) quotes:\n$314.62/$315.00 #Bi...


In [145]:
df_large_filtered_indexed.tail()

Unnamed: 0_level_0,user,fullname,text
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12/31/2018,Uberbills,Uberbills.com,"BTC: $3751.00, S: $Looking Fo, G: $ Spot Pric ..."
12/31/2018,bt_exchange,Bitcoin Exchange BR,1 BTC = 15228.57976000 BRL em 31/12/2018 ás 12...
12/31/2018,BitcoinCash_ES,BitcoinCash_es,Cotización del Bitcoin Cash: 136 80.€ | -0.15%...
12/31/2018,asens_inc,ASENS inc.,"12/31 23:00現在\n\n#Bitcoin : 410,395円↓\n#NEM #X..."
12/31/2018,short_vs_long,SHORT vs LONG(Beta),24H\n2019/01/01 02:00 (2018/12/31 01:59)\nLONG...


In [146]:
# Dataframe is extremely large - split into chunks
df_large_repartitioned = df_large_filtered_indexed.repartition(partition_size='100MB')

AttributeError: 'DataFrame' object has no attribute 'repartition'

In [59]:
df_2_repartitioned.to_csv('../../data/dataframes/data-*.csv')

KeyboardInterrupt: 

```
Process limited by available memory -> Moving to Kaggle as it has 30GB RAM

break_file_chunks.ipynb - discontinued
bitcoin_tweets_filteration_large.ipynb
bitcoin_tweets_filteration_small.ipynb

```


***
A wide range of CSV's have been created, each for each individual day. However, the `bitcoin_tweets_filteration_small.ipynb` scraper does not contain all the dates between the two ranges... which is a significant issue, and requires scraping

### check which dates are missing till last available date

In [1]:
from datetime import date, timedelta
import os

start_date = date(2014, 1, 2) 
end_date = date(2022, 10, 15)
delta = end_date - start_date
days = []

for i in range(delta.days + 1):
    day = start_date + timedelta(days=i)
    days.append(day)

In [2]:
available_days_small = os.listdir('../../../data/Tweets/broken_small_sets_limited')
available_days_large = os.listdir('../../../data/Tweets/broken_large_sets_limited')

In [3]:
available_days_small = [day.replace('.csv', '') for day in available_days_small]
available_days_large = [day.replace('.csv', '') for day in available_days_large]

available_days_small[:10], available_days_large[:10]

(['2021-02-05',
  '2021-02-06',
  '2021-02-07',
  '2021-02-08',
  '2021-02-09',
  '2021-02-10',
  '2021-02-13',
  '2021-02-14',
  '2021-02-15',
  '2021-02-18'],
 ['2014-01-02',
  '2014-01-03',
  '2014-01-04',
  '2014-01-05',
  '2014-01-06',
  '2014-01-07',
  '2014-01-08',
  '2014-01-09',
  '2014-01-10',
  '2014-01-11'])

In [4]:
available_days = available_days_small + available_days_large
available_days[:10]

['2021-02-05',
 '2021-02-06',
 '2021-02-07',
 '2021-02-08',
 '2021-02-09',
 '2021-02-10',
 '2021-02-13',
 '2021-02-14',
 '2021-02-15',
 '2021-02-18']

In [5]:
days = [str(day) for day in days]
days[:10]

['2014-01-02',
 '2014-01-03',
 '2014-01-04',
 '2014-01-05',
 '2014-01-06',
 '2014-01-07',
 '2014-01-08',
 '2014-01-09',
 '2014-01-10',
 '2014-01-11']

In [6]:
missing_days = [day for day in days if day not in available_days]
missing_days[:10]

['2019-11-24',
 '2019-11-25',
 '2019-11-26',
 '2019-11-27',
 '2019-11-28',
 '2019-11-29',
 '2019-11-30',
 '2019-12-01',
 '2019-12-02',
 '2019-12-03']

In [7]:
len(missing_days), len(available_days), len(days)

(854, 2359, 3209)

```
Attempt fetching tweets of missing dates on tweet_scraper.ipynb -> Attempted in Kaggle due to better computation power
```

In [37]:
# Save list of missing days to load into Kaggle kernel
with open(r'../../../data/tweets/broken_small_sets_limited/missing_days.txt', 'w') as fp:
    for day in missing_days:
        fp.write("%s\n" % day)
    print('Done')

Done


**Data scraped within `tweet_scraper.ipynb` check which dates are still missing**

In [2]:
with open('../../../data/tweets/broken_small_sets_limited/missing_days.txt') as f:
    missing_days = [line.replace('\n', '') for line in f.readlines()]

In [3]:
missing_days

['2019-11-24',
 '2019-11-25',
 '2019-11-26',
 '2019-11-27',
 '2019-11-28',
 '2019-11-29',
 '2019-11-30',
 '2019-12-01',
 '2019-12-02',
 '2019-12-03',
 '2019-12-04',
 '2019-12-05',
 '2019-12-06',
 '2019-12-07',
 '2019-12-08',
 '2019-12-09',
 '2019-12-10',
 '2019-12-11',
 '2019-12-12',
 '2019-12-13',
 '2019-12-14',
 '2019-12-15',
 '2019-12-16',
 '2019-12-17',
 '2019-12-18',
 '2019-12-19',
 '2019-12-20',
 '2019-12-21',
 '2019-12-22',
 '2019-12-23',
 '2019-12-24',
 '2019-12-25',
 '2019-12-26',
 '2019-12-27',
 '2019-12-28',
 '2019-12-29',
 '2019-12-30',
 '2019-12-31',
 '2020-01-01',
 '2020-01-02',
 '2020-01-03',
 '2020-01-04',
 '2020-01-05',
 '2020-01-06',
 '2020-01-07',
 '2020-01-08',
 '2020-01-09',
 '2020-01-10',
 '2020-01-11',
 '2020-01-12',
 '2020-01-13',
 '2020-01-14',
 '2020-01-15',
 '2020-01-16',
 '2020-01-17',
 '2020-01-18',
 '2020-01-19',
 '2020-01-20',
 '2020-01-21',
 '2020-01-22',
 '2020-01-23',
 '2020-01-24',
 '2020-01-25',
 '2020-01-26',
 '2020-01-27',
 '2020-01-28',
 '2020-01-

In [5]:
import os
scraped_days = os.listdir('../../../data/Tweets/tweets_scraped/total')

In [7]:
scraped_days = [day.replace('.csv', '') for day in scraped_days]
scraped_days

['2019-11-24',
 '2019-11-25',
 '2019-11-26',
 '2019-11-27',
 '2019-11-28',
 '2019-11-29',
 '2019-11-30',
 '2019-12-01',
 '2019-12-02',
 '2019-12-03',
 '2019-12-04',
 '2019-12-05',
 '2019-12-06',
 '2019-12-07',
 '2019-12-08',
 '2019-12-09',
 '2019-12-10',
 '2019-12-11',
 '2019-12-12',
 '2019-12-13',
 '2019-12-14',
 '2019-12-15',
 '2019-12-16',
 '2019-12-17',
 '2019-12-18',
 '2019-12-19',
 '2019-12-20',
 '2019-12-21',
 '2019-12-22',
 '2019-12-23',
 '2019-12-24',
 '2019-12-25',
 '2019-12-26',
 '2019-12-27',
 '2019-12-28',
 '2019-12-29',
 '2019-12-30',
 '2019-12-31',
 '2020-01-01',
 '2020-01-02',
 '2020-01-03',
 '2020-01-04',
 '2020-01-05',
 '2020-01-06',
 '2020-01-07',
 '2020-01-08',
 '2020-01-09',
 '2020-01-10',
 '2020-01-11',
 '2020-01-12',
 '2020-01-13',
 '2020-01-14',
 '2020-01-15',
 '2020-01-16',
 '2020-01-17',
 '2020-01-18',
 '2020-01-19',
 '2020-01-20',
 '2020-01-21',
 '2020-01-22',
 '2020-01-23',
 '2020-01-24',
 '2020-01-25',
 '2020-01-26',
 '2020-01-27',
 '2020-01-28',
 '2020-01-

In [9]:
still_missing_days = [day for day in missing_days if day not in scraped_days]

In [10]:
still_missing_days

['2020-09-18',
 '2021-02-04',
 '2021-02-12',
 '2021-02-17',
 '2021-02-21',
 '2021-02-27',
 '2021-03-10',
 '2021-04-04',
 '2021-04-16',
 '2021-05-07',
 '2021-05-24',
 '2021-06-19',
 '2021-07-01',
 '2021-07-15',
 '2021-08-03',
 '2021-08-13',
 '2021-08-22',
 '2021-09-09',
 '2021-10-17',
 '2021-10-26',
 '2021-11-03',
 '2021-11-10',
 '2021-11-17',
 '2021-11-23',
 '2021-12-10',
 '2021-12-16',
 '2021-12-28',
 '2022-01-10',
 '2022-01-18',
 '2022-02-08',
 '2022-02-10',
 '2022-02-13',
 '2022-03-01',
 '2022-03-05',
 '2022-03-13',
 '2022-04-12',
 '2022-04-26',
 '2022-05-09',
 '2022-05-21',
 '2022-06-13',
 '2022-06-16',
 '2022-06-22',
 '2022-06-27',
 '2022-07-01',
 '2022-07-10',
 '2022-08-29',
 '2022-09-10',
 '2022-10-08']

In [12]:
len(still_missing_days), len(missing_days), len(scraped_days)

(48, 854, 850)

In [18]:
# Save list of missing days to load into Kaggle kernel
with open(r'../../../data/tweets/still_missing_days.txt', 'w') as fp:
    for day in still_missing_days:
        fp.write("%s\n" % day)
    print('Done')

Done


In [19]:
# Check again
scraped_days = os.listdir('../../../data/Tweets/tweets_scraped/total')

In [20]:
scraped_days = [day.replace('.csv', '') for day in scraped_days]
still_missing_days = [day for day in missing_days if day not in scraped_days]

In [21]:
still_missing_days

[]

Perfect. Everything is obtained till the specified date