### Breaks down the extremely large tweet dataset into multiple small CSVs

In [1]:
import dask
import dask.dataframe as dd

import os

# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/bitcoin-tweets-20160101-to-20190329/tweets.csv


In [6]:
# Load the semicolon-delimited tweet dump as a dask DataFrame, keeping only the
# four columns used below. Rows the parser cannot handle are skipped rather
# than raising.
TWEETS_CSV = '/kaggle/input/bitcoin-tweets-20160101-to-20190329/tweets.csv'
TWEET_COLUMNS = ['user', 'fullname', 'timestamp', 'text']

read_options = dict(
    delimiter=';',
    usecols=TWEET_COLUMNS,
    engine='python',
    on_bad_lines='skip',
)
df = dd.read_csv(TWEETS_CSV, **read_options)

In [7]:
# Peek at the first five rows to confirm the columns parsed as expected.
df.head(n=5)

Unnamed: 0,user,fullname,timestamp,text
0,KamdemAbdiel,Abdiel kamdem,2019-05-27 11:49:14+00,È appena uscito un nuovo video! LES CRYPTOMONN...
1,bitcointe,Bitcointe,2019-05-27 11:49:18+00,Cardano: Digitize Currencies; EOS https://t.co...
2,3eyedbran,Bran - 3 Eyed Raven,2019-05-27 11:49:06+00,Another Test tweet that wasn't caught in the s...
3,DetroitCrypto,J. Scardina,2019-05-27 11:49:22+00,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...
4,mmursaleen72,Muhammad Mursaleen,2019-05-27 11:49:23+00,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...


In [8]:
# Replace the raw 'timestamp' strings with parsed datetimes.
df = df.assign(timestamp=dd.to_datetime(df['timestamp']))

In [9]:
# Short summary of the frame: column count and dtypes.
df.info(verbose=False)

<class 'dask.dataframe.core.DataFrame'>
Columns: 4 entries, user to text
dtypes: datetime64[ns](1), object(3)

In [10]:
# Discard rows that have no timestamp; the other columns may still be missing.
df_cleaned = df.dropna(how='any', subset=['timestamp'])

In [12]:
# Total number of cells (rows x columns) in `df`. This forces a full pass over
# the input, and it measures `df`, not `df_cleaned`.
dask.compute(df.size)[0]

67561928

In [13]:
# Reduce each timestamp to a 'MM/DD/YYYY' date string (time of day is dropped).
# After this cell the column holds strings, so running the cell a second time
# fails on the `.dt` accessor.
date_strings = df_cleaned['timestamp'].dt.strftime('%m/%d/%Y')
df_cleaned = df_cleaned.assign(timestamp=date_strings)

In [14]:
# Keep only tweets dated after 1 January 2014.
# 'timestamp' holds 'MM/DD/YYYY' strings at this point. Comparing those strings
# directly is lexicographic (e.g. '12/31/2013' > '01/01/2014' is True), which
# lets pre-2014 rows through. The column is therefore parsed back to datetimes
# for the comparison only; the stored values remain formatted strings.
timestamp_as_date = dd.to_datetime(df_cleaned['timestamp'], format='%m/%d/%Y')
df_filtered = df_cleaned[timestamp_as_date > '2014-01-01']

In [15]:
# Spot-check the filtered rows and the reformatted date strings.
df_filtered.head(n=5)

Unnamed: 0,user,fullname,timestamp,text
0,KamdemAbdiel,Abdiel kamdem,05/27/2019,È appena uscito un nuovo video! LES CRYPTOMONN...
1,bitcointe,Bitcointe,05/27/2019,Cardano: Digitize Currencies; EOS https://t.co...
2,3eyedbran,Bran - 3 Eyed Raven,05/27/2019,Another Test tweet that wasn't caught in the s...
3,DetroitCrypto,J. Scardina,05/27/2019,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...
4,mmursaleen72,Muhammad Mursaleen,05/27/2019,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...


In [16]:
# Make 'timestamp' the index of the frame (the head()/tail() outputs that
# follow show the rows ordered by this index).
# NOTE(review): 'timestamp' holds 'MM/DD/YYYY' strings here, so the ordering
# looks lexicographic (month, then day, then year) rather than chronological:
# the displayed head starts at 01/01/2015 and the tail ends at 12/31/2018 even
# though the raw data contains 2019 rows. Confirm whether chronological order
# was intended before relying on how rows are split across output files.
df_sorted = df_filtered.set_index('timestamp')

In [17]:
# First five rows of the indexed frame.
df_sorted.head(n=5)

Unnamed: 0_level_0,user,fullname,text
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01/01/2015,btcusd,Bitstamp: BTC to USD,$314.01 at 00:45 UTC [24h Range: $312.60 - $31...
01/01/2015,btcusd,Bitstamp: BTC to USD,$313.24 at 16:00 UTC [24h Range: $312.60 - $32...
01/01/2015,BTCtoUSD,Bitcoin to USD $,Current price: 315.92$ $BTCUSD $btc #bitcoin 2...
01/01/2015,airdroplite,Airdrop Lite,BTCTurk 756.3 TL Koinim 775 TL CampBx 330.00 $...
01/01/2015,BTCtoEUR,Bitcoin to EUR €,Current price: 261.02€ $BTCEUR $btc #bitcoin 2...


In [24]:
# Last five rows of the indexed frame.
df_sorted.tail(n=5)

Unnamed: 0_level_0,user,fullname,text
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12/31/2018,fahmyeu,HF,There are 36 million millionaires today on Ear...
12/31/2018,coin_chart,仮想通貨チャート,2019/01/01 02:00\nBTC 412924円\nETH 14842.6円\nE...
12/31/2018,BSD_Masternode,Bitsend Masternodes,#cryptocurrency Price Analysis for #Bitsend #B...
12/31/2018,lambo_liza,tweet LIZA LIZUN LAMBO LIMBO price BOT,#LIZA #LAMBO price\n12-31 17:00(GMT)\n\n$LIZA\...
12/31/2018,cryptoc_cheap,いま注目の格安仮想通貨！,2019/01/01 00:30\n#Binance 格安コイン\n\n1位 #HOT 0....


In [21]:
# Re-chunk the indexed frame so that partitions target roughly 100MB each.
df_repartitioned = df_sorted.repartition(
    partition_size='100MB',
)

In [23]:
# Write the partitions out as numbered CSV files under broken_sets/ (the '*'
# in the pattern is replaced per file); the call returns the written paths.
df_repartitioned.to_csv(filename='broken_sets/data-*.csv')

['/kaggle/working/broken_sets/data-000.csv',
 '/kaggle/working/broken_sets/data-001.csv',
 '/kaggle/working/broken_sets/data-002.csv',
 '/kaggle/working/broken_sets/data-003.csv',
 '/kaggle/working/broken_sets/data-004.csv',
 '/kaggle/working/broken_sets/data-005.csv',
 '/kaggle/working/broken_sets/data-006.csv',
 '/kaggle/working/broken_sets/data-007.csv',
 '/kaggle/working/broken_sets/data-008.csv',
 '/kaggle/working/broken_sets/data-009.csv',
 '/kaggle/working/broken_sets/data-010.csv',
 '/kaggle/working/broken_sets/data-011.csv',
 '/kaggle/working/broken_sets/data-012.csv',
 '/kaggle/working/broken_sets/data-013.csv',
 '/kaggle/working/broken_sets/data-014.csv',
 '/kaggle/working/broken_sets/data-015.csv',
 '/kaggle/working/broken_sets/data-016.csv',
 '/kaggle/working/broken_sets/data-017.csv',
 '/kaggle/working/broken_sets/data-018.csv',
 '/kaggle/working/broken_sets/data-019.csv',
 '/kaggle/working/broken_sets/data-020.csv',
 '/kaggle/working/broken_sets/data-021.csv',
 '/kaggle/

In [26]:
# Create downloadable zipped folder
!zip -r file.zip './broken_sets'

  adding: broken_sets/ (stored 0%)
  adding: broken_sets/data-038.csv (deflated 56%)
  adding: broken_sets/data-005.csv (deflated 69%)
  adding: broken_sets/data-065.csv (deflated 57%)
  adding: broken_sets/data-116.csv (deflated 58%)
  adding: broken_sets/data-079.csv (deflated 57%)
  adding: broken_sets/data-013.csv (deflated 57%)
  adding: broken_sets/data-099.csv (deflated 58%)
  adding: broken_sets/data-072.csv (deflated 57%)
  adding: broken_sets/data-043.csv (deflated 57%)
  adding: broken_sets/data-109.csv (deflated 58%)
  adding: broken_sets/data-064.csv (deflated 57%)
  adding: broken_sets/data-084.csv (deflated 58%)
  adding: broken_sets/data-069.csv (deflated 57%)
  adding: broken_sets/data-090.csv (deflated 57%)
  adding: broken_sets/data-070.csv (deflated 56%)
  adding: broken_sets/data-007.csv (deflated 76%)
  adding: broken_sets/data-052.csv (deflated 56%)
  adding: broken_sets/data-082.csv (deflated 58%)
  adding: broken_sets/data-080.csv (deflated 58%)
  adding: broke

In [27]:
ls

__notebook_source__.ipynb  [0m[01;34mbroken_sets[0m/  file.zip  [01;34mkaggle[0m/


In [28]:
from IPython.display import FileLink

# Show a link to the archive in the cell output so it can be downloaded.
archive_link = FileLink(r'file.zip')
archive_link