In [1]:
import os
import pandas as pd
from langdetect import detect_langs
from langdetect import detect_langs, LangDetectException
import numpy as np
# consistent results for language detection
from langdetect import DetectorFactory
DetectorFactory.seed = 0

In [38]:
# Information about the coin-datasets
def coin_details(coin_data: dict) -> None:
    """Print the date coverage of every coin dataset.

    For each entry the coin label, the number of distinct dates and the span
    between the earliest and the latest date (in years) are printed.

    Args:
        coin_data: Mapping of coin label to a DataFrame with a ``date`` column
            whose values can be subtracted to give a timedelta.

    Returns:
        None. The summary is written to stdout only.
    """
    seconds_per_year = 365.2425 * 24 * 60 * 60
    for coin, articles in coin_data.items():
        print(coin)
        print('Unique dates:', articles.date.nunique())
        if articles.empty:
            # An empty frame has no earliest/latest date to subtract.
            print('No articles')
            print('\n')
            continue
        min_date = articles.date.min()
        max_date = articles.date.max()
        years = (max_date - min_date).total_seconds() / seconds_per_year
        print(f'Articles over {years} Years')
        print('\n')

In [39]:
# Detect text language
def detect_languages(df: pd.DataFrame, column: str) -> pd.Series:
    """Detect the most probable language of every value in ``df[column]``.

    Args:
        df: Frame holding the texts.
        column: Name of the column to classify.

    Returns:
        Series aligned with ``df`` that holds the first language code
        returned by ``detect_langs``, or ``"unknown"`` when no language can
        be determined.
    """
    def detect_language(text: str) -> str:
        # Missing values (None/NaN), non-strings and blank strings have no
        # detectable features; answer directly instead of passing them on.
        if not isinstance(text, str) or not text.strip():
            return "unknown"
        try:
            languages = detect_langs(text)
        except LangDetectException:
            return "unknown"
        return languages[0].lang
    return df[column].apply(detect_language)

In [4]:
# Project root: two directory levels above the current working directory
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
# Pass the path components separately so os.path.join inserts the platform's
# separator; a backslash inside a plain string literal is an invalid escape
# and only works as a separator on Windows.
DATA_DIR_ARTICLE = os.path.join(BASE_DIR, 'Datasets', 'Article')

# Coindesk data

In [5]:
# File name of the Coindesk article dump and its full path inside DATA_DIR_ARTICLE
article = 'article_content_coindesk.json'
path = os.path.join(DATA_DIR_ARTICLE, article)

In [6]:
# Load the Coindesk JSON file into a DataFrame
coindesk_df = pd.read_json(path)

In [7]:
coindesk_df.head()

Unnamed: 0,date,coin,section,title,text,url
0,"Dec 5, 2022 at 12:40 p.m. UTC",bitcoin,Features,The Punk Fighting for an Open Metaverse,"Punk6529 is living the future, today. At least...",https://www.coindesk.com/consensus-magazine/20...
1,"Mar 29, 2022 at 2:01 p.m. UTC",bitcoin,Business,"Prime Trust, Swan Bitcoin Link Up on Crypto IR...",Fintech firm Prime Trust on Tuesday announce...,https://www.coindesk.com/business/2022/03/29/p...
2,"Jan 12, 2021 at 9:08 a.m. UTC",bitcoin,Markets,Bitcoin Whales Kept Accumulating During Monday...,"Large bitcoin (BTC) investors, popularly known...",https://www.coindesk.com/markets/2021/01/12/bi...
3,"Sep 27, 2022 at 1:22 p.m. UTC",bitcoin,Markets,"First Mover Americas: Bitcoin Regains $20K, Bu...",Price Point: Bitcoin has managed to regain th...,https://www.coindesk.com/markets/2022/09/27/fi...
4,"Jan 27, 2017 at 4:55 p.m. UTC",bitcoin,Features,Avoiding Catastrophe: Researchers Face Blockch...,stanford 'Catastrophe' may have been the word ...,https://www.coindesk.com/markets/2017/01/27/av...


In [8]:
coindesk_df.tail()

Unnamed: 0,date,coin,section,title,text,url
27831,"Jul 3, 2020 at 10:09 a.m. UTC",cardano,Business,Cardano Developer IOHK Launches $20M Fund for ...,Cardano developer house IOHK has set up a $20 ...,https://www.coindesk.com/business/2020/07/03/c...
27832,"Jul 30, 2020 at 8:14 a.m. UTC",cardano,Tech,Cardano Introduces Proof-of-Stake With 'Shelle...,It's alive! Cardano's blockchain has undergone...,https://www.coindesk.com/tech/2020/07/30/carda...
27833,"Apr 22, 2021 at 7:10 p.m. UTC",cardano,Business,21Shares Launching Stellar and Cardano ETPs on...,Switzerland-based investment product provider ...,https://www.coindesk.com/business/2021/04/22/2...
27834,"Jul 3, 2020 at 2:06 p.m. UTC",cardano,Markets,Coinbase Custody to Support Secure Cardano Sta...,Cardano holders will soon be able to stake tok...,https://www.coindesk.com/markets/2020/07/03/co...
27835,"May 28, 2021 at 1:55 p.m. UTC",cardano,Markets,21Shares to List Three Crypto ETPs on Euronext...,Switzerland-based investment product provider ...,https://www.coindesk.com/markets/2021/05/28/21...


In [9]:
coindesk_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27836 entries, 0 to 27835
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     27835 non-null  object
 1   coin     27836 non-null  object
 2   section  27721 non-null  object
 3   title    27835 non-null  object
 4   text     27836 non-null  object
 5   url      27836 non-null  object
dtypes: object(6)
memory usage: 1.3+ MB


In [10]:
coindesk_df.shape

(27836, 6)

In [11]:
#Unique values
print(coindesk_df.nunique().sort_values(ascending=False))

url        27836
title      27833
text       27819
date       27457
section       18
coin           5
dtype: int64


In [12]:
print('Coins:', coindesk_df.coin.unique())
print('Section:', coindesk_df.section.unique())

Coins: ['bitcoin' 'xrp' 'ethereum' 'binance' 'cardano']
Section: ['Features' 'Business' 'Markets' 'Opinion' 'Policy' 'News Analysis' 'Tech'
 'Reports' 'Layer 2' None 'Interview' 'Consensus Magazine' 'Profile'
 'Web3' 'Sports Week' 'CoinDesk' 'Learn' 'Events' 'Mining Week']


## Clean Coindesk data

In [13]:
# Clean a copy so coindesk_df keeps the data exactly as loaded
coindesk_abt = coindesk_df.copy()

In [14]:
# The raw 'date' column holds date and time; call it 'datetime'
coindesk_abt = coindesk_abt.rename(columns={'date':'datetime'})

In [15]:
# Tag every row with the site it came from
coindesk_abt['source'] = 'coindesk'

In [16]:
coindesk_abt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27836 entries, 0 to 27835
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   datetime  27835 non-null  object
 1   coin      27836 non-null  object
 2   section   27721 non-null  object
 3   title     27835 non-null  object
 4   text      27836 non-null  object
 5   url       27836 non-null  object
 6   source    27836 non-null  object
dtypes: object(7)
memory usage: 1.5+ MB


In [17]:
# Checking for duplicates
coindesk_abt.duplicated().any()

False

In [18]:
# Checking for missing values
coindesk_abt.isna().any()

datetime     True
coin        False
section      True
title        True
text        False
url         False
source      False
dtype: bool

Feature: text

In [19]:
# Remove the articles whose text is an empty string
is_empty_text = coindesk_abt.text == ""
mask = coindesk_abt.loc[is_empty_text].index
coindesk_abt = coindesk_abt.drop(index=mask)

Feature: title

In [20]:
# Remove the rows that have no title
mask = coindesk_abt.loc[coindesk_abt.title.isna()]
coindesk_abt = coindesk_abt.drop(index=mask.index)

Feature: section 

In [21]:
# Check for missing section
coindesk_abt[coindesk_abt.section.isna()]

Unnamed: 0,datetime,coin,section,title,text,url,source
181,"Updated Aug 1, 2022 at 9:04 p.m. UTC",bitcoin,,NFT Marketplaces: A Beginner’s Guide,Do you keep hearing about people buying digita...,https://www.coindesk.com/tech/2021/07/12/nft-m...,coindesk
256,"Jun 2, 2022",bitcoin,,Staking Claims: Abra’s CEO Talks Passive Income,Bill Barhydt is the kind of smart person other...,https://www.coindesk.com/sponsored-content/sta...,coindesk
339,"Dec 16, 2021",bitcoin,,Why Gamification Will Drive Wider Blockchain A...,"Once Vitalik Buterin’s trilemma is solved, w...",https://www.coindesk.com/sponsored-content/why...,coindesk
1801,"Aug 28, 2021",bitcoin,,Proof of Stake: A stake through Proof of Work’...,Proof of Work (PoW) isn’t dead. It isn’t even ...,https://www.coindesk.com/sponsored-content/pro...,coindesk
1986,"Nov 22, 2021",bitcoin,,Why Digital Asset Adoption in Commerce Is Gain...,A number of large companies have been adding s...,https://www.coindesk.com/sponsored-content/why...,coindesk
...,...,...,...,...,...,...,...
27302,"Apr 12, 2022",ethereum,,Why We Need a True Cross-Chain NFT Marketplace...,"Price drops, regulatory crackdowns and geopoli...",https://www.coindesk.com/sponsored-content/why...,coindesk
27368,"Dec 17, 2022",ethereum,,Webinar Recap: Crypto Industry in 2022,"In the fallout from FTX , there’s been no sho...",https://www.coindesk.com/sponsored-content/web...,coindesk
27594,"Aug 25, 2022",ethereum,,"‘Cauldron’ of Connections Unify Gaming, DeFi a...",Ecosystem as well as technology unites WEMIX’s...,https://www.coindesk.com/sponsored-content/cau...,coindesk
27610,"Aug 25, 2022",ethereum,,Starfish Finance II - Attack of the (Dapp) Clones,Clones Far beyond Earth and millions of miles ...,https://www.coindesk.com/sponsored-content/sta...,coindesk


In [22]:
# Check urls of rows with missing section -> Sponsored content | no section | Crypto Explainer+
coindesk_abt[coindesk_abt.section.isna()].url.tolist()

['https://www.coindesk.com/tech/2021/07/12/nft-marketplaces-a-beginners-guide/',
 'https://www.coindesk.com/sponsored-content/staking-claims-abras-ceo-talks-passive-income/',
 'https://www.coindesk.com/sponsored-content/why-gamification-will-drive-wider-blockchain-adoption/',
 'https://www.coindesk.com/sponsored-content/proof-of-stake-a-stake-through-proof-of-works-heart/',
 'https://www.coindesk.com/sponsored-content/why-digital-asset-adoption-in-commerce-is-gaining-momentum/',
 'https://www.coindesk.com/sponsored-content/near-predictions-developers-chart-the-bridge-to-professional-freedom/',
 'https://www.coindesk.com/tech/2022/02/10/the-top-ethereum-killers-compared/',
 'https://www.coindesk.com/sponsored-content/itrustcapital-ensuring-quality-asset-listings/',
 'https://www.coindesk.com/sponsored-content/demystifying-decentralization-part-2/',
 'https://www.coindesk.com/sponsored-content/dont-settle-for-government-surveillance-coins/',
 'https://www.coindesk.com/powered-by-consensu

In [23]:
# Delete sponsored content; keep the other articles, they have important information for SentAn
pattern = r'https://www.coindesk.com/sponsored-content/'
# regex=False: the URL prefix is a literal, so '.' must not act as a wildcard
mask = coindesk_abt[coindesk_abt.url.str.contains(pattern, regex=False)].index
coindesk_abt = coindesk_abt.drop(mask, axis=0)

In [24]:
# Check links
coindesk_abt[coindesk_abt.section.isna()].url.tolist()

['https://www.coindesk.com/tech/2021/07/12/nft-marketplaces-a-beginners-guide/',
 'https://www.coindesk.com/tech/2022/02/10/the-top-ethereum-killers-compared/',
 'https://www.coindesk.com/powered-by-consensus/consensus-history/',
 'https://www.coindesk.com/powered-by-consensus/ftx-collapse-crypto-investors-retreat/',
 'https://www.coindesk.com/markets/como-identificar-y-protegerte-de-estafas-piramidales-realizadas-con-criptomonedas/',
 'https://www.coindesk.com/business/2021/02/04/what-is-uniswap-a-complete-beginners-guide/',
 'https://www.coindesk.com/powered-by-consensus/transforming-digital-economy/',
 'https://www.coindesk.com/markets/2018/09/09/crypto-trading-101-the-moving-average-crossover/',
 'https://www.coindesk.com/powered-by-consensus/most-influential-blockchain-projects/',
 'https://www.coindesk.com/web3/top-brands-in-web3-nfts-and-the-metaverse/',
 'https://www.coindesk.com/tech/2021/08/17/what-is-a-semi-fungible-crypto-token/',
 'https://www.coindesk.com/powered-by-conse

In [25]:
# Give every article served from a 'powered-by-consensus' URL its own section label
pattern = r'powered-by-consensus' #'Powered by Consensus'
is_consensus = coindesk_abt.url.str.contains(pattern)
coindesk_abt.loc[is_consensus, 'section'] = 'Powered by Consensus'

In [26]:
# Check links
coindesk_abt[coindesk_abt.section.isna()].url.tolist()

['https://www.coindesk.com/tech/2021/07/12/nft-marketplaces-a-beginners-guide/',
 'https://www.coindesk.com/tech/2022/02/10/the-top-ethereum-killers-compared/',
 'https://www.coindesk.com/markets/como-identificar-y-protegerte-de-estafas-piramidales-realizadas-con-criptomonedas/',
 'https://www.coindesk.com/business/2021/02/04/what-is-uniswap-a-complete-beginners-guide/',
 'https://www.coindesk.com/markets/2018/09/09/crypto-trading-101-the-moving-average-crossover/',
 'https://www.coindesk.com/web3/top-brands-in-web3-nfts-and-the-metaverse/',
 'https://www.coindesk.com/tech/2021/08/17/what-is-a-semi-fungible-crypto-token/',
 'https://www.coindesk.com/web3/gamestop-nft-marketplace-a-beginners-guide/',
 'https://www.coindesk.com/tech/2021/02/09/what-is-the-erc-20-ethereum-token-standard/',
 'https://www.coindesk.com/business/2021/03/25/your-nft-tax-questions-answered/']

In [27]:
# Every article still without a section is filed under 'Crypto Explainer+'
no_section = coindesk_abt.section.isna()
coindesk_abt.loc[no_section, 'section'] = 'Crypto Explainer+'

In [28]:
coindesk_abt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27722 entries, 0 to 27835
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   datetime  27722 non-null  object
 1   coin      27722 non-null  object
 2   section   27722 non-null  object
 3   title     27722 non-null  object
 4   text      27722 non-null  object
 5   url       27722 non-null  object
 6   source    27722 non-null  object
dtypes: object(7)
memory usage: 1.7+ MB


Feature: date

In [29]:
# Check for wrong dates and drop them: keep only values that start with
# "<Mon> <day>, <year>". Day and year must be digits (\d instead of \w), and
# na=False makes a missing datetime count as invalid instead of breaking '~'.
date_pattern = r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Sep|Aug|Oct|Nov|Dec)\s\d{1,2},\s\d{4}'
mask = coindesk_abt[~coindesk_abt.datetime.str.match(date_pattern, na=False)]
coindesk_abt = coindesk_abt.drop(mask.index, axis=0)

In [30]:
# Convert datetime to a UTC datetime dtype, derive the date and time columns and drop datetime.
# A former str.replace(r'[,;.at]', '') call was removed: its result was never
# assigned, and that character class would also strip 'a' and 't' from month names.
coindesk_abt['datetime'] = pd.to_datetime(coindesk_abt['datetime'], infer_datetime_format=True, utc=True)
coindesk_abt['date'] = coindesk_abt.datetime.dt.date
coindesk_abt['time'] = coindesk_abt.datetime.dt.time
coindesk_abt = coindesk_abt.drop(['datetime'], axis=1)

In [31]:
coindesk_abt.isna().any()

coin       False
section    False
title      False
text       False
url        False
source     False
date       False
time       False
dtype: bool

In [32]:
coindesk_abt.duplicated().any()

False

In [33]:
coindesk_abt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27709 entries, 0 to 27835
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   coin     27709 non-null  object
 1   section  27709 non-null  object
 2   title    27709 non-null  object
 3   text     27709 non-null  object
 4   url      27709 non-null  object
 5   source   27709 non-null  object
 6   date     27709 non-null  object
 7   time     27709 non-null  object
dtypes: object(8)
memory usage: 1.9+ MB


In [34]:
coindesk_abt.sample(5)

Unnamed: 0,coin,section,title,text,url,source,date,time
3201,bitcoin,Markets,Market Wrap: Bitcoin Traders Take Profits Amid...,It’s been a quiet week in crypto markets as pr...,https://www.coindesk.com/markets/2021/09/17/ma...,coindesk,2021-09-17,20:28:00
16861,bitcoin,Markets,Meme Tokens Led ‘Uptober’ as SHIB Mooned 765%,Popular meme tokens saw large gains in October...,https://www.coindesk.com/markets/2021/11/01/me...,coindesk,2021-11-01,15:12:00
22814,bitcoin,Markets,Vietnam Is Preparing to Legally Recognize Bitcoin,Vietnam's prime minister has approved a plan t...,https://www.coindesk.com/markets/2017/08/25/vi...,coindesk,2017-08-25,13:00:00
24863,ethereum,Markets,"Ether, Ethereum Classic See Mini Price Swing A...",Crypto markets are mostly flattish over the pa...,https://www.coindesk.com/markets/2022/09/15/et...,coindesk,2022-09-15,12:16:00
271,bitcoin,Markets,Bitcoin Again Falters at $40K as Ether Retreat...,Bitcoin and ether erased Wednesday's gains ...,https://www.coindesk.com/markets/2021/08/05/bi...,coindesk,2021-08-05,11:12:00


In [35]:
# Earliest and latest article date in the cleaned Coindesk data
# (label corrected from the misspelled 'Stardate:')
print('Startdate:', min(coindesk_abt.date))
print('Enddate:', max(coindesk_abt.date))

Stardate: 2013-04-01
Enddate: 2022-12-30


## Split Coindesk data

In [36]:
# Create one frame per coin, each with a fresh 0-based index
coindesk_btc_articles = coindesk_abt.loc[coindesk_abt['coin'].eq('bitcoin')].reset_index(drop=True)
coindesk_xrp_articles = coindesk_abt.loc[coindesk_abt['coin'].eq('xrp')].reset_index(drop=True)
coindesk_eth_articles = coindesk_abt.loc[coindesk_abt['coin'].eq('ethereum')].reset_index(drop=True)
coindesk_bnb_articles = coindesk_abt.loc[coindesk_abt['coin'].eq('binance')].reset_index(drop=True)
coindesk_ada_articles = coindesk_abt.loc[coindesk_abt['coin'].eq('cardano')].reset_index(drop=True)

In [40]:
# Collect the Coindesk frames under their ticker and print the date coverage of each
coindesk_data = dict(
    BTC=coindesk_btc_articles,
    XRP=coindesk_xrp_articles,
    ETH=coindesk_eth_articles,
    BNB=coindesk_bnb_articles,
    ADA=coindesk_ada_articles,
)
coin_details(coindesk_data)

BTC
Unique dates: 3402
Articles over 9.746948944879088 Years


XRP
Unique dates: 278
Articles over 9.563509175410859 Years


ETH
Unique dates: 1527
Articles over 7.222598684435684 Years


BNB
Unique dates: 134
Articles over 4.690034702971314 Years


ADA
Unique dates: 63
Articles over 5.889237972032281 Years




Result: We need more data for XRP, BNB and ADA

# Utoday data

In [41]:
# File name of the U.Today article dump and its full path inside DATA_DIR_ARTICLE
article = 'article_content_utoday.json'
path = os.path.join(DATA_DIR_ARTICLE, article)

In [42]:
# Load the U.Today JSON file into a DataFrame
utoday_df = pd.read_json(path)

In [43]:
utoday_df.head()

Unnamed: 0,date,coin,title,text,url
0,2022-12-12 14:48:00,bitcoin,Bitcoin (BTC) Macro Risk/Reward Indicators Loo...,"Bitcoin (BTC), the largest cryptocurrency, is ...",https://u.today/bitcoin-btc-macro-riskreward-i...
1,2022-12-12 09:14:00,bitcoin,Investors Are Getting Back into Bitcoin at Hig...,According to crypto analytics portal Santimen...,https://u.today/investors-are-getting-back-int...
2,2022-12-12 16:02:00,bitcoin,"Bitcoin Is Hitting Bottom, David Gokhshtein Sa...",Former U.S. congressional candidate and curren...,https://u.today/bitcoin-is-hitting-bottom-davi...
3,2022-12-13 06:22:00,bitcoin,Yardeni Research Says Crypto Collapse Hasn’t H...,"According to Yardeni Research , the collapse ...",https://u.today/yardeni-research-says-crypto-c...
4,2022-12-13 18:37:00,bitcoin,Congressman Brad Sherman Trashes Crypto During...,During a recent congressional hearing devoted ...,https://u.today/congressman-brad-sherman-trash...


In [81]:
# Tag every row with its source; the column is added to utoday_df itself,
# so frames derived from utoday_df afterwards carry it as well
utoday_df['source'] = 'utoday'

In [82]:
utoday_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8658 entries, 0 to 8657
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    8658 non-null   datetime64[ns]
 1   coin    8658 non-null   object        
 2   title   8658 non-null   object        
 3   text    8658 non-null   object        
 4   url     8658 non-null   object        
 5   source  8658 non-null   object        
dtypes: datetime64[ns](1), object(5)
memory usage: 406.0+ KB


In [44]:
# Check for missing values
utoday_df.isna().any()

date     False
coin     False
title    False
text     False
url      False
dtype: bool

In [45]:
# Check for duplicates
utoday_df.duplicated().any()

False

In [46]:
# Unique Values
utoday_df.nunique()

date     8650
coin        4
title    8658
text     8658
url      8658
dtype: int64

In [47]:
utoday_df.coin.unique()

array(['bitcoin', 'cardano', 'ripple', 'ethereum'], dtype=object)

In [48]:
# Rename 'date' to 'datetime'; rename() returns a new frame, so utoday_df is left unchanged
utoday_abt = utoday_df.rename(columns={'date':'datetime'})

In [49]:
utoday_abt.head()

Unnamed: 0,datetime,coin,title,text,url
0,2022-12-12 14:48:00,bitcoin,Bitcoin (BTC) Macro Risk/Reward Indicators Loo...,"Bitcoin (BTC), the largest cryptocurrency, is ...",https://u.today/bitcoin-btc-macro-riskreward-i...
1,2022-12-12 09:14:00,bitcoin,Investors Are Getting Back into Bitcoin at Hig...,According to crypto analytics portal Santimen...,https://u.today/investors-are-getting-back-int...
2,2022-12-12 16:02:00,bitcoin,"Bitcoin Is Hitting Bottom, David Gokhshtein Sa...",Former U.S. congressional candidate and curren...,https://u.today/bitcoin-is-hitting-bottom-davi...
3,2022-12-13 06:22:00,bitcoin,Yardeni Research Says Crypto Collapse Hasn’t H...,"According to Yardeni Research , the collapse ...",https://u.today/yardeni-research-says-crypto-c...
4,2022-12-13 18:37:00,bitcoin,Congressman Brad Sherman Trashes Crypto During...,During a recent congressional hearing devoted ...,https://u.today/congressman-brad-sherman-trash...


In [50]:
# Split datetime into separate date and time columns, then drop datetime
utoday_abt = utoday_abt.assign(
    date=utoday_abt.datetime.dt.date,
    time=utoday_abt.datetime.dt.time,
).drop(columns='datetime')

In [51]:
# Check unique coins
utoday_df.coin.unique()

array(['bitcoin', 'cardano', 'ripple', 'ethereum'], dtype=object)

In [52]:
# One frame per coin, each with a fresh 0-based index
utoday_btc_articles = utoday_abt.loc[utoday_abt['coin'].eq('bitcoin')].reset_index(drop=True)
utoday_xrp_articles = utoday_abt.loc[utoday_abt['coin'].eq('ripple')].reset_index(drop=True)
utoday_eth_articles = utoday_abt.loc[utoday_abt['coin'].eq('ethereum')].reset_index(drop=True)
utoday_ada_articles = utoday_abt.loc[utoday_abt['coin'].eq('cardano')].reset_index(drop=True)

In [53]:
# Collect the U.Today frames under their ticker and print the date coverage of each
utoday_data = dict(
    BTC=utoday_btc_articles,
    XRP=utoday_xrp_articles,
    ETH=utoday_eth_articles,
    ADA=utoday_ada_articles,
)
coin_details(utoday_data)

BTC
Unique dates: 1298
Articles over 4.895377728495452 Years


XRP
Unique dates: 945
Articles over 4.895377728495452 Years


ETH
Unique dates: 767
Articles over 4.895377728495452 Years


ADA
Unique dates: 578
Articles over 4.506594933503084 Years




# Cointelegraph data

In [54]:
# File name of the Cointelegraph article dump and its full path inside DATA_DIR_ARTICLE
article = 'article_content_cointelegraph.json'
path = os.path.join(DATA_DIR_ARTICLE, article)

In [55]:
# Create cointelegraph article df
cointelegraph_articles = pd.read_json(path)

In [56]:
cointelegraph_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9118 entries, 0 to 9117
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     9069 non-null   object
 1   coin     9118 non-null   object
 2   section  8791 non-null   object
 3   title    9068 non-null   object
 4   text     9118 non-null   object
 5   url      9118 non-null   object
dtypes: object(6)
memory usage: 427.5+ KB


In [57]:
# rename() hands back a new frame, giving a working copy with 'date' renamed to 'datetime'
cointelegraph_abt = cointelegraph_articles.rename(columns={'date': 'datetime'})

In [58]:
# Tag every row with the site it came from
cointelegraph_abt['source'] = 'cointelegraph'

In [59]:
cointelegraph_abt.isna().any()

datetime     True
coin        False
section      True
title        True
text        False
url         False
source      False
dtype: bool

In [60]:
# Drop rows without a date, then rows without a title
for required in ('datetime', 'title'):
    mask = cointelegraph_abt.index[cointelegraph_abt[required].isna()]
    cointelegraph_abt = cointelegraph_abt.drop(index=mask)

In [61]:
# Trim surrounding whitespace from the datetime and text strings
for text_column in ('datetime', 'text'):
    cointelegraph_abt[text_column] = cointelegraph_abt[text_column].str.strip()

In [62]:
# Keep only rows whose datetime starts with "<Mon> <dd>, <yyyy>"; drop the rest
date_pattern = r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{2},\s\d{4}'
has_valid_date = cointelegraph_abt.datetime.str.match(date_pattern)
mask = cointelegraph_abt[~has_valid_date].index
cointelegraph_abt = cointelegraph_abt.drop(index=mask)

In [63]:
# Parse datetime as UTC, split it into date and time columns, then drop it
parsed_datetime = pd.to_datetime(cointelegraph_abt.datetime, infer_datetime_format=True, utc=True)
cointelegraph_abt = cointelegraph_abt.assign(
    datetime=parsed_datetime,
    date=parsed_datetime.dt.date,
    time=parsed_datetime.dt.time,
).drop(columns='datetime')

In [64]:
print('Coin:', cointelegraph_abt.coin.unique())
print('Section:', cointelegraph_abt.section.unique())

Coin: ['bitcoin' 'xrp' 'binance' 'cardano']
Section: [' Price Analysis ' ' Analysis ' ' News ' ' Market Update '
 ' Markets News ' ' Interview ' ' Market Analysis ' ' Newsletter '
 ' BlockShow ' ' Opinion ' None ' New Year Special ' ' Sponsored '
 ' How to crypto ' ' Altcoin Watch ' ' Expert Take ' ' Event Recap '
 ' Event ' ' Follow up ' ' Research ' ' Breaking news ' ' Video '
 ' Announcement ' ' Use Case ' ' Experts Answer ' ' Review ' ' Spotlight '
 ' Explained ' ' Podcast ' ' AUDIO ' ' Thought Leaders ' ' Investigation '
 ' Regulations ' ' Overview ' ' Adoption ' ' Local News ' ' Infographics '
 ' Unitize ' ' In depth ' ' Hodler’s Digest ' ' Profile ' ' Recap ']


In [65]:
# Set section for missing sections
# (two bare look-up expressions that preceded the assignment were removed:
# expressions in the middle of a cell are neither displayed nor stored)
cointelegraph_abt.loc[cointelegraph_abt.section.isna(), 'section'] = 'Other'

In [66]:
# Create coin dfs
cointelegraph_btc_articles = cointelegraph_abt[cointelegraph_abt.coin=='bitcoin'].reset_index(drop=True)
cointelegraph_xrp_articles = cointelegraph_abt[cointelegraph_abt.coin=='xrp'].reset_index(drop=True)
cointelegraph_eth_articles = cointelegraph_abt[cointelegraph_abt.coin=='binance'].reset_index(drop=True)
cointelegraph_ada_articles = cointelegraph_abt[cointelegraph_abt.coin=='cardano'].reset_index(drop=True)
cointelegraph_bnb_articles = cointelegraph_abt[cointelegraph_abt.coin=='binance'].reset_index(drop=True)

In [67]:
cointelegraph_data = {
    'BTC' : cointelegraph_btc_articles , 
    'XRP' : cointelegraph_xrp_articles,
    'ETH' : cointelegraph_eth_articles,
    'ADA' : cointelegraph_ada_articles,
    'BNB' : cointelegraph_bnb_articles}
coin_details(cointelegraph_data)

BTC
Unique dates: 765
Articles over 2.0944988603462087 Years


XRP
Unique dates: 1080
Articles over 9.2678152186561 Years


ETH
Unique dates: 126
Articles over 3.8358077167908995 Years


ADA
Unique dates: 267
Articles over 4.848833309376648 Years


BNB
Unique dates: 126
Articles over 3.8358077167908995 Years




# Concat coin dfs

## Bitcoin 

In [68]:
# Stack the Bitcoin frames of all three sources and order them by date
btc_sources = [coindesk_btc_articles, utoday_btc_articles, cointelegraph_btc_articles]
btc_articles = pd.concat(btc_sources)
btc_articles['date'] = pd.to_datetime(btc_articles['date'], infer_datetime_format=True)
btc_articles = btc_articles.sort_values(by='date').reset_index(drop=True)
btc_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34465 entries, 0 to 34464
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   coin     34465 non-null  object        
 1   section  30598 non-null  object        
 2   title    34465 non-null  object        
 3   text     34465 non-null  object        
 4   url      34465 non-null  object        
 5   source   30598 non-null  object        
 6   date     34465 non-null  datetime64[ns]
 7   time     34465 non-null  object        
dtypes: datetime64[ns](1), object(7)
memory usage: 2.1+ MB


In [69]:
btc_articles.isna().any()

coin       False
section     True
title      False
text       False
url        False
source      True
date       False
time       False
dtype: bool

In [70]:
# Rows without a section get the placeholder 'No Section'
btc_no_section = btc_articles.section.isna()
btc_articles.loc[btc_no_section, 'section'] = 'No Section'

In [71]:
# Detect the language of every article text (detect_languages applies the detector row by row)
btc_articles['language']= detect_languages(btc_articles,'text')

KeyboardInterrupt: 

In [114]:
# Keep only the articles detected as English
btc_is_english = btc_articles['language'].eq('en')
btc_articles = btc_articles.loc[btc_is_english]

In [115]:
btc_articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34193 entries, 0 to 34464
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   coin      34193 non-null  object        
 1   section   34193 non-null  object        
 2   title     34193 non-null  object        
 3   text      34193 non-null  object        
 4   url       34193 non-null  object        
 5   source    34193 non-null  object        
 6   date      34193 non-null  datetime64[ns]
 7   time      34193 non-null  object        
 8   language  34193 non-null  object        
dtypes: datetime64[ns](1), object(8)
memory usage: 2.6+ MB


In [116]:
btc_articles.sample(5)

Unnamed: 0,coin,section,title,text,url,source,date,time,language
8922,bitcoin,Business,NY Regulator Argues BitLicense Regulation Boos...,The New York State Department of Financial Ser...,https://www.coindesk.com/business/2018/04/12/n...,coindesk,2018-04-12,16:30:00,en
3912,bitcoin,Markets,T-Mobile Poland Trials Bitcoin Top-Ups for Mob...,T-Mobile Poland customers can now purchase pre...,https://www.coindesk.com/markets/2015/02/25/t-...,coindesk,2015-02-25,15:20:00,en
28551,bitcoin,Business,Bitcoin Miner TeraWulf Sets 2022 Hashrate Guid...,"TeraWulf (WULF), the environmentally minded bi...",https://www.coindesk.com/business/2022/03/17/b...,coindesk,2022-03-17,23:01:00,en
4246,bitcoin,Markets,Consensus 2015: DOJ's Kathryn Haun to Discuss ...,Consensus 2015 The Silk Road case will make he...,https://www.coindesk.com/markets/2015/05/27/co...,coindesk,2015-05-27,10:28:00,en
25290,bitcoin,Opinion,Contango Conmigo: Why a Bitcoin Futures ETF Co...,"Since at least 2013, when the Winklevoss twins...",https://www.coindesk.com/policy/2021/10/20/con...,coindesk,2021-10-20,16:21:00,en


In [117]:
# Unique dates and covered time span of the Bitcoin articles
print('Unique dates:', btc_articles.date.nunique())
min_date, max_date = min(btc_articles.date), max(btc_articles.date)
seconds_per_year = 365.2425 * 24 * 60 * 60
years = (max_date - min_date).total_seconds() / seconds_per_year
print('Min date', min_date)
print('Max date', max_date)
print(f'Articles over {years} Years')

Unique dates: 3467
Min date 2013-04-01 00:00:00
Max date 2022-12-30 00:00:00
Articles over 9.746948944879088 Years


## XRP

In [None]:
# Stack the XRP frames of all three sources and order them by date
xrp_sources = [coindesk_xrp_articles, utoday_xrp_articles, cointelegraph_xrp_articles]
xrp_articles = pd.concat(xrp_sources)
xrp_articles['date'] = pd.to_datetime(xrp_articles['date'], infer_datetime_format=True)
xrp_articles = xrp_articles.sort_values(by='date').reset_index(drop=True)
xrp_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3991 entries, 0 to 3990
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   coin     3991 non-null   object        
 1   section  1987 non-null   object        
 2   title    3991 non-null   object        
 3   text     3991 non-null   object        
 4   url      3991 non-null   object        
 5   source   3991 non-null   object        
 6   date     3991 non-null   datetime64[ns]
 7   time     3991 non-null   object        
dtypes: datetime64[ns](1), object(7)
memory usage: 249.6+ KB


In [None]:
xrp_articles.isna().any()

coin       False
section     True
title      False
text       False
url        False
source     False
date       False
time       False
dtype: bool

In [None]:
# Rows without a section get the placeholder 'No Section'
xrp_no_section = xrp_articles.section.isna()
xrp_articles.loc[xrp_no_section, 'section'] = 'No Section'

In [None]:
# Detect the language of every article text (detect_languages applies the detector row by row)
xrp_articles['language']= detect_languages(xrp_articles,'text')

In [None]:
# Keep only the articles detected as English
xrp_is_english = xrp_articles['language'].eq('en')
xrp_articles = xrp_articles.loc[xrp_is_english]

In [None]:
xrp_articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3990 entries, 0 to 3990
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   coin      3990 non-null   object        
 1   section   3990 non-null   object        
 2   title     3990 non-null   object        
 3   text      3990 non-null   object        
 4   url       3990 non-null   object        
 5   source    3990 non-null   object        
 6   date      3990 non-null   datetime64[ns]
 7   time      3990 non-null   object        
 8   language  3990 non-null   object        
dtypes: datetime64[ns](1), object(8)
memory usage: 311.7+ KB


In [None]:
# Unique dates and covered time span of the XRP articles
print('Unique dates:', xrp_articles.date.nunique())
min_date, max_date = min(xrp_articles.date), max(xrp_articles.date)
seconds_per_year = 365.2425 * 24 * 60 * 60
years = (max_date - min_date).total_seconds() / seconds_per_year
print('Min date', min_date)
print('Max date', max_date)
print(f'Articles over {years} Years')

Unique dates: 1703
Min date 2013-06-05 00:00:00
Max date 2022-12-28 00:00:00
Articles over 9.563509175410859 Years


## Ethereum

In [None]:
# Stack the Ethereum frames of all three sources and order them by date
eth_sources = [coindesk_eth_articles, utoday_eth_articles, cointelegraph_eth_articles]
eth_articles = pd.concat(eth_sources)
eth_articles['date'] = pd.to_datetime(eth_articles['date'], infer_datetime_format=True)
eth_articles = eth_articles.sort_values(by='date').reset_index(drop=True)
eth_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   coin     5171 non-null   object        
 1   section  3680 non-null   object        
 2   title    5171 non-null   object        
 3   text     5171 non-null   object        
 4   url      5171 non-null   object        
 5   source   5171 non-null   object        
 6   date     5171 non-null   datetime64[ns]
 7   time     5171 non-null   object        
dtypes: datetime64[ns](1), object(7)
memory usage: 323.3+ KB


In [None]:
eth_articles.isna().any()

coin       False
section     True
title      False
text       False
url        False
source     False
date       False
time       False
dtype: bool

In [None]:
# Rows without a section get the placeholder 'No Section'
eth_no_section = eth_articles.section.isna()
eth_articles.loc[eth_no_section, 'section'] = 'No Section'

In [None]:
# Detect the language of every article text (detect_languages applies the detector row by row)
eth_articles['language']= detect_languages(eth_articles,'text')

In [None]:
# Keep only the rows whose detected language is English ('en')
eth_articles = eth_articles.loc[eth_articles['language'].eq('en')]

In [None]:
# Column summary of the frame after the English-only filter
eth_articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5129 entries, 0 to 5170
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   coin      5129 non-null   object        
 1   section   5129 non-null   object        
 2   title     5129 non-null   object        
 3   text      5129 non-null   object        
 4   url       5129 non-null   object        
 5   source    5129 non-null   object        
 6   date      5129 non-null   datetime64[ns]
 7   time      5129 non-null   object        
 8   language  5129 non-null   object        
dtypes: datetime64[ns](1), object(8)
memory usage: 400.7+ KB


In [None]:
# Spot-check five random rows; the fixed random_state keeps this preview
# identical across Restart & Run All.
eth_articles.sample(5, random_state=0)

Unnamed: 0,coin,section,title,text,url,source,date,time,language
35,ethereum,Markets,Digital Currency Exchange Gatecoin Offline Aft...,UPDATE (14th May 11:40 BST): This piece has b...,https://www.coindesk.com/markets/2016/05/13/di...,coindesk,2016-05-13,22:50:00,en
4588,ethereum,Markets,Large Ether Traders Position for Volatility Sp...,Taking unhedged or hedged directional bets on ...,https://www.coindesk.com/markets/2022/08/26/la...,coindesk,2022-08-26,12:23:00,en
1982,ethereum,Business,"Signature Bank Gains $1B Deposits in Q3, With ...",The Takeaway: Deposits at crypto-friendly Sign...,https://www.coindesk.com/business/2020/10/20/s...,coindesk,2020-10-20,13:51:00,en
4277,ethereum,No Section,Ethereum's Arbitrum to Be Used by Norwegian Go...,"For the first time, a major country has begun ...",https://u.today/ethereums-arbitrum-to-be-used-...,utoday,2022-06-25,16:45:00,en
403,ethereum,Features,Brazil's Central Bank Is Ramping Up Blockchain...,The Central Bank of Brazil is building with ju...,https://www.coindesk.com/markets/2017/11/13/br...,coindesk,2017-11-13,13:00:00,en


In [None]:
# Unique dates and the time span covered by the Ethereum articles
print('Unique dates:', eth_articles.date.nunique())
# Series.min()/.max() are vectorized and skip NaT by default, whereas the
# builtin min()/max() compare element by element in Python.
min_date = eth_articles.date.min()
max_date = eth_articles.date.max()
# Span expressed in years of 365.2425 days
years = (max_date - min_date).total_seconds() / (365.2425 * 24 * 60 * 60)
print('Min date', min_date)
print('Max date', max_date)
print(f'Articles over {years} Years')

Unique dates: 1755
Min date 2015-10-06 00:00:00
Max date 2022-12-28 00:00:00
Articles over 7.228074498449661 Years


## Cardano

In [None]:
# Merge the per-source Cardano frames, parse `date` into datetimes and
# order the rows by date with a fresh 0..n-1 index.
# NOTE(review): infer_datetime_format is deprecated in pandas >= 2.0 — drop it when upgrading.
ada_articles = (
    pd.concat([coindesk_ada_articles, utoday_ada_articles, cointelegraph_ada_articles])
    .assign(date=lambda frame: pd.to_datetime(frame.date, infer_datetime_format=True))
    .sort_values('date')
    .reset_index(drop=True)
)
ada_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1662 entries, 0 to 1661
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   coin     1662 non-null   object        
 1   section  366 non-null    object        
 2   title    1662 non-null   object        
 3   text     1662 non-null   object        
 4   url      1662 non-null   object        
 5   source   1662 non-null   object        
 6   date     1662 non-null   datetime64[ns]
 7   time     1662 non-null   object        
dtypes: datetime64[ns](1), object(7)
memory usage: 104.0+ KB


In [None]:
# Per column: does it contain at least one missing value?
ada_articles.isna().any()

coin       False
section     True
title      False
text       False
url        False
source     False
date       False
time       False
dtype: bool

In [None]:
# Rows with a missing section get the explicit placeholder 'No Section'
ada_articles['section'] = ada_articles['section'].fillna('No Section')

In [None]:
# Tag every article with the language detected from its text
# (detect_languages yields 'unknown' when detection fails).
ada_articles = ada_articles.assign(language=detect_languages(ada_articles, 'text'))

In [None]:
# Keep only the rows whose detected language is English ('en')
ada_articles = ada_articles.loc[ada_articles['language'].eq('en')]

In [None]:
# Column summary of the frame after the English-only filter
ada_articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1661 entries, 0 to 1661
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   coin      1661 non-null   object        
 1   section   1661 non-null   object        
 2   title     1661 non-null   object        
 3   text      1661 non-null   object        
 4   url       1661 non-null   object        
 5   source    1661 non-null   object        
 6   date      1661 non-null   datetime64[ns]
 7   time      1661 non-null   object        
 8   language  1661 non-null   object        
dtypes: datetime64[ns](1), object(8)
memory usage: 129.8+ KB


In [None]:
# Spot-check five random rows; the fixed random_state keeps this preview
# identical across Restart & Run All.
ada_articles.sample(5, random_state=0)

Unnamed: 0,coin,section,title,text,url,source,date,time,language
114,cardano,No Section,Cardano (ADA) Releases Upgraded Byron Codebase...,"The Byron 'reboot', which will be available on...",https://u.today/cardano-ada-releases-upgraded-...,utoday,2020-03-27,14:46:00,en
546,cardano,No Section,Hoskinson Talks Digitization with Ethiopian Mi...,"On his Cardano tour taking place in Africa, co...",https://u.today/hoskinson-talks-digitization-w...,utoday,2021-10-28,12:53:00,en
402,cardano,No Section,Cardano (ADA) Developers Introduce New Token T...,Cardano Foundation pioneers the concept of Non...,https://u.today/cardano-ada-developers-introdu...,utoday,2021-08-11,15:26:00,en
599,cardano,No Section,ADA Price Approaches Critical Threshold as Car...,Cardano Foundation lauds 13 major notable col...,https://u.today/ada-price-approaches-critical-...,utoday,2021-12-27,13:52:00,en
23,cardano,News,"Crypto Markets See Solid Upswing, Bitcoin Pus...","Crypto markets are seeing mostly green today, ...",https://cointelegraph.com/news/crypto-markets-...,cointelegraph,2018-07-21,00:00:00,en


In [None]:
# Unique dates and the time span covered by the Cardano articles
print('Unique dates:', ada_articles.date.nunique())
# Series.min()/.max() are vectorized and skip NaT by default, whereas the
# builtin min()/max() compare element by element in Python.
min_date = ada_articles.date.min()
max_date = ada_articles.date.max()
# Span expressed in years of 365.2425 days
years = (max_date - min_date).total_seconds() / (365.2425 * 24 * 60 * 60)
print('Min date', min_date)
print('Max date', max_date)
print(f'Articles over {years} Years')

Unique dates: 756
Min date 2017-01-03 00:00:00
Max date 2022-12-28 00:00:00
Articles over 5.98232681026989 Years


## Binance

In [None]:
# Merge the per-source Binance frames, parse `date` into datetimes and
# order the rows by date with a fresh 0..n-1 index.
# NOTE(review): infer_datetime_format is deprecated in pandas >= 2.0 — drop it when upgrading.
bnb_articles = (
    pd.concat([coindesk_bnb_articles, cointelegraph_bnb_articles])
    .assign(date=lambda frame: pd.to_datetime(frame.date, infer_datetime_format=True))
    .sort_values('date')
    .reset_index(drop=True)
)
bnb_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   coin     285 non-null    object        
 1   section  285 non-null    object        
 2   title    285 non-null    object        
 3   text     285 non-null    object        
 4   url      285 non-null    object        
 5   source   285 non-null    object        
 6   date     285 non-null    datetime64[ns]
 7   time     285 non-null    object        
dtypes: datetime64[ns](1), object(7)
memory usage: 17.9+ KB


In [None]:
# Tag every article with the language detected from its text
# (detect_languages yields 'unknown' when detection fails).
bnb_articles = bnb_articles.assign(language=detect_languages(bnb_articles, 'text'))

In [None]:
# Keep only the rows whose detected language is English ('en')
bnb_articles = bnb_articles.loc[bnb_articles['language'].eq('en')]

In [None]:
# Column summary of the frame after the English-only filter
bnb_articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 284 entries, 0 to 284
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   coin      284 non-null    object        
 1   section   284 non-null    object        
 2   title     284 non-null    object        
 3   text      284 non-null    object        
 4   url       284 non-null    object        
 5   source    284 non-null    object        
 6   date      284 non-null    datetime64[ns]
 7   time      284 non-null    object        
 8   language  284 non-null    object        
dtypes: datetime64[ns](1), object(8)
memory usage: 22.2+ KB


In [None]:
# Spot-check five random rows; the fixed random_state keeps this preview
# identical across Restart & Run All.
bnb_articles.sample(5, random_state=0)

Unnamed: 0,coin,section,title,text,url,source,date,time,language
189,binance,Altcoin Watch,Here’s why Binance Coin is 33% down from its ...,"Binance Coin ( BNB ) holders enjoyed a 1,760% ...",https://cointelegraph.com/news/here-s-why-bina...,cointelegraph,2022-01-19,00:00:00,en
152,binance,Altcoin Watch,Binance Coin regains 20% in a day: Why is BNB...,"The price of Binance Coin ( BNB ), the native ...",https://cointelegraph.com/news/binance-coin-re...,cointelegraph,2021-04-19,00:00:00,en
244,binance,Opinion,A Stablecoin Law May Not Happen This Year,Hey folks. Monday was a federal holiday in the...,https://www.coindesk.com/policy/2022/09/07/a-s...,coindesk,2022-09-07,17:30:00,en
148,binance,News,Messari researchers slam Binance Smart Chain ...,Despite Binance supporters celebrating Binance...,https://cointelegraph.com/news/messari-researc...,cointelegraph,2021-04-12,00:00:00,en
5,binance,Markets,Malta Passes Trio of Bills as Part of 'Blockch...,Malta has taken a significant step toward beco...,https://www.coindesk.com/markets/2018/06/27/ma...,coindesk,2018-06-27,14:45:00,en


In [None]:
# Unique dates and the time span covered by the Binance articles
print('Unique dates:', bnb_articles.date.nunique())
# Series.min()/.max() are vectorized and skip NaT by default, whereas the
# builtin min()/max() compare element by element in Python.
min_date = bnb_articles.date.min()
max_date = bnb_articles.date.max()
# Span expressed in years of 365.2425 days
years = (max_date - min_date).total_seconds() / (365.2425 * 24 * 60 * 60)
print('Min date', min_date)
print('Max date', max_date)
print(f'Articles over {years} Years')

Unique dates: 246
Min date 2018-04-15 00:00:00
Max date 2022-12-26 00:00:00
Articles over 4.69824842399228 Years


## Save cleaned DataFrames

In [None]:
# Write each per-coin frame to <BASE_DIR>/Datasets/<name>.csv.
# Frames and file names are paired by position; the row index is not written.
dataframes = [btc_articles, xrp_articles, eth_articles, ada_articles, bnb_articles]
filenames = [
    'btc_articles.csv',
    'xrp_articles.csv',
    'eth_articles.csv',
    'ada_articles.csv',
    'bnb_articles.csv',
]

for filename, df in zip(filenames, dataframes):
    filepath = os.path.join(BASE_DIR, 'Datasets', filename)
    df.to_csv(filepath, index=False)