In [10]:
import requests      
from datetime import datetime
from bs4 import BeautifulSoup  
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
import re, csv, string 
import gensim
from gensim import corpora, models

stop_words = stopwords.words('english')
keywords = ['bitcoin','cryptocurrency','cryptocurrencies', 'crypto', 'blockchain']

# wordnet - treebank map tagging system function
def get_wordnet_pos(pos_tag):
    # if pos tag starts with 'J'
    if pos_tag.startswith('J'):
        # return wordnet tag "ADJ"
        return wordnet.ADJ
    # if pos tag starts with 'V'
    elif pos_tag.startswith('V'):
        # return wordnet tag "VERB"
        return wordnet.VERB
    # if pos tag starts with 'N'
    elif pos_tag.startswith('N'):
        # return wordnet tag "NOUN"
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        # be default, return wordnet tag "NOUN"
        return wordnet.NOUN
    
    
# takes a string (document) as input and strips all tokens down to their root (lemmatized) form
def lemmatize(document):
    pattern=r'[a-zA-Z]+[a-zA-Z\-\.]'      
    tokens = nltk.regexp_tokenize(document, pattern)
    tagged_tokens = nltk.pos_tag(tokens)
    wordnet_lemmatizer = WordNetLemmatizer()
    # get lemmatized tokens
    # lemmatize every word in tagged_tokens
    le_words=[wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(tag)) \
              # tagged_tokens is a list of tuples (word, tag)
              for (word, tag) in tagged_tokens \
              # remove stop words
              if word not in stop_words and \
              # remove punctuations
              word not in string.punctuation]
    le_words = list(set(le_words))
    return le_words

def get_headline_list(filtered_data):
    headlines_list = []
    for row in filtered_data:
        headlines_list.append(row[0])
    return headlines_list

def tokenize_date(s):
    pattern=r'[a-z{3}]+[\.\s]+[\d{1,2}\,\s]+[\d{4}]+'                        
    tokens=nltk.regexp_tokenize(s, pattern)
    date = tokens
    return date;

def remove_keywords(data):
    for row in data:
        headline_test = row[0]
        for word in headline_test:
            if word in keywords:
                headline_test.remove(word)
    return data

def get_bloomberg_headlines():
    headlines=[]  # list variable to store headlines
    dates = []    # list variable to store dates
    page_number = 1
#     print 'scraping page %s' % page_number
    page_url="https://www.bloomberg.com/search?query=bitcoin&sort=time:desc&endTime=2017-10-13T04:36:58.003Z&page="+str(page_number)
    # loop until page 76
    while page_url!="https://www.bloomberg.com/search?query=bitcoin&sort=time:desc&endTime=2017-10-13T04:36:58.003Z&page=76":
        page_url="https://www.bloomberg.com/search?query=bitcoin&sort=time:desc&endTime=2017-10-13T04:36:58.003Z&page="+str(page_number)        
        page = requests.get(page_url) 
        page_number += 1
#         print 'scraping page %s' % page_number
        if page.status_code!=200:  
            page_number = 76 #if page status code fails to equal 200, connection failed; set page_num to while loop condition
        else:                   
            soup = BeautifulSoup(page.content, 'html.parser')                        
            
#-----------scrape and clean all headlines and append to a list called headlines----------------------------------------------
            for header in soup.find_all('h1', class_ ='search-result-story__headline'):
                headline = header.get_text().lower()
                headlines.append(lemmatize(headline))
                
#-----------scrape all dates and append to a list called dates---------------------------------------------------------------
            for date in soup.find_all('time', class_ = 'published-at'):
                date_published = date.get_text()
                dates.append(date_published)
                 
#---join headlines list and dates list into a list of tuples called data-----------------------------------------------------
    data = zip(headlines, dates)

#---remove headlines without these keywords----------------------------------------------------------------------------------
    filtered_data = filter(lambda x: any(word in x[0] for word in keywords), data)
    
# this removes the above keywords from each headline to improve performance of the clustering algorithm
    for row in filtered_data:
        headline_test = row[0]
        for word in headline_test:
            if word in keywords:
                headline_test.remove(word)
    
    return filtered_data

def coindesk_market_parser():
    headlines=[]
    page_url= "https://www.coindesk.com/category/markets-news/markets-markets-news/markets-bitcoin/"
    while page_url!=None:
        page = requests.get(page_url) 
        if page.status_code!=200:
             page_url=None
        else:
            all_data = []
            for num in range(1,20):
                page_url = "https://www.coindesk.com/category/markets-news/markets-markets-news/markets-bitcoin/page/"+str(num)+"/"
                page = requests.get(page_url)
                soup = BeautifulSoup(page.content, "html.parser")
                titles = soup.find_all("a", class_ = "fade")
                titles_dictionary = []
                for i in titles:
                    title = i.get_text().lower()
                    titles_dictionary.append(lemmatize(title))
                dates = soup.find_all("time")
                dates_list = []
                for i in dates:
                    date = i.get_text().lower()
                    date = date.replace(' at', '')
                    date = datetime.strptime(date, '%b %d, %Y %H:%M')
                    date = datetime.strftime(date,'%b-%d-%Y')
                    dates_list.append(date)
                data = zip(titles_dictionary,dates_list)  
                all_data.extend(data)

        page_url = None
    return all_data

def coindesk_parser():    
    headlines=[]
    page_url= "https://www.coindesk.com/category/technology-news/bitcoin/"
    while page_url!=None:
        page = requests.get(page_url) 
        if page.status_code!=200:
             page_url=None
        else:
            all_data = []
            for num in range(1,50):
                page_url = "https://www.coindesk.com/category/technology-news/bitcoin/page/"+str(num)+"/"
                page = requests.get(page_url)
                soup = BeautifulSoup(page.content, "html.parser")
                titles = soup.find_all("a", class_ = "fade")
                titles_dictionary = []
                for i in titles:
                    title = i.get_text().lower()
                    titles_dictionary.append(lemmatize(title))
                dates = soup.find_all("time")
                dates_list = []
                for i in dates:
                    date = i.get_text().lower()
                    date = date.replace(' at', '')
                    date = datetime.strptime(date, '%b %d, %Y %H:%M')
                    date = datetime.strftime(date,'%b-%d-%Y')
                    dates_list.append(date)
                data = zip(titles_dictionary,dates_list)  
                all_data.extend(data)
        page_url = None
    return all_data


def marketwatch_parser():
    headlines=[]
    page_url= "https://www.marketwatch.com/search?q=bitcoin&m=Keyword&rpp=500&mp=806&bd=false&bdv=&rs=false"
    while page_url!=None:
        page = requests.get(page_url) 
        if page.status_code!=200:
             page_url=None
        else:
            all_data = []
            page_url = "https://www.marketwatch.com/search?q=bitcoin&m=Keyword&rpp=500&mp=806&bd=false&bdv=&rs=false"
            page = requests.get(page_url)
            soup = BeautifulSoup(page.content, "html.parser")
            divs = soup.find_all("div", class_ = "searchresult")
            titles_dictionary = []
            for idx, div in enumerate(divs):
                titles = div.select("a")
                if titles != []:
                    title = titles[0].get_text().lower()
                    titles_dictionary.append(lemmatize(title))
            divs_dates = soup.find_all("div", class_ = "deemphasized")
            dates_list = []
            for idx, div in enumerate(divs_dates):
                date = div.get_text().lower()
                date = str(tokenize_date(date)[0])
                dates_list.append(date)
            data = zip(titles_dictionary,dates_list)  
            all_data.extend(data)
        page_url = None

        filtered_data = filter(lambda x: any(word in x[0] for word in keywords), all_data)
    return filtered_data

def get_bitcoinNews_headlines():
    headlines=[]  # list variable to store headlines
    dates = []    # list variable to store dates
    page_number = 1
#     print 'scraping page %s' % page_number
    page_url="https://news.bitcoin.com/page/"+str(page_number)+"/?s=bitcoin"
    # page 115 as parameter
    while page_url!="https://news.bitcoin.com/page/115/?s=bitcoin":
        page_url="https://news.bitcoin.com/page/"+str(page_number)+"/?s=bitcoin"        
        page = requests.get(page_url) 
        page_number += 1
#         print 'scraping page %s' % page_number
        if page.status_code!=200:  
            page_number = None #if page status code fails to equal 200, connection failed; set page_num to while loop condition
        else:                   
            soup = BeautifulSoup(page.content, 'html.parser')                        
            
#-----------scrape and clean all headlines and append to a list called headlines----------------------------------------------
            main_content = soup.find('div', class_ = 'td-ss-main-content')            
            h3s = main_content.find_all('h3', class_ = "entry-title td-module-title")
            for idx, h3 in enumerate(h3s):
                header = h3.select('a')
                if header != []:
                    headline = header[0].get_text().lower()
                    headlines.append(lemmatize(headline))

#-----------scrape all dates and append to a list called dates---------------------------------------------------------------
            span_dates = main_content.find_all("span", class_ = "td-post-date")
            for idx, span in enumerate(span_dates):
                dates_list = span.select('time')
                if dates_list != []:
                    date = dates_list[0].get_text()
                    dates.append(date)
    
#---join headlines list and dates list into a list of tuples called data-----------------------------------------------------
    data = zip(headlines, dates)
    
    return data




if __name__ == "__main__":
    bloomberg = get_bloomberg_headlines()
    print("done scraping bloomberg")
    coindesk_market = coindesk_market_parser()
    print("done scraping coindesk market")
    coindesk = coindesk_parser()
    print("done scraping coindesk")
    marketwatch = marketwatch_parser()
    print("done scraping marketwatch")
    bitcoinNews = get_bitcoinNews_headlines()
    print("done scraping bitcoin news")

    all_data = []
    all_data.extend(bloomberg + coindesk_market + coindesk + marketwatch + bitcoinNews)
    all_data = remove_keywords(all_data)
#     print(all_data)
    
    headline_list = get_headline_list(all_data)
    dictionary = corpora.Dictionary(headline_list)
    corpus = [dictionary.doc2bow(row) for row in headline_list]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word = dictionary, passes=20)
    print(ldamodel.print_topics(num_topics=5, num_words=5))


done scraping bloomberg
done scraping coindesk market
done scraping coindesk
done scraping marketwatch
done scraping bitcoin news
[(0, u'0.019*"etf" + 0.012*"sec" + 0.007*"company" + 0.007*"government" + 0.006*"mainstream"'), (1, u'0.022*"new" + 0.010*"china" + 0.009*"transaction" + 0.008*"network" + 0.008*"launch"'), (2, u'0.027*"exchange" + 0.016*"bank" + 0.014*"mining" + 0.013*"new" + 0.010*"bitcoin"'), (3, u'0.022*"price" + 0.011*"get" + 0.008*"pay" + 0.007*"market" + 0.007*"set"'), (4, u'0.019*"market" + 0.017*"high" + 0.014*"time" + 0.012*"price" + 0.008*"say"')]
