# To run this notebook, first install the following packages inside the Docker container (skip any that are already installed):
```
pip install textblob
pip install nltk
pip install twitterscraper
pip install pandas_datareader
pip install yahoo-finance
```

In [7]:
from twitterscraper import query_tweets
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
import seaborn as sns
import hypertools as hyp
import numpy as np
from textblob import TextBlob as tb
import pandas_datareader as pdr
import pandas as pd
import datetime as dt
from yahoo_finance import Share
import nltk
nltk.download('brown')
nltk.download('punkt')
%matplotlib inline

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Define some useful Twitter-related functions

- Find most recent tweets containing a given keyword
- Fit topic models to a set of tweets
- Do sentiment analyses on tweets
- Get the tweet text and dates

In [8]:
def twitter_witch(keywords, n_tweets=500, n_topics=10, n_words=5, model=None):
    """Scrape Twitter for one or more keywords and summarize the matching tweets.

    Parameters
    ----------
    keywords : str, or list/tuple of str
        Search query. A list or tuple is combined into a single query string
        with the items separated by " OR ".
    n_tweets : int
        Maximum number of tweets to keep.
    n_topics : int
        Number of topics for the LDA model that is built when `model` is None.
    n_words : int
        Number of top words to report for each topic.
    model : optional
        Topic model to use instead of building a new LDA model. It is fit to
        the scraped tweets, so any previous fit is replaced.

    Returns
    -------
    dict with the keys:
    - tweets: the tweet text (list of length n_tweets)
    - datetimes: the `timestamp` attribute of each tweet (list of length n_tweets)
    - topicvecs: the tweet topic vectors (numpy array with n_tweets rows and
      n_topics columns)
    - topwords: the top n_words words from each topic (list of length n_topics,
      where each element is a list of n_words words)
    - sentiments: the sentiment valence of each tweet (numpy array of length
      n_tweets); the valence is the sum of the polarity of the tweet's sentences
    - model: the fitted topic model
    """
    # combine several keywords into a single query string, separated by " OR "
    if isinstance(keywords, (list, tuple)):
        keywords = ' OR '.join(keywords)

    # get the tweets (the scraper may hand back more than requested, so truncate)
    tweets = list(query_tweets(keywords, n_tweets)[:n_tweets])
    tweet_text = [tweet.text for tweet in tweets]
    tweet_dts = [tweet.timestamp for tweet in tweets]

    # bag-of-words counts for the topic model
    n_features = 1000
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(tweet_text)
    # NOTE(review): newer scikit-learn releases rename get_feature_names and the
    # n_topics argument below -- confirm against the installed version
    vocab = tf_vectorizer.get_feature_names()

    if model is None:
        lda = LDA(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
    else:
        lda = model

    # fit exactly once and keep the per-tweet topic weights (one row per tweet);
    # lda.components_ is indexed by topic and vocabulary word, not by tweet
    tweet_topics = lda.fit_transform(tf)

    def get_top_words(fitted, words, n_top):
        # indices of the n_top largest weights in each topic, largest first
        return [[words[i] for i in topic.argsort()[:-n_top - 1:-1]]
                for topic in fitted.components_]

    def tweet_sentiment(tweet):
        # total polarity over all sentences in the tweet
        polarities = [sentence.sentiment.polarity for sentence in tb(tweet).sentences]
        return np.sum(np.array(polarities))

    return {'tweets': tweet_text,
            'datetimes': tweet_dts,
            'topicvecs': tweet_topics,
            'topwords': get_top_words(lda, vocab, n_words),
            'sentiments': np.array([tweet_sentiment(text) for text in tweet_text]),
            'model': lda}

# Define some useful finance-related functions

Given a stock ticker symbol and a date, return a dictionary with the following keys/values (strings or floats, or None if unavailable):
- name: The company name
- open: The opening price from that day
- close: The closing price from that day
- vol: The trading volume from that day
- price_change: The change in price from the previous day, in whatever the trading currency is
- percent_change: The change in price from the previous day, as a percentage
- currency: The currency (e.g. USD)

In [3]:
def date2str(date):
    """Format a date as a 'YYYY-MM-DD' string.

    Accepts datetime.datetime as well as plain datetime.date objects (the
    instance method is used, so anything providing a compatible strftime works).
    """
    return date.strftime('%Y-%m-%d')

In [4]:
def finance_wizard(name, date):
    """Look up one day of trading information for a stock ticker symbol.

    Parameters
    ----------
    name : str
        Ticker symbol (case-insensitive; it is upper-cased before use).
    date : datetime.date, datetime.datetime or pandas.Timestamp
        The day of interest; any time-of-day part is ignored.

    Returns
    -------
    dict with the keys name, open, close, vol, price_change, percent_change
    and currency. A value is None when it is unavailable: every value is None
    if the ticker symbol is not found, and the price fields stay None when
    there is no price row for `date`.
    """
    symbol = name.upper()
    share = Share(symbol)
    info = {'name': None,
            'open': None,
            'close': None,
            'vol': None,
            'price_change': None,
            'percent_change': None,
            'currency': None}

    info['name'] = share.get_name()
    if info['name'] is None:  # ticker symbol not found
        return info

    # TODO: 'currency' is not filled in yet -- needs a source for the trading currency

    # Fetch a week of daily prices ending on `date` so that the previous
    # trading day is included even across weekends and holidays.
    # NOTE(review): assumes the 'yahoo' source returns a date-indexed frame with
    # 'Open', 'Close' and 'Volume' columns -- confirm against pandas-datareader
    day = pd.Timestamp(date).normalize()
    prices = pdr.data.DataReader(symbol, 'yahoo', day - dt.timedelta(days=7), day).sort_index()

    if day not in prices.index:  # no trading data for that day
        return info

    today = prices.loc[day]
    info['open'] = float(today['Open'])
    info['close'] = float(today['Close'])
    info['vol'] = float(today['Volume'])

    earlier = prices.loc[prices.index < day]
    if len(earlier) > 0:
        prev_close = float(earlier['Close'].iloc[-1])
        info['price_change'] = info['close'] - prev_close
        if prev_close != 0:
            info['percent_change'] = 100.0 * info['price_change'] / prev_close

    return info
    
    
    
    

In [5]:
# The cells below call Share methods on x (get_name, get_open, get_historical),
# so bind x to a Share here; the downloaded price history gets its own name.
aapl_prices = pdr.get_data_yahoo('AAPL')
x = Share('AAPL')

In [9]:
# scrape up to 100 tweets matching "apple" and summarize them
tweets = twitter_witch(keywords='apple', n_tweets=100)

datetime.datetime(2017, 10, 7, 18, 24, 24)

In [15]:
# Request 'yahoo-actions' data for AAPL over the one-day window that ends the day
# before the first tweet.
# NOTE(review): when this cell was run it raised the ValueError shown in the output
# below. Presumably there was nothing to return for this window -- TODO confirm,
# and guard the call or use a different data source if prices are what is wanted.
data = pdr.data.DataReader('AAPL', 'yahoo-actions', tweets['datetimes'][0] - dt.timedelta(2), tweets['datetimes'][0] - dt.timedelta(1))

ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

In [13]:
# NOTE(review): the output below is a company name, so x must be a yahoo_finance
# Share here, yet the visible assignment to x earlier in the notebook is the result
# of pdr.get_data_yahoo -- confirm x is rebound to Share('AAPL') before this cell.
x.get_name()

'Apple Inc.'

In [14]:
# The opening price comes back as a string (see the output below), so convert it
# with float() before doing arithmetic on it.
x.get_open()

'154.97'

In [18]:
# presumably a deliberately invalid ticker symbol, used by the next cell to see how
# Share reports an unknown symbol
m = Share('asdlfkj')

In [20]:
# an unknown ticker symbol has no name; compare against None by identity
m.get_name() is None

True

In [69]:
# timestamp of the first tweet, shifted back by one day
tweets['datetimes'][0] - dt.timedelta(days=1)

datetime.datetime(2017, 10, 6, 18, 2, 51)

In [72]:
# NOTE(review): this call raised YQLResponseMalformedError when it was run (see the
# output below), so historical prices could not be fetched this way.
x.get_historical('2017-10-05', '2017-10-07')

YQLResponseMalformedError: Response malformed.

In [67]:
# Same request with the date range derived from the first tweet's timestamp
# (two days before it, through the day of the tweet).
# NOTE(review): this also raised YQLResponseMalformedError when run (see output below).
historical = x.get_historical(date2str(tweets['datetimes'][0] - dt.timedelta(2)), date2str(tweets['datetimes'][0]))

YQLResponseMalformedError: Response malformed.

In [73]:
import pandas as pd

In [None]:
# NOTE(review): unfinished cell that was never executed (no execution count) --
# complete the expression or delete the cell.
pd.get