In [None]:
import json
import datetime as dt

In [None]:
import graphlab

# Let's explore the retrieved tweets

In [None]:
# Send GraphLab Canvas output to the 'ipynb' target before any data is shown.
graphlab.canvas.set_target('ipynb')

# Read the tweet dump from the JSON file that sits next to this notebook.
Tweets = graphlab.SFrame.read_json('tweets2.JSON')

## Decide what data is relevant to Firebird 

In [None]:
def _parse_created_at(raw_stamp):
    """Turn one 'created_at' string into a datetime.

    The layout is fixed and expects a literal '+0000' offset in the string;
    any other offset makes strptime raise ValueError.
    """
    return dt.datetime.strptime(raw_stamp, '%a %b %d %H:%M:%S +0000 %Y')


# Parsed timestamp column derived from the raw 'created_at' strings.
Tweets['date_time'] = Tweets['created_at'].apply(_parse_created_at)

In [None]:
# Show the first parsed 'date_time' value as a quick sanity check of the parsing step.
Tweets['date_time'][0]

In [None]:
# Point GraphLab Canvas at the 'ipynb' target.
# NOTE(review): the data-loading cell already makes this exact call, so this
# cell looks redundant on a top-to-bottom run — confirm before removing.
graphlab.canvas.set_target('ipynb')

In [None]:
# Display the tweets ordered from most to least retweeted.
# The sorted result is not assigned, so `Tweets` is left as it was.
Tweets.sort('retweet_count', ascending=False)

In [None]:
# Per-tweet word counts computed from the lower-cased tweet text.
Tweets['word_count'] = graphlab.text_analytics.count_words(
    Tweets['text'],
    to_lower=True,
)

In [None]:
# Per-tweet counts of word-level 2-grams (lower-cased, ignore_punct enabled).
Tweets['bigram_count'] = graphlab.text_analytics.count_ngrams(
    Tweets['text'],
    n=2,
    method='word',
    to_lower=True,
    ignore_punct=True,
)

In [None]:
# Per-tweet counts of word-level 3-grams (lower-cased, ignore_punct enabled).
Tweets['trigram_count'] = graphlab.text_analytics.count_ngrams(
    Tweets['text'],
    n=3,
    method='word',
    to_lower=True,
    ignore_punct=True,
)

In [None]:
def _hashtag_total(row):
    """Return how many items are in the row's entities['hashtags'] collection."""
    return len(row['entities']['hashtags'])


# Number of hashtags attached to each tweet.
Tweets['#_of_hashtags'] = Tweets.apply(_hashtag_total)

In [None]:
def _hashtag_entries(row):
    """Return the row's entities['hashtags'] value unchanged."""
    return row['entities']['hashtags']


# Lift the nested hashtag collection into its own top-level column.
Tweets['hashtags'] = Tweets.apply(_hashtag_entries)

In [None]:
def _link_total(row):
    """Return how many items are in the row's entities['urls'] collection."""
    return len(row['entities']['urls'])


# Number of URLs attached to each tweet.
Tweets['#_of_links'] = Tweets.apply(_link_total)

In [None]:
def _word_entry_total(row):
    """Return the number of entries in the row's 'word_count' value.

    NOTE(review): this is the size of the word-count mapping, so a word that
    occurs several times in a tweet is presumably counted once — confirm that
    "distinct words" is the intended meaning of 'words_in_text'.
    """
    return len(row['word_count'])


# Size of each tweet's word-count mapping.
Tweets['words_in_text'] = Tweets.apply(_word_entry_total)

### Get rid of users with 100 or fewer followers and keep only unique tweets

In [None]:
# Row count before the follower filter is applied (for comparison with the count afterwards).
len(Tweets)

In [None]:
# Keep only rows whose 'followed_by' value is strictly greater than 100.
# This cell applies the follower filter only; it does not de-duplicate tweets.
has_enough_followers = Tweets['followed_by'] > 100
Tweets = Tweets[has_enough_followers]

In [None]:
# Row count after the follower filter, to see how many rows were removed.
len(Tweets)

# Time to create the prediction model

## Create training and test datasets

In [None]:
# Random split with fraction 0.8 for the first part; the fixed seed keeps the split repeatable.
train_data, test_data = Tweets.random_split(fraction=.8, seed=0)

In [None]:
# Every candidate feature column: the three text-count columns first,
# then the account-level and per-tweet numeric columns.
all_features = [
    'word_count',
    'bigram_count',
    'trigram_count',
    'friended_by',
    'followed_by',
    '#_of_lists',
    '#_of_hashtags',
    '#_of_links',
    'words_in_text',
]

# Text-count columns only (word, trigram and bigram counts).
gram_features = [
    'word_count',
    'trigram_count',
    'bigram_count',
]

In [None]:
# Linear regression of 'retweet_count' on the n-gram count columns only.
# No validation set is passed; an L1 penalty of 0.5 is applied and the
# solver is capped at 25 iterations.
tweet_ngram_model = graphlab.linear_regression.create(
    train_data,
    target='retweet_count',
    features=gram_features,
    validation_set=None,
    l1_penalty=.5,
    max_iterations=25,
)

In [None]:
# Pull the fitted coefficient table out of the n-gram model.
coEF = tweet_ngram_model['coefficients']

In [None]:
# Order the coefficient table by 'value', largest first.
# NOTE: `coEf` (sorted) and `coEF` (as returned by the model) differ only in
# the case of the last letter — take care not to mix them up.
coEf = coEF.sort('value', ascending=False)

In [None]:
# Coefficients whose 'name' is 'word_count'; print up to 50 rows.
word_count_rows = coEf[coEf['name'] == 'word_count']
word_count_rows.print_rows(50)

In [None]:
# Coefficients whose 'name' is 'bigram_count', shown with the default display.
bigram_count_rows = coEf[coEf['name'] == 'bigram_count']
bigram_count_rows

In [None]:
# Coefficients whose 'name' is 'trigram_count'; print up to 30 rows.
trigram_count_rows = coEf[coEf['name'] == 'trigram_count']
trigram_count_rows.print_rows(30)