# Train a regressor on a set of embeddings of tweet texts

Use **GetOldTweets3** library (available via Pypi)

## Install required libraries

In [1]:
!pip install GetOldTweets3
import GetOldTweets3 as got



In [2]:
# !pip install basilica # might have to install, if not available in underlying environment

In [3]:
import pandas as pd
# import json
import numpy as np
import sklearn.linear_model
import sklearn.preprocessing
import sklearn.decomposition
import basilica
import pickle
from pathlib import Path


## Configure

In [4]:
twitter_user_name = 'LambdaSchool'
count = 100 # during testing
API_KEY =  'get from local or env'

## Data Engineering

In [5]:
def tweet_to_dict(twt):
    """Munges a twt object into a dict, using names of attributes of
    object as keys in dict.
    'favorites' is a count of 'likes'
    'hashtags' is a string that is a space-separated series of hashtags
    'mentions' is a string that is a space-separated series of ats (@s)
    'urls' is a string that is a space-separated series of URLs
    """
    return {'date' : twt.date
            , 'favorites' : twt.favorites
            , 'formatted_date' : twt.formatted_date
            , 'geo' : twt.geo
            , 'hashtags' : twt.hashtags
            , 'id' : twt.id
            , 'mentions' : twt.mentions
            , 'permalink' : twt.permalink
            , 'replies' : twt.replies
            , 'retweets' : twt.retweets
            , 'text' : twt.text
            , 'to' : twt.to
            , 'urls' : twt.urls
            , 'username' : twt.username}   

def munge_date(dt):
    """Munges a datetime.datetime object into a dict, using names of attributes of
    object as keys in dict.
    'day_of_week' is [0-7] with 0 being 'Monday'
    'minute_of_day' is count of minutes from midnight"""
    return {'year' : dt.year 
            , 'month' : dt.month
            , 'day' : dt.day
            , 'day_of_week' : dt.weekday()
            , 'hour' : dt.hour
            , 'minute' : dt.minute
            , 'minute_of_day' : (60 * dt.hour) + dt.minute}


def join_dicts(got_tweet_object):
    """Returns a dict that is the result of joining 
    - a dict that is the result of parsing a GOT object
      to  dict, and
    - a dict that is the result of munging the a datetime.date
      into a dict."""
    return {**tweet_to_dict(got_tweet_object), **munge_date(got_tweet_object.date)}

def munge_tweet_objects(tweet_objects):
    return list(map(join_dicts, tweet_objects))


In [6]:
# get a set of tweets
pickled_fn = './r_tweets.pickle'
pickled_path = Path(pickled_fn)

# if a pickled file already exists, unpickle it
if pickled_path.is_file():
    merged_df = pd.read_pickle(pickled_fn)

# if a pickled file does not exist yet, get data then pickle it
else:  
    #  Create object to execute queries
    querySpecs = got.manager.TweetCriteria().setUsername(twitter_user_name).setMaxTweets(count)
   
    print('Retrieving tweets via GOT3')
    # retrieve tweets
    retrieved_tweets = got.manager.TweetManager.getTweets(querySpecs)
    
    tweet_dicts = munge_tweet_objects(retrieved_tweets)
    
#     y_retweets = pd.DataFrame.from_records(tweet_dicts,  columns=['retweets'])
#     y_retweets = y_retweets.fillna(0)

    y_likes = pd.DataFrame.from_records(tweet_dicts,  columns=['likes'])
    y_likes = y_likes.fillna(0)
    
    columns_not_needed = ['id', 'hashtags', 'replies', 'retweets', 'text', 'favorites', 'mentions',
       'to', 'urls', 'year', 'month', 'day', 'date', 'formatted_date', 'permalink', 'username', 'hour', 'minute', 'geo']
    times_df = pd.DataFrame.from_records(tweet_dicts,  exclude=columns_not_needed)

    # create a df of embeddings of the texts
    tweet_texts = [tweet.text for tweet in retrieved_tweets]
    print('retrieving embeddings via basilica')
    with basilica.Connection(API_KEY) as c:
        embeddings = list(c.embed_sentences(tweet_texts))
    
    print("Retrieved " + str(len(embeddings)) + " embeddings.")
    normalized_embeddings = sklearn.preprocessing.normalize(embeddings)
    colnames = ['embed_col' + str(i) for i in range(len(embeddings[0]))]
    normalized_embeddings_df = pd.DataFrame(normalized_embeddings, columns=colnames)  
    
    merged_df     =     pd.merge(times_df, normalized_embeddings_df, right_index=True, left_index=True)

    # pickle the df
    merged_df.to_pickle(pickled_fn)


Retrieving tweets via GOT3
retrieving embeddings via basilica
Retrieved 100 embeddings.


Index(['favorites', 'mentions', 'day_of_week', 'minute_of_day'], dtype='object')

In [8]:
normalized_embeddings_df

Unnamed: 0,embed_col0,embed_col1,embed_col2,embed_col3,embed_col4,embed_col5,embed_col6,embed_col7,embed_col8,embed_col9,...,embed_col758,embed_col759,embed_col760,embed_col761,embed_col762,embed_col763,embed_col764,embed_col765,embed_col766,embed_col767
0,0.015931,-0.009591,0.029823,0.010719,-0.038900,-0.038713,0.026972,0.032410,-0.018273,-0.024048,...,0.002214,-0.001712,0.011461,-0.002155,0.015545,0.014710,-0.031647,-0.012895,0.012229,0.031648
1,0.035164,0.002939,0.031583,0.014191,0.009272,-0.049518,0.027813,0.042434,0.000494,-0.030599,...,0.006243,-0.027250,-0.016545,-0.014361,0.017695,0.014158,-0.038322,0.005454,0.005988,0.040775
2,0.027223,-0.003076,0.022185,0.017983,-0.001771,-0.030092,0.004849,0.037733,0.006084,-0.035582,...,-0.002299,-0.004929,-0.005112,-0.022495,0.012315,-0.000232,-0.055358,-0.014222,0.009414,0.032656
3,0.044184,-0.020054,0.046462,0.018096,0.035194,-0.020867,-0.009356,0.017132,0.005850,-0.016406,...,0.003304,-0.035959,-0.019320,-0.006499,0.000992,0.008922,-0.023195,0.005795,-0.013351,0.052417
4,0.017101,-0.021225,0.032381,0.009887,0.023359,-0.043089,0.006115,0.049163,-0.016186,-0.034511,...,-0.002739,-0.010104,-0.018705,-0.023456,0.027306,0.025214,-0.018806,0.001975,0.015818,0.026857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.020511,-0.011412,0.019663,0.031424,-0.009619,-0.045547,0.011763,0.035282,-0.003485,-0.036165,...,0.005358,-0.005756,-0.021598,-0.015001,0.020247,0.016553,-0.021384,-0.018000,-0.000585,0.041988
96,0.038468,-0.029616,0.046740,0.030350,0.010551,-0.051023,0.009508,0.035810,0.025113,-0.055883,...,0.008719,-0.006314,-0.027820,-0.023555,0.024306,-0.001848,-0.009763,-0.004845,0.001076,0.013059
97,0.036078,-0.021617,0.036502,0.025404,0.016809,-0.037948,0.020509,0.006095,-0.000881,-0.045999,...,0.001761,-0.004897,-0.023133,-0.033046,0.025120,0.003318,-0.009817,-0.022853,0.014583,0.013084
98,0.034544,-0.026042,0.009093,0.024633,0.009390,-0.042280,0.032299,0.013476,0.001875,-0.037165,...,-0.000254,-0.010017,-0.031978,-0.029655,-0.004985,0.023196,-0.009142,-0.028463,0.009120,0.035746


In [None]:
type(y_likes.values[0])

In [None]:
# colnames = ['embed_col' + str(i) for i in range(len(embeddings[0]))]

# # put the normalized embeddings back in a dataframe
# normalized_embeddings_df = pd.DataFrame(normalized_embeddings, columns=colnames)
# # PCA the embeddings
# # sklearn.get_config()
# normalized_embeddings_df.shape

# normalized_embeddings_df.head()

## Split into X matrix (embeddings) and y vector (retweet or like count)

In [None]:
y_likes = merged_df['favorites']
# print(y_likes.shape)
# print(y_likes.isna().value_counts())
type(merged_df['favorites'].values[0])

## Training Regressors

In [None]:
# import numpy as np
# import sklearn.linear_model
# import sklearn.preprocessing
# import sklearn.model_selection

In [None]:
X = merged_df.values

X_train, X_test = sklearn.model_selection.train_test_split(X, random_state=72)


In [None]:
y_retweets_train, y_retweets_test = sklearn.model_selection.train_test_split(y_retweets, random_state=72)

y_likes_train, y_likes_test = sklearn.model_selection.train_test_split(y_likes, random_state=72)

y_likes
# X_train
# y_retweets
# retweets_model = sklearn.linear_model.LogisticRegression(max_iter=100)

# print(type(y_likes_train))
# print(type(y_likes_train.values))
# print(type(y_likes_train.values[0]))
# print(type(X_train.values[0]))
# y_likes_train = pd.DataFrame(y_likes_train)
# _likes_train
# X_train
# retweets_model.fit(X_train, y_likes_train)

In [None]:
likes_model = sklearn.linear_model.LogisticRegression(max_iter=100)
# likes_model.fit(X_train, y_likes_train)
X_train

## Results

In [None]:
print('Retweets Train accuracy: %.3f' % retweets_model.score(X_train, y_retweets_train))
print('Retweets Test accuracy: %.3f' % retweets_model.score(X_test, y_retweets_test))

print('Likes Train accuracy: %.3f' % likes_model.score(X_train, y_likes_train))
print('Likes Test accuracy: %.3f' % likes_model.score(X_test, y_likes_test))

In [None]:
# y_retweets[0:10]

In [None]:
def predict_retweets_one_embedding(embdng):
    """Use model to predict based on one embedding."""
    return retweets_model.predict(embdng)[0]

def predict_retweets_one_by_index(embeddings_array, idx):
    """Use model to predict based on one embedding,
    selected by index from a list of embeddings."""
    return ('predicted retweets', 
            predict_retweets_one_embedding(embeddings_array[idx].reshape(1,-1)), 
            'actual', y_retweets[idx])


def predict_likes_one_embedding(embdng):
    """Use model to predict based on one embedding."""
    return likes_model.predict(embdng)[0]

def predict_likes_one_by_index(embeddings_array, idx):
    """Use model to predict based on one embedding,
    selected by index from a list of embeddings."""
    return  ('predicted likes', 
             predict_likes_one_embedding(embeddings_array[idx].reshape(1,-1)),
             y_likes[idx])
             

foo = merged_df.values
predict_retweets_one_by_index(foo,5)
predict_likes_one_by_index(foo,5)