# Imports

In [112]:
# Used to pull data from Reddit
import praw
import pandas as pd
import datetime as dt
import numpy as np
import seaborn as sns
import statsmodels.formula.api
import matplotlib.pyplot as plt
from datetime import datetime


from gensim.models import Word2Vec
from nltk.corpus import brown
import nltk
import tensorflow as tf
import tensorflow_hub as hub
import multiprocessing
from google.cloud import bigquery

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

#nltk.download('punkt')

In [18]:
# Read .sql that pulls comments
fd = open('comments.sql', 'r')
comment_sql = fd.read()
fd.close()

# Put query results into df
comment_df = pd.read_gbq(comment_sql,
                         project_id='w266-240122',
                         dialect='standard')

# Convert date into proper date/time
comment_df['created_dt_tm'] = comment_df['created_utc'].apply(lambda x: dt.datetime.fromtimestamp(x))

# Create field for month
comment_df['created_dt_month'] = comment_df['created_dt_tm'].dt.to_period('M').dt.to_timestamp()

In [4]:
#comment_df.to_csv('reddit_comments_2016_2019.csv')
# TODO: change dtype to string for body
comment_df = pd.read_csv('reddit_comments_2016_2019.csv', parse_dates=['created_dt_month'])

  interactivity=interactivity, compiler=compiler, result=result)


In [20]:
comment_df['body'] = comment_df['body'].astype(str)

In [6]:
# Read .sql that pulls comments
fd = open('posts.sql', 'r')
post_sql = fd.read()
fd.close()

post_df = pd.read_gbq(post_sql,
                      project_id='w266-240122',
                      dialect='standard')

# Convert date into proper date/time
post_df['created_dt_tm'] = post_df['created_utc'].apply(lambda x: dt.datetime.fromtimestamp(x))

# Create field for month
post_df['created_dt_month'] = post_df['created_dt_tm'].dt.to_period('M').dt.to_timestamp()

In [5]:
#post_df.to_csv('reddit_posts_2016_2019.csv')
post_df = pd.read_csv('reddit_posts_2016_2019.csv', parse_dates=['created_dt_month'])

# Functions

In [6]:
def preprocess_text(comment_df, post_df, subreddit, lower=False):
    
    # TODO remove symbols, contractions
    # TODO lowercase words?
    # TODO Treat numbers as something else
    # Add beginning and end of sentence?
    # TODO how do we deal with stop words?
    # UNK token
    
    # Body text from post dataframe
    post_text = list(post_df[(post_df['is_self'] == True) &
                             (~post_df['selftext'].isin(['[removed]', '[deleted]'])) &
                             post_df['subreddit'].isin(subreddit)]['selftext'].unique())
    
    # Add in text from title
    post_text += list(post_df[post_df['is_self'] == True]['title'].unique())
    
    # Add in text from post comments
    post_text += list(comment_df[(comment_df['subreddit'].isin(subreddit)) & 
                                     (~comment_df['body'].isin(['[removed]', '[deleted]']))]['body'])
    
    # Put all text into dataframe and drop dupes
    text_df = pd.DataFrame(post_text, columns=['text'])
    text_df.drop_duplicates(inplace=True)
    
    # Tokenize at sentence level
    text_df['sent_tokenized'] = text_df['text'].apply(nltk.sent_tokenize)
    text_df['sent_count'] = text_df['sent_tokenized'].apply(lambda x: len(x))

    # Put tokenized body into a list
    sent_list = list(text_df[text_df['sent_count'] > 0]['sent_tokenized'].apply(pd.Series).stack().unique())

    # Put list into dataframe
    sent_df = pd.DataFrame(sent_list, columns=['sentence'])
    
    # lowercase
    if lower == True:
        sent_df['sentence'] = sent_df['sentence'].str.lower()

    # Tokenize each sentence at the word level
    sent_df['word_token'] = sent_df['sentence'].apply(nltk.word_tokenize)
    
    return sent_df

In [7]:
def train_embedding(sent_df):
    
    # train word embedding
    embedding = Word2Vec(list(sent_df['word_token']),
                         size=100,
                         window=5, 
                         min_count=5, 
                         negative=15, 
                         iter=10, workers=6)
    
    return embedding

# Data PreProcessing

In [23]:
comment_df['score_sentiment'] = comment_df['score'].apply(lambda x: 'positive' if x > 0 
                                                          else 'negative' if x < 0 
                                                          else 'neutral')

In [24]:
comment_df.score_sentiment.value_counts()

positive    2306098
negative     207636
neutral      163302
Name: score_sentiment, dtype: int64

In [45]:
comment_df = comment_df[['subreddit', 'body', 'created_dt_month', 'score', 'score_sentiment']].copy()
comment_df = comment_df[~comment_df['body'].isin(['[removed]', '[deleted]'])].reset_index(drop=True)

In [152]:
sample_df = comment_df[comment_df['subreddit'].isin(['progressive', 'democrats'])][['body', 'score_sentiment']]
sample_df = sample_df[sample_df['score_sentiment'].isin(['positive', 'negative'])]
sample_df = sample_df.sample(n=5000)
sample_df.reset_index(drop=True, inplace=True)
sample_df['word_tokens'] = sample_df['body'].apply(nltk.word_tokenize)
sample_df['word_count'] = sample_df['word_tokens'].apply(lambda x: len(x))

# Basic EDA

In [139]:
sample_df['word_count'].describe()

count    1000.000000
mean       47.157000
std        68.699738
min         0.000000
25%        12.000000
50%        26.000000
75%        53.000000
max       817.000000
Name: word_count, dtype: float64

# Sample Data

In [153]:
sample_df['score_sentiment'] = sample_df['score_sentiment'].apply(lambda x: 0 if x == 'negative' else 1)

# split into train and test
train_df, test_df = train_test_split(sample_df, test_size=.25)

In [154]:
data_col = 'body'
label_col = 'score_sentiment'
label_list = [0, 1]

# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = train_df.apply(lambda x: bert.run_classifier.InputExample(guid=None,
                                                                                text_a = x[data_col],
                                                                                text_b = None, 
                                                                                label = x[label_col]), axis = 1)

test_InputExamples = test_df.apply(lambda x: bert.run_classifier.InputExample(guid=None, 
                                                                              text_a = x[data_col], 
                                                                              text_b = None, 
                                                                              label = x[label_col]), axis = 1)

In [82]:
# Download BERT to a local directory
!mkdir bert_model

# Download the module, and uncompress it to the destination folder. 
!curl -L "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1?tf-hub-format=compressed" | tar -zxvC bert_model

mkdir: cannot create directory ‘bert_model’: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
./
./saved_model.pb
./variables/
./variables/variables.index
./variables/variables.data-00000-of-00001
100  388M  100  388M    0     0  66.5M      0  0:00:05  0:00:05 --:--:-- 71.5M
./tfhub_module.pb
./assets/
./assets/vocab.txt


In [155]:
def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_model = hub.Module('bert_model')
        tokenization_info = bert_model(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([tokenization_info['vocab_file'],
                                                  tokenization_info['do_lower_case']])

    return bert.tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()

In [156]:
# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 64

# Convert our train and test features to InputFeatures that BERT understands.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, 
                                                                  label_list,
                                                                  MAX_SEQ_LENGTH,
                                                                  tokenizer)

test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples,
                                                                 label_list,
                                                                 MAX_SEQ_LENGTH,
                                                                 tokenizer)

In [157]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels, num_labels):
    """Creates a classification model."""
    bert_module = hub.Module('bert_model', 
                             trainable=True)
    bert_inputs = dict(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids)
    bert_outputs = bert_module(inputs=bert_inputs, signature="tokens", as_dict=True)

    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_outputs" for token-level output.
    output_layer = bert_outputs["pooled_output"]

    hidden_size = output_layer.shape[-1].value

    # Create our own layer to tune for politeness data.
    output_weights = tf.get_variable("output_weights", 
                                     [num_labels, hidden_size],
                                     initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable("output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        
        # Dropout helps prevent overfitting
        output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # Convert labels into one-hot encoding
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
        # If we're predicting, we want predicted labels and the probabiltiies.
        if is_predicting:
            return (predicted_labels, log_probs)

        # If we're train/eval, compute loss between predicted and actual label
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, predicted_labels, log_probs)


In [158]:
# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps, num_warmup_steps):
    """Returns `model_fn` closure for TPUEstimator."""
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)
        
        # TRAIN and EVAL
        if not is_predicting:
            (loss, predicted_labels, log_probs) = create_model(is_predicting,
                                                               input_ids, 
                                                               input_mask, 
                                                               segment_ids,
                                                               label_ids, 
                                                               num_labels)
            train_op = bert.optimization.create_optimizer(loss, 
                                                          learning_rate, 
                                                          num_train_steps,
                                                          num_warmup_steps, 
                                                          use_tpu=False)
            # Calculate evaluation metrics. 
            def metric_fn(label_ids, predicted_labels):
                accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
                f1_score = tf.contrib.metrics.f1_score(
                    label_ids,
                    predicted_labels)
                auc = tf.metrics.auc(
                    label_ids,
                    predicted_labels)
                recall = tf.metrics.recall(
                    label_ids,
                    predicted_labels)
                precision = tf.metrics.precision(
                    label_ids,
                    predicted_labels) 
                true_pos = tf.metrics.true_positives(
                    label_ids,
                    predicted_labels)
                true_neg = tf.metrics.true_negatives(
                    label_ids,
                    predicted_labels)   
                false_pos = tf.metrics.false_positives(
                    label_ids,
                    predicted_labels)  
                false_neg = tf.metrics.false_negatives(
                    label_ids,
                    predicted_labels)
                return {"eval_accuracy": accuracy,
                        "f1_score": f1_score,
                        "auc": auc,
                        "precision": precision,
                        "recall": recall,
                        "true_positives": true_pos,
                        "true_negatives": true_neg,
                        "false_positives": false_pos,
                        "false_negatives": false_neg}

            eval_metrics = metric_fn(label_ids, predicted_labels)

            if mode == tf.estimator.ModeKeys.TRAIN:
                return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
            else:
                return tf.estimator.EstimatorSpec(mode=mode,
                                                  loss=loss,
                                                  eval_metric_ops=eval_metrics)
        else:
            (predicted_labels, log_probs) = create_model(is_predicting,
                                                         input_ids, 
                                                         input_mask,
                                                         segment_ids,
                                                         label_ids, 
                                                         num_labels)

            predictions = {'probabilities': log_probs,
                           'labels': predicted_labels}
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Return the actual model function in the closure
    return model_fn

In [159]:
# Compute train and warmup steps from batch size
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where hte learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

In [160]:
# Compute # train and warmup steps from batch size
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [145]:
!mkdir model_output
OUTPUT_DIR = 'model_output'

mkdir: cannot create directory ‘model_output’: File exists


In [161]:
# Specify outpit directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [162]:
model_fn = model_fn_builder(
  num_labels=len(label_list),
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps)

estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

In [163]:
# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

In [164]:
print(f'Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

Beginning Training!


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
W0722 06:11:01.865738 139670156429120 deprecation.py:323] From /home/armand_kok/anaconda3/lib/python3.7/site-packages/tensorflow/python/training/saver.py:1066: get_checkpoint_mtimes (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file utilities to get mtimes.
W0722 06:12:31.778995 139670156429120 basic_session_run_hooks.py:724] It seems that global step (tf.train.get_global_step) has not been increased. Current value (could be stable): 78 vs previous value: 78. You could increase the global step by passing tf.train.get_global_step() to Optimizer.apply_gradients or Optimizer.minimize.
W0722 06:13:17.794290 139670156429120 basic_session_run_hooks.py:724] It seems that global step (tf.train.get_global_step) has not been increased. Current value

Training took time  0:27:52.699804


In [173]:
test_input_fn = run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

In [174]:
estimator.evaluate(input_fn=test_input_fn, steps=None)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


{'auc': 0.5,
 'eval_accuracy': 0.9208,
 'f1_score': 0.9587671,
 'false_negatives': 0.0,
 'false_positives': 99.0,
 'loss': 0.27497643,
 'precision': 0.9208,
 'recall': 1.0,
 'true_negatives': 0.0,
 'true_positives': 1151.0,
 'global_step': 351}

In [179]:
test_df['score_sentiment'].sum()/test_df.shape[0]

0.9208

In [None]:
def getPrediction(in_sentences):
  labels = ["Negative", "Positive"]
  input_examples = [run_classifier.InputExample(guid="", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, "" is just a dummy label
  input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
  predictions = estimator.predict(predict_input_fn)
  return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]

# Emebddings from Democratic Leaning Subreddits

### Text PreProcessing

In [208]:
# Empty list to store the embeddings
embedding_list = []

# Dates to iterate over
dates = comment_df.created_dt_month.sort_values().unique()[1:]

# Iterate for each month and create embedding for it
for d in dates:
    sent_df = preprocess_text(comment_df[comment_df['created_dt_month'] == d], 
                              post_df[post_df['created_dt_month'] == d],
                              ['democrats', 'progressive'], 
                              lower=False)
    word_embedding = train_embedding(sent_df)
    embedding_list.append(word_embedding)

In [310]:
sent_df = preprocess_text(comment_df, 
                          post_df,
                          ['democrats', 'progressive'], 
                          lower=False)
word_embedding = train_embedding(sent_df)

# ELMo Model

In [None]:
# Initialize elmo
elmo = hub.Module('module_elmo', trainable=True)
embeddings = elmo(['This is an example sentence that I want an embedding for'], signature="default", as_dict=True)['elmo']

# Get word embedding
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    message_embeddings = session.run(embeddings)

# Sentiment Model

In [7]:
def load_lexicon(filename):
    """
    Load a file from Bing Liu's sentiment lexicon
    (https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html), containing
    English words in Latin-1 encoding.
    
    One file contains a list of positive words, and the other contains
    a list of negative words. The files contain comment lines starting
    with ';' and blank lines, which should be skipped.
    """
    lexicon = []
    with open(filename, encoding='latin-1') as infile:
        for line in infile:
            line = line.rstrip()
            if line and not line.startswith(';'):
                lexicon.append(line)
    return lexicon

pos_words = load_lexicon('positive-words.txt')
neg_words = load_lexicon('negative-words.txt')

In [8]:
e = word_embedding
embedding_df = pd.DataFrame(e.wv.vectors)
embedding_df.index = e.wv.index2word

pos_vectors = embedding_df.reindex(index=pos_words).dropna()
neg_vectors = embedding_df.reindex(index=neg_words).dropna()

vectors = pd.concat([pos_vectors, neg_vectors])
targets = np.array([1 for entry in pos_vectors.index] + [-1 for entry in neg_vectors.index])
labels = list(pos_vectors.index) + list(neg_vectors.index)

NameError: name 'word_embedding' is not defined

In [312]:
train_vectors, test_vectors, train_targets, test_targets, train_labels, test_labels = \
train_test_split(vectors, targets, labels, test_size=0.1, random_state=0)
model = SGDClassifier(loss='log', random_state=0, n_iter=100)
model.fit(train_vectors, train_targets)

print(accuracy_score(model.predict(test_vectors), test_targets))

0.7634854771784232




In [315]:
def vecs_to_sentiment(vecs):
    # predict_log_proba gives the log probability for each class
    predictions = model.predict_log_proba(vecs)

    # To see an overall positive vs. negative classification in one number,
    # we take the log probability of positive sentiment minus the log
    # probability of negative sentiment.
    return predictions[:, 1] - predictions[:, 0]


def words_to_sentiment(words):
    vecs = embedding_df.loc[words].dropna()
    log_odds = vecs_to_sentiment(vecs)
    return pd.DataFrame({'sentiment': log_odds}, index=vecs.index)


# Show 20 examples from the test set
words_to_sentiment(test_labels).iloc[:10]

Unnamed: 0,sentiment
sabotage,-2.115922
undocumented,-2.074377
slowly,-0.855885
slaughter,-0.999751
debilitating,0.157101
dirt,-2.300752
mindless,-1.347088
strong,13.202846
nuisance,-0.612164
deceiving,-0.432331


In [316]:
import re
TOKEN_RE = re.compile(r"\w.*?\b")
# The regex above finds tokens that start with a word-like character (\w), and continues
# matching characters (.+?) until the next word break (\b). It's a relatively simple
# expression that manages to extract something very much like words from text.


def text_to_sentiment(text):
    tokens = [token.casefold() for token in TOKEN_RE.findall(text)]
    sentiments = words_to_sentiment(tokens)
    return sentiments['sentiment'].mean()

In [317]:
text_to_sentiment('Trump')

-5.745474703785556

In [324]:
text_to_sentiment('Obama')

-1.1328254916227443

In [325]:
text_to_sentiment('Biden')

-0.2195910751765513

In [350]:
text_to_sentiment('Progressive')

25.283264240040666

In [357]:
text_to_sentiment('Mexican')

0.08479791049252694

In [371]:
e.wv.most_similar('Trump')

[('trump', 0.8514634370803833),
 ('Putin', 0.6282497644424438),
 ('he', 0.5907704830169678),
 ('Trumpski', 0.590567946434021),
 ('tRump', 0.579472541809082),
 ('him', 0.5785919427871704),
 ('Russia', 0.5719641447067261),
 ('Bernie', 0.5676500797271729),
 ('Whitaker', 0.5499702095985413),
 ('Obama', 0.5409168004989624)]

In [419]:
e.wv.most_similar('Donny')

[('tRump', 0.6996694207191467),
 ('wrist', 0.6700129508972168),
 ('stuffed', 0.6529507040977478),
 ('Bitch', 0.6497117280960083),
 ('trumpty', 0.6405702829360962),
 ('Pai', 0.6387597322463989),
 ('graham', 0.6387059688568115),
 ('Ajit', 0.638285219669342),
 ('tRUMP', 0.6369954347610474),
 ('Haley', 0.6360580325126648)]

In [420]:
e.wv.most_similar('Donald')

[('impeaching', 0.6418150067329407),
 ('Impeach', 0.6205568909645081),
 ('Supporters', 0.6122241020202637),
 ('Supporter', 0.6121823787689209),
 ('Tower', 0.5998998880386353),
 ('Organization', 0.5910106897354126),
 ('Pence', 0.5623512864112854),
 ('President', 0.5594371557235718),
 ('Fred', 0.5535253882408142),
 ('administration', 0.5455576777458191)]

### ELMo

In [20]:
# Download ELMo to a local directory
!mkdir module_elmo

# Download the module, and uncompress it to the destination folder. 
!curl -L "https://tfhub.dev/google/elmo/2?tf-hub-format=compressed" | tar -zxvC module_elmo

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0  331M    0   415    0     0    369      0  10d 21h  0:00:01  10d 21h   369x assets/
x saved_model.pb
x tfhub_module.pb
x variables/
x variables/variables.index
100  331M  100  331M    0     0  2054k      0  0:02:45  0:02:45 --:--:-- 2379k 2614kk      0  0:02:59  0:01:26  0:01:33 2755k   0  2087k      0  0:02:42  0:01:43  0:00:59 3369k      0  0:02:41  0:01:51  0:00:50 2300k  0:01:56  0:00:44 2391k0:02:44  0:02:25  0:00:19 2082k



In [None]:
# Initialize elmo
elmo = hub.Module('module_elmo', trainable=True)
embeddings = elmo(['Trump thinks criticism of his misogynistic remarks is a "political correctness" problem. Nope, its a pervasive #sexism problem. #GOPDebate'], signature="default", as_dict=True)['elmo']

# Get word embedding
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    message_embeddings = session.run(embeddings)

Instructions for updating:
Colocations handled automatically by placer.


W0719 22:37:02.743443 4547675584 deprecation.py:323] From /anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0719 22:37:03.826928 4547675584 saver.py:1483] Saver not created because there are no variables in the graph to restore


In [3]:
message_embeddings[0][0]

NameError: name 'message_embeddings' is not defined

# Questions
* Sentiment analysis labeled data
* How would we go about scoring the performance of our models?
    * I proposed to use the scores that are available on the reddit
* Should we train the sentiment classifier at a sentence or post level?
    * Each post may contain multiple sentences
