In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import re

In [2]:
# Read the insult-tweet CSV, using its 'Unnamed: 0' column as the row index,
# then display summary statistics.
csv_path = 'trump_insult_tweets_2014_to_2021.csv'
df = pd.read_csv(csv_path, index_col='Unnamed: 0')
df.describe()

Unnamed: 0,date,target,insult,tweet
count,10360,10358,10360,10360
unique,1573,866,6729,5673
top,2020-10-12,the-media,Fake News,"The Fake News Networks, those that knowingly h..."
freq,45,1287,431,16


In [3]:
# Strip links: delete every run of non-whitespace characters that starts with "http".
# Each value is passed through str() first, so non-string cells are stringified.
url_pattern = re.compile(r'http\S+')
df['clean_tweets'] = [url_pattern.sub('', str(text)) for text in df['tweet']]

In [4]:
# Keep each distinct, non-empty cleaned tweet exactly once.
non_empty = df.loc[df['clean_tweets'] != '', 'clean_tweets']
clean_tweets = np.array(non_empty.unique())

In [5]:
# Position within clean_tweets of the shortest tweet, measured in characters.
np.argmin([len(tweet) for tweet in clean_tweets])

5254

In [6]:
# Remove every double-quote character; clean_tweets is rebound to a plain Python list.
clean_tweets = list(map(lambda tweet: tweet.replace('"', ''), clean_tweets))

In [7]:
# Build the word -> integer-index vocabulary from the cleaned tweets.
corpus = clean_tweets
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size: the number of entries in the tokenizer's word index.
total_words = len(tokenizer.word_index)
print(total_words)

10384


In [8]:
# Sanity check: encode the first tweet and print the indices next to the raw text.
first_tweet = corpus[0]
token_list = tokenizer.texts_to_sequences([first_tweet])[0]
print(token_list)
print([first_tweet])

[111, 45, 243, 27, 826, 1530, 4303, 5739, 6, 4304, 36, 706, 380, 19, 5740, 56, 16, 444, 73, 11, 17, 66, 7, 1384, 5741, 1217]
['Can you believe this fool, Dr. Thomas Frieden of CDC, just stated, anyone with fever should be asked if they have been in West Africa DOPE']


In [9]:

# Turn every tweet into (window, next word) training pairs:
#   X_data holds each run of `seq_len` consecutive word indices,
#   y_data holds the word index that immediately follows that run.
# Tweets with `seq_len` or fewer tokens contribute no pairs.
seq_len = 5
X_data = []
y_data = []
for token_list in tokenizer.texts_to_sequences(corpus):
    # `end` is the position of the target word; the window is the seq_len tokens before it.
    for end in range(seq_len, len(token_list)):
        X_data.append(token_list[end - seq_len:end])
        y_data.append(token_list[end])

In [35]:
# Peek at the first three input windows (each one is a list of word indices).
X_data[:3]


False

In [11]:
# Number of (window, next word) training pairs; 157566 with seq_len = 5.
n_patterns = len(X_data)

# Sorted distinct word indices that occur as a prediction target.
target_frame = pd.DataFrame(y_data)
target_frame.iloc[:, 0].sort_values().unique()


array([    1,     2,     3, ..., 10381, 10383, 10384])

In [13]:
# Model input: one scalar feature per timestep, shaped (n_patterns, seq_len, 1).
# Each word index is divided by the vocabulary size, so values lie in (0, 1].
X = np.reshape(X_data, (n_patterns, seq_len, 1))
X = X / float(len(tokenizer.word_index))

# One-hot targets with a FIXED column layout: column k always stands for word
# index k + 1 (word indices start at 1), for k = 0 .. total_words - 1.
#
# The matrix is built directly instead of via pd.get_dummies, because get_dummies
# only creates columns for indices that actually occur in y_data. Back-filling the
# missing ones afterwards appends them at the end, which shifts every later column
# and breaks the "argmax + 1 == word index" decoding used at prediction time.
# Writing into a pre-sized zero matrix also guarantees exactly total_words
# columns, including the last index.
target_columns = np.asarray(y_data, dtype=np.intp) - 1
y = np.zeros((n_patterns, total_words), dtype=np.uint8)
y[np.arange(n_patterns), target_columns] = 1


In [14]:
# Shapes of the training arrays alongside the vocabulary size.
X.shape, y.shape, len(tokenizer.word_index)

((157566, 5, 1), (157566, 10384), 10384)

In [15]:
# Spot check: value in column 1529 of the first target row,
# followed by the length of the first input window.
print(y[0, 1529])
len(X_data[0])

1


5

In [16]:
# Three stacked LSTM layers followed by a softmax over the vocabulary.
# The first two LSTMs return the full sequence so the next LSTM receives one
# vector per timestep; the last LSTM returns only its final state.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    LSTM(256, return_sequences=True),
    LSTM(128),
    # One output unit per vocabulary word, matching the one-hot target rows.
    Dense(total_words, activation='softmax'),
])

In [17]:
# Path of the saved-weights file; the same path string is given to ModelCheckpoint
# in the training-setup cell.
filename = "model_weights_saved.hdf5"
# Disabled by the author: restore previously saved weights and recompile.
#model.load_weights(filename)
#model.compile(loss='categorical_crossentropy', optimizer='adam')

In [18]:
# Save the weights after any epoch whose training loss is the lowest seen so far.
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

# Multi-class next-word prediction: categorical cross-entropy over one-hot targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [19]:
# Train on all pairs; the checkpoint callback writes the best weights to disk.
EPOCHS = 10
BATCH_SIZE = 100
model.fit(X, y, epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=desired_callbacks)

Epoch 1/10

Epoch 00001: loss improved from inf to 6.98056, saving model to model_weights_saved.hdf5
Epoch 2/10

Epoch 00002: loss improved from 6.98056 to 6.79638, saving model to model_weights_saved.hdf5
Epoch 3/10

Epoch 00003: loss improved from 6.79638 to 6.75881, saving model to model_weights_saved.hdf5
Epoch 4/10

Epoch 00004: loss improved from 6.75881 to 6.65263, saving model to model_weights_saved.hdf5
Epoch 5/10

Epoch 00005: loss improved from 6.65263 to 6.55623, saving model to model_weights_saved.hdf5
Epoch 6/10

Epoch 00006: loss improved from 6.55623 to 6.46000, saving model to model_weights_saved.hdf5
Epoch 7/10

Epoch 00007: loss improved from 6.46000 to 6.35955, saving model to model_weights_saved.hdf5
Epoch 8/10

Epoch 00008: loss improved from 6.35955 to 6.26244, saving model to model_weights_saved.hdf5
Epoch 9/10

Epoch 00009: loss improved from 6.26244 to 6.16538, saving model to model_weights_saved.hdf5
Epoch 10/10

Epoch 00010: loss improved from 6.16538 to 6.0

<tensorflow.python.keras.callbacks.History at 0x7f9016da7350>

In [20]:
# A hand-picked window of five word indices, reshaped into the
# (batch, timesteps, features) layout used for the model input.
pattern = [6, 4304, 36, 706, 380]
print(pattern)
x = np.asarray(pattern).reshape(1, len(pattern), 1)
print(x.shape)

[6, 4304, 36, 706, 380]
(1, 5, 1)


In [21]:
# Invert the tokenizer vocabulary: integer index -> word.
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

def sequence_to_text(list_of_indices):
    """Translate word indices back into words.

    Args:
        list_of_indices: iterable of integer word indices.

    Returns:
        A list with one entry per index, in the same order: the word stored
        in ``reverse_word_map`` for that index, or ``None`` when the index
        has no entry.
    """
    return list(map(reverse_word_map.get, list_of_indices))

# Decode the current `pattern` of word indices back into words and display them.
my_texts = sequence_to_text(pattern)
my_texts
# Alternative left disabled by the author:
#tokenizer.sequences_to_texts_generator([pattern])

['of', 'cdc', 'just', 'stated', 'anyone']

In [22]:
import random

In [41]:
# Greedy next-word generation.
# Start from a seed window of five word indices. At each step, predict the most
# likely next word, print it, then slide the window forward by one word.
pattern = [6, 1, 4304, 36, 706]
n_generate = 50  # number of words to generate

# Same divisor that was applied to X before training; float so the division
# is never an integer division. Computed once because it does not change.
vocab_scale = float(len(tokenizer.word_index))

# `out` collects the seed words followed by every generated word
# (each entry is the one-element list returned by sequence_to_text).
out = [sequence_to_text([value]) for value in pattern]
for step in range(n_generate):
    x = np.reshape(pattern, (1, len(pattern), 1)) / vocab_scale
    prediction = model.predict(x, verbose=0)
    # Output positions run 0 .. total_words - 1 while word indices start at 1,
    # hence the + 1 offset.
    index = int(np.argmax(prediction[0])) + 1
    word = sequence_to_text([index])
    print(word)
    out.append(word)
    # Drop the oldest index and append the new one so the window length stays fixed.
    pattern = pattern[1:] + [index]
print(out)


['to']
['the']
['people']
['news']
['media']
['the']
['the']
['united']
['states']
['and']
['the']
['american']
['left']
['hunt']
['hunt']
['the']
['the']
['democrats']
['of']
['the']
['united']
['states']
['and']
['the']
['american']
['left']
['hunt']
['hunt']
['the']
['the']
['democrats']
['of']
['the']
['united']
['states']
['and']
['the']
['american']
['left']
['hunt']
['hunt']
['the']
['the']
['democrats']
['of']
['the']
['united']
['states']
['and']
['the']
[['of'], ['the'], ['cdc'], ['just'], ['stated'], ['to'], ['the'], ['people'], ['news'], ['media'], ['the'], ['the'], ['united'], ['states'], ['and'], ['the'], ['american'], ['left'], ['hunt'], ['hunt'], ['the'], ['the'], ['democrats'], ['of'], ['the'], ['united'], ['states'], ['and'], ['the'], ['american'], ['left'], ['hunt'], ['hunt'], ['the'], ['the'], ['democrats'], ['of'], ['the'], ['united'], ['states'], ['and'], ['the'], ['american'], ['left'], ['hunt'], ['hunt'], ['the'], ['the'], ['democrats'], ['of'], ['the'], ['unite

In [32]:
# Show the softmax output of the most recent model.predict call.
print(prediction)

[[9.5076477e-03 7.0406152e-03 4.3313252e-03 ... 3.4010055e-09
  3.1294920e-09 3.4714198e-09]]


In [None]:
# Look up the word stored under index 2.
sequence_to_text([2])

In [None]:
#sequence_to_text(y_data)
#y_data

In [None]:
# Display the full word -> index mapping.
# NOTE(review): this renders one entry per vocabulary word (total_words entries);
# consider showing only a slice if the output is meant to be saved with the notebook.
tokenizer.word_index