In [1]:
import spacy
import pandas as pd
import numpy as np
import itertools
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import re

In [2]:
#pip install tensorflow

In [3]:
nlp = spacy.load("en_core_web_md")

In [4]:
df = pd.read_csv('trump_insult_tweets_2014_to_2021.csv',index_col='Unnamed: 0')
df.describe()

Unnamed: 0,date,target,insult,tweet
count,10360,10358,10360,10360
unique,1573,866,6729,5673
top,2020-10-12,the-media,Fake News,"The Fake News Networks, those that knowingly h..."
freq,45,1287,431,16


In [5]:
df['clean_tweets'] = df['tweet'].apply(lambda x: re.sub(r'http\S+', '', str(x)))

In [6]:
clean_tweets = np.array(df['clean_tweets'][df['clean_tweets'] != ''].unique())

In [7]:
np.array([len(tweet) for tweet in clean_tweets]).argmin()

5254

In [8]:
np.sort(np.array([len(tweet) for tweet in df['clean_tweets'].unique()]))

array([  0,   8,  10, ..., 281, 283, 284])

In [9]:
docs = [nlp(x) for x in clean_tweets]
print(docs[:10])

[Can you believe this fool, Dr. Thomas Frieden of CDC, just stated, "anyone with fever should be asked if they have been in West Africa" DOPE, Big time in U.S. today - MAKE AMERICA GREAT AGAIN! Politicians are all talk and no action - they can never bring us back., Politician @SenatorCardin didn't like that I said Baltimore needs jobs & spirit. It's politicians like Cardin that have destroyed Baltimore., For the nonbeliever, here is a photo of @Neilyoung in my office and his $$ request—total hypocrite. , .@Neilyoung’s song, “Rockin’ In The Free World” was just one of 10 songs used as background music. Didn’t love it anyway., Uncomfortable looking NBC reporter Willie Geist calls me to ask for favors and then mockingly smiles when he is told of my high poll numbers, Just out, the new nationwide @FoxNews poll has me alone in 2nd place, closely behind Jeb Bush-but Bush will NEVER Make America Great Again!, The ratings for The View are really low. Nicole Wallace and Molly Sims are a disaste

In [10]:
tot_chars = [list(set(str(doc))) for doc in docs]
tot_chars = list(itertools.chain.from_iterable(tot_chars))
chars = sorted(list(set(tot_chars)))

char_to_num = dict((c, i) for i, c in enumerate(chars))
print(char_to_num)

{'\n': 0, ' ': 1, '!': 2, '"': 3, '#': 4, '$': 5, '%': 6, '&': 7, "'": 8, '(': 9, ')': 10, '+': 11, ',': 12, '-': 13, '.': 14, '/': 15, '0': 16, '1': 17, '2': 18, '3': 19, '4': 20, '5': 21, '6': 22, '7': 23, '8': 24, '9': 25, ':': 26, '=': 27, '?': 28, '@': 29, 'A': 30, 'B': 31, 'C': 32, 'D': 33, 'E': 34, 'F': 35, 'G': 36, 'H': 37, 'I': 38, 'J': 39, 'K': 40, 'L': 41, 'M': 42, 'N': 43, 'O': 44, 'P': 45, 'Q': 46, 'R': 47, 'S': 48, 'T': 49, 'U': 50, 'V': 51, 'W': 52, 'X': 53, 'Y': 54, 'Z': 55, '_': 56, 'a': 57, 'b': 58, 'c': 59, 'd': 60, 'e': 61, 'f': 62, 'g': 63, 'h': 64, 'i': 65, 'j': 66, 'k': 67, 'l': 68, 'm': 69, 'n': 70, 'o': 71, 'p': 72, 'q': 73, 'r': 74, 's': 75, 't': 76, 'u': 77, 'v': 78, 'w': 79, 'x': 80, 'y': 81, 'z': 82, '{': 83, '}': 84, '\x8f': 85, '\x9d': 86, '\xa0': 87, '¡': 88, '¦': 89, 'µ': 90, '·': 91, '¸': 92, 'º': 93, '½': 94, '¿': 95, 'Ä': 96, 'â': 97, 'é': 98, 'ï': 99, 'ð': 100, 'ô': 101, 'ö': 102, 'ù': 103, 'ú': 104, 'Œ': 105, 'Ÿ': 106, 'ž': 107, '˜': 108, '–': 109,

In [11]:
input_len = len(tot_chars)
vocab_len = len(chars)
print(input_len, vocab_len)

193290 121


In [12]:
seq_len = 2

In [13]:
X_data = []
y_data = []

In [14]:
for i in range(0, input_len - seq_len, 1):
    in_seq = tot_chars[i:i + seq_len]

    # Out sequence is the initial character plus total sequence length
    out_seq = tot_chars[i + seq_len]

    # We now convert list of characters to integers based on
    # previously and add the values to our lists
    X_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

In [15]:
# for doc in docs:
#     for j in range(len(doc) - seq_len):
        
#         in_seq = doc[j : j + seq_len]
#         out_seq = doc[j + seq_len]
        
#         X_data = np.append(X_data, [twts_to_num[char] for char in in_seq])
#         y_data.append(doc.vocab.strings[out_seq.text])
        

In [16]:
# doc.vocab.strings[str(y_data[100])]

In [17]:
n_patterns = len(X_data)

In [18]:
n_patterns

193288

In [19]:
X = np.reshape(X_data, (n_patterns, seq_len, 1))
X = X/float(vocab_len)
y = np_utils.to_categorical(y_data)

In [20]:
# X = np.reshape(X_data, (n_patterns, seq_len, 1))
# X = X / float(len(doc.vocab))
# y_dum = pd.get_dummies(y_data)
# cols = y_dum.columns
# y = np.asarray(y_dum)
# print(cols, y)

In [22]:
print(cols)

NameError: name 'cols' is not defined

In [23]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
# model.add(LSTM(256, return_sequences=True))
# model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [24]:
# filename = "model_weights_saved.hdf5"
# model.load_weights(filename)
# model.compile(loss='categorical_crossentropy', optimizer='adam')

In [25]:
model.compile(loss='categorical_crossentropy', optimizer='adam')
filepath = "model_weights_saved4.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]


In [26]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 2, 256)            264192    
_________________________________________________________________
dropout (Dropout)            (None, 2, 256)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 121)               15609     
Total params: 476,921
Trainable params: 476,921
Non-trainable params: 0
_________________________________________________________________


In [27]:
model.fit(X, y, epochs=50, batch_size=128, callbacks=desired_callbacks)

Epoch 1/50

Epoch 00001: loss improved from inf to 3.14287, saving model to model_weights_saved4.hdf5
Epoch 2/50

Epoch 00002: loss improved from 3.14287 to 2.00787, saving model to model_weights_saved4.hdf5
Epoch 3/50

Epoch 00003: loss improved from 2.00787 to 1.71559, saving model to model_weights_saved4.hdf5
Epoch 4/50

Epoch 00004: loss improved from 1.71559 to 1.57954, saving model to model_weights_saved4.hdf5
Epoch 5/50

Epoch 00005: loss improved from 1.57954 to 1.50282, saving model to model_weights_saved4.hdf5
Epoch 6/50

Epoch 00006: loss improved from 1.50282 to 1.44909, saving model to model_weights_saved4.hdf5
Epoch 7/50

Epoch 00007: loss improved from 1.44909 to 1.41471, saving model to model_weights_saved4.hdf5
Epoch 8/50

Epoch 00008: loss improved from 1.41471 to 1.38936, saving model to model_weights_saved4.hdf5
Epoch 9/50

Epoch 00009: loss improved from 1.38936 to 1.36803, saving model to model_weights_saved4.hdf5
Epoch 10/50

Epoch 00010: loss improved from 1.368

<tensorflow.python.keras.callbacks.History at 0x7fc0847330d0>

In [28]:
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [34]:
start = np.random.randint(0, len(X_data) - 1)
pattern = X_data[start]
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" H? "


In [35]:
# pattern = [doc.vocab.strings['We'], doc.vocab.strings['love']]

In [36]:
pattern

[37, 28]

In [37]:
res = []

for i in range(100):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = num_to_char[index]
    seq_in = [num_to_char[value] for value in pattern]

    res.append(result)

    pattern.append(index)
    pattern = pattern[1:len(pattern)]
    
print("".join(res))

mr uvonwTgf.yaDtdkshASpliebCUIc!,Fmr uvonwTgf.yaDtdkshASpliebCUIc!,Fmr uvonwTgf.yaDtdkshASpliebCUIc!


In [38]:
for i in range(3):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(len(doc.vocab))
    
    prediction = model.predict(x, verbose=0)
    print(prediction)
    index = np.argmax(prediction[0])
    print(index)
    result = doc.vocab.strings[cols[index]]
    seq_in = [doc.vocab.strings[value] for value in pattern]
    # CONVERT INDEX TO COL NAME TO RESTORE SPACY HASH
    print(seq_in, "----", result)

    pattern.append(cols[index])
    pattern = pattern[1:len(pattern)]


NameError: name 'doc' is not defined

In [None]:
print(y)