In [19]:
import pandas as pd

from keras.preprocessing import sequence
from keras.preprocessing import text

from tensorflow.python.keras import models
from tensorflow.python.keras import initializers
from tensorflow.python.keras import regularizers

from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.layers import SeparableConv1D
from tensorflow.python.keras.layers import MaxPooling1D
from tensorflow.python.keras.layers import GlobalAveragePooling1D

from sklearn.model_selection import train_test_split

In [7]:
df = pd.read_json('transcripts.json')

In [8]:
df.head()

Unnamed: 0,date,num_tokens,president,transcript
0,2019-09-25,7132,Donald Trump,PRESIDENT TRUMP: Thank you very much. Thank ...
1,2019-09-24,3842,Donald Trump,PRESIDENT TRUMP: Thank you very much. Mr. Pr...
2,2019-02-15,8894,Donald Trump,"THE PRESIDENT: Thank you very much, everybody..."
3,2019-02-05,5144,Donald Trump,"Madam Speaker, Mr. Vice President, Members of ..."
4,2019-01-19,1516,Donald Trump,"THE PRESIDENT: Just a short time ago, I had th..."


In [40]:
train_df = pd.DataFrame()
test_df = pd.DataFrame()

for president in df['president'].unique():
    pres_df = df.loc[df['president'] == president]
    
    if len(pres_df) > 5:
        train, test = train_test_split(pres_df, test_size=0.2)
        train_df = pd.concat([train_df, train])
        test_df = pd.concat([test_df, test])
    
    

In [41]:
tokenizer = text.Tokenizer(num_words = 20000)
tokenizer.fit_on_texts(train_df['transcript'])

In [42]:
x_train = tokenizer.texts_to_sequences(train_df['transcript'])
x_test = tokenizer.texts_to_sequences(test_df['transcript'])

In [44]:
x_train[0]

[3699,
 320,
 35,
 1533,
 320,
 35,
 103,
 136,
 14,
 150,
 11502,
 4,
 10,
 138,
 11502,
 167,
 3,
 54,
 35,
 123,
 7,
 33,
 38,
 1181,
 2780,
 2330,
 725,
 31,
 14,
 150,
 11502,
 885,
 5105,
 3386,
 7591,
 15520,
 15521,
 3,
 4,
 1377,
 62,
 6,
 2197,
 1303,
 3386,
 35,
 15,
 6,
 650,
 2,
 39,
 138,
 1,
 1060,
 23,
 154,
 568,
 99,
 1933,
 39,
 885,
 11,
 4167,
 179,
 99,
 3700,
 98,
 39,
 35,
 412,
 6,
 755,
 178,
 167,
 35,
 412,
 6,
 755,
 660,
 6,
 73,
 480,
 156,
 109,
 284,
 12,
 287,
 2077,
 27,
 2,
 1,
 160,
 8514,
 5,
 315,
 1571,
 2083,
 3820,
 29,
 3099,
 99,
 19,
 1,
 6430,
 1891,
 3,
 27,
 2,
 7,
 704,
 176,
 4,
 287,
 167,
 704,
 176,
 4,
 287,
 7,
 2077,
 3,
 719,
 704,
 176,
 4,
 807,
 99,
 626,
 99,
 202,
 27,
 2,
 35,
 5616,
 440,
 5617,
 43,
 1257,
 90,
 2137,
 78,
 4,
 123,
 2,
 78,
 1814,
 29,
 1953,
 99,
 35,
 146,
 4,
 1257,
 90,
 2137,
 14,
 142,
 45,
 1,
 8027,
 1221,
 4,
 694,
 99,
 1354,
 82,
 1304,
 5,
 1922,
 2,
 1,
 3386,
 5617,
 125,
 167,
 52,
 22,
 1

In [45]:
max_length = len(max(x_train, key = len))

In [46]:
MAX_SEQUENCE_LENGTH = 500

if max_length > MAX_SEQUENCE_LENGTH:
    max_length = MAX_SEQUENCE_LENGTH

In [47]:
x_train = sequence.pad_sequences(x_train, maxlen=max_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_length)

In [48]:
x_train

array([[10322,   116,     7, ...,   103,   136,   167],
       [   22,  2981,   109, ...,    35,   103,   136],
       [  105,   919,  7818, ...,   167,   320,    35],
       ...,
       [10918,    19, 11836, ...,     3,  8942,   621],
       [    0,     0,     0, ...,     4,    10,   229],
       [    0,     0,     0, ...,     9,  1583,  1490]])

In [49]:
tokenizer.word_index

{'the': 1,
 'of': 2,
 'and': 3,
 'to': 4,
 'in': 5,
 'a': 6,
 'that': 7,
 'is': 8,
 'for': 9,
 'be': 10,
 'it': 11,
 'we': 12,
 'our': 13,
 'i': 14,
 'have': 15,
 'this': 16,
 'by': 17,
 'as': 18,
 'with': 19,
 'not': 20,
 'which': 21,
 'are': 22,
 'will': 23,
 'on': 24,
 'has': 25,
 'their': 26,
 'all': 27,
 'or': 28,
 'been': 29,
 'from': 30,
 'but': 31,
 'they': 32,
 'was': 33,
 'at': 34,
 'you': 35,
 'states': 36,
 'government': 37,
 'an': 38,
 'people': 39,
 'united': 40,
 'its': 41,
 '\r': 42,
 'can': 43,
 '\xa0': 44,
 'who': 45,
 'would': 46,
 'so': 47,
 'more': 48,
 'these': 49,
 'no': 50,
 'them': 51,
 'there': 52,
 'any': 53,
 'if': 54,
 'should': 55,
 'now': 56,
 'congress': 57,
 'one': 58,
 'other': 59,
 'those': 60,
 'do': 61,
 'such': 62,
 'country': 63,
 'my': 64,
 'us': 65,
 'were': 66,
 'upon': 67,
 'than': 68,
 'may': 69,
 'president': 70,
 'time': 71,
 'must': 72,
 'great': 73,
 'american': 74,
 'his': 75,
 'new': 76,
 'had': 77,
 'what': 78,
 'he': 79,
 'world': 80,