In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [21]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GRU, LSTM, RNN, SpatialDropout1D

In [22]:
train = pd.read_csv("train.csv")

In [23]:
#drop the id column since the first column is the index anyways
train = train.set_index('id', drop = True)

In [24]:
#I set the max_features to 1500 because my computer couldn't train with all the features
max_features = 1500

In [25]:
#some of the title and author values are not none so classify them as missing instead of null
train[['title', 'author']] = train[['title', 'author']].fillna(value = 'Missing')

#drop any other value that is null, this will only be rows where the text itself is null, so these values are not
#needed
train = train.dropna()
train.isnull().sum()

#count the length of each article by character and append that as a new column
length = []
[length.append(len(str(text))) for text in train['text']]
train['length'] = length

In [26]:
#the articles where the length is less than 50 is mostly nonsense
train['text'][train['length'] < 50]

id
82                                                   
169                                                  
173                                   Guest   Guest  
196            They got the heater turned up on high.
295                                                  
                             ...                     
20350                         I hope nobody got hurt!
20418                                 Guest   Guest  
20431    \nOctober 28, 2016 The Mothers by stclair by
20513                                                
20636                              Trump all the way!
Name: text, Length: 207, dtype: object

In [27]:
#drop those articles
train = train.drop(train['text'][train['length'] < 50].index, axis = 0)
y = train['label'].values

In [28]:
#tokenizer for the text
#num_words is the maximum number of words in the tokenizer, if an article has more than this number of unique 
#words than it drops the ones that haven't yet been seen
#I used None to encode every word it sees

#the oov_token tells the tokenizer what to do when it encounters a word that it hasn't seen before
#this is useful in testing after the model has been trained, in a test article it will change the
#word it hasn't seen befor to <00V> and then encode it, this will keep the length of the article 
#so it doesn't lose too much meaning

#lower converts all the words to lowercase, splits the words by space

#fit_on_texts looks at all the words in the dataset and uses that as the vocab to tokenize

#the filter is the characters to ignore, I may be missing some as I look at the word_index dictionary
#also note it is ignoring "." we may possibly want to encode some punctuation as it does carry some semantic
#meaning? IDK, something to think about...

#texts_to_sequences transforms the list of text to an encoding of each word that appears 
#(Example) text = [1, 3, 4, 2], internal structure of tekenizer contains a dictionary
#dictionary:
# word_index = {
# 'hello' : 1,
# 'you' : 2,
# 'how' : 3,
# 'are' : 4
#}

tokenizer = Tokenizer(num_words = max_features, 
                    filters='!""-#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 
                    lower = True, 
                    split = ' ',
                    oov_token='<00V>')

#for a baller computer
# tokenizer = Tokenizer(num_words = None, 
#                     filters='!""-#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 
#                     lower = True, 
#                     split = ' ',
#                     oov_token='<00V>')

tokenizer.fit_on_texts(texts = train['text'])
X = tokenizer.texts_to_sequences(texts = train['text'])

In [29]:
#key is word, value is encoding of that word
#tokenizer.word_index

In [30]:
#little explanation of X 
#first number in X is 129, which corresponds to the word 'house'
print(X[0][0])
print(tokenizer.word_index['house'])

130
130


In [31]:
#padding the sequence, fills all articles with zeros at the beginning of the article to fit the length
#of the largest sequence of words in the dataset
X = pad_sequences(sequences = X, maxlen = max_features, padding = 'pre')

#for a baller computer
#X = pad_sequences(sequences = X, maxlen = None, padding = 'pre')

In [32]:
#splits the data into train and test, note that this dataframe was only the testing data to begin with
#so this split splits the test data further
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_train.shape

(16443, 1500)

In [33]:
#number of features for each article
#my computer couldn't train with all the features, if you have a GPU version of tensorflow 
#or just an ungodly amount of RAM, set the input_dim = max_num_features and 
#uncomment all the places that are labeled with "for a baller computer"
max_num_features = X_train.shape[0]
print(max_num_features)

16443


In [34]:
#ahhhh the good stuff
#the input layer is the max num features, which is the number of words in the article with
#the most number of words

#ideally embedding will group similar articles together

#then the first LSTM layer, the recurrent_dropout is the fraction of nodes around specific nodes to input
#into this node

#followed by a dropout layer, this randomly sets random nodes to a weight of zero, its meant to prevent
#overfitting and somewhat more closely mimics how the brain learns 
#(certain neurons learn specific things over time and other neurons "dropout" or rewire if they are not
#involved in a certain process)

#then a dense layer with no dropout

#then another dropout

#final dense layer that is the output of the model, 0 for fake 1 for real
lstm_model = Sequential(name = 'lstm_nn_model')
lstm_model.add(layer = Embedding(input_dim = max_features, output_dim = 120, name = '1st_layer'))
lstm_model.add(layer = LSTM(units = 120, dropout = 0.2, recurrent_dropout = 0.2, name = '2nd_layer'))
lstm_model.add(layer = Dropout(rate = 0.5, name = '3rd_layer'))
lstm_model.add(layer = Dense(units = 120,  activation = 'relu', name = '4th_layer'))
lstm_model.add(layer = Dropout(rate = 0.5, name = '5th_layer'))
lstm_model.add(layer = Dense(units = len(set(y)),  activation = 'sigmoid', name = 'output_layer'))

# compiling the model
#loss is objective function

lstm_model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [35]:
#WARNING training the model on my computer took half an hour to run the first time
lstm_model_fit = lstm_model.fit(X_train, y_train, epochs = 1)



KeyboardInterrupt: 