In [2]:
import ast

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from keras.layers import LSTM, Dense, Embedding, Dropout
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.regularizers import L1L2
from keras.optimizers import Adam
from keras.utils import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split

# Load the two preprocessed corpora (real and fake articles) from CSV files
# located in the current working directory.
ConcatedReel, ConcatedFake = (
    pd.read_csv(csv_name)
    for csv_name in (
        'ConcatenatedReel_preprocessed.csv',
        'ConcatenatedFake_preprocessed.csv',
    )
)


In [3]:
def _to_token_list(cell):
    """Return the tokens stored in one `text_token_lemmatized` cell as a list of str.

    The column is loaded with `pd.read_csv`, so a list that was written to the
    CSV comes back as its *string* representation (e.g. "['foo', 'bar']").
    Calling `' '.join(...)` directly on such a string would insert a space
    between every single character instead of between tokens.

    Handling per input type:
      * list / tuple      -> returned as a list of str (makes re-running the cell safe)
      * str, list literal -> parsed with `ast.literal_eval`
      * any other str     -> split on whitespace
      * anything else     -> [] (e.g. NaN produced by an empty CSV field)
    """
    if isinstance(cell, (list, tuple)):
        return [str(token) for token in cell]
    if not isinstance(cell, str):
        return []
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(token) for token in parsed]
    return cell.split()


# Restore real token lists in `text_token_lemmatized` (later cells pass this
# column to Word2Vec, which expects one list of tokens per document), then
# concatenate each list into a single whitespace-separated string.
for corpus in (ConcatedReel, ConcatedFake):
    corpus['text_token_lemmatized'] = corpus['text_token_lemmatized'].apply(_to_token_list)
    corpus['text_processed'] = corpus['text_token_lemmatized'].apply(' '.join)

# Preview only a handful of rows; dumping thousands of rows bloats the notebook.
ConcatedFake['text_processed'].head()

<class 'pandas.core.series.Series'>
RangeIndex: 28711 entries, 0 to 28710
Series name: text_processed
Non-Null Count  Dtype 
--------------  ----- 
28711 non-null  object
dtypes: object(1)
memory usage: 224.4+ KB


In [4]:
# Hyperparameters shared between Word2Vec, the padding step and the Embedding layer.
EMBEDDING_DIM = 100          # Word2Vec vector size == Embedding output_dim
MAX_SEQUENCE_LENGTH = 1000   # every document is padded/truncated to this many token ids

# Create Word2Vec model on the token lists of both corpora
w2v_model = Word2Vec(
    sentences=ConcatedReel['text_token_lemmatized'].tolist() + ConcatedFake['text_token_lemmatized'].tolist(),
    vector_size=EMBEDDING_DIM,
    min_count=1,
    workers=4,
)

# Tokenize text and convert to sequences (real documents first, fake documents second)
all_texts = ConcatedReel['text_processed'].tolist() + ConcatedFake['text_processed'].tolist()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_texts)
sequences = tokenizer.texts_to_sequences(all_texts)

# Pad sequences to be the same length
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Split data back into real and fake
Reel_padded_sequences = padded_sequences[:len(ConcatedReel)]
Fake_padded_sequences = padded_sequences[len(ConcatedReel):]

# Build the embedding matrix row by row so that row i holds the Word2Vec vector
# of the word whose *tokenizer* id is i. Slicing `w2v_model.wv.vectors` directly
# does not work: its rows follow gensim's own index (`wv.key_to_index`), not
# `tokenizer.word_index`, and its row count need not equal the tokenizer
# vocabulary size. Row 0 (the padding id) and rows of words unknown to Word2Vec
# stay all-zero.
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM), dtype='float32')
for word, token_id in tokenizer.word_index.items():
    if word in w2v_model.wv.key_to_index:
        embedding_matrix[token_id] = w2v_model.wv[word]

# Define LSTM model
# NOTE(review): recurrent_dropout > 0 may prevent Keras from using its fast
# cuDNN LSTM kernel, which could explain very slow epochs - confirm.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH, weights=[embedding_matrix]))
model.add(LSTM(units=50, dropout=0.3, recurrent_dropout=0.5, kernel_regularizer=L1L2(l1=1e-6, l2=1e-5), recurrent_regularizer=L1L2(l1=1e-6, l2=1e-5), bias_regularizer=L1L2(l1=1e-7, l2=1e-6)))
model.add(Dropout(0.3))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(optimizer=Adam(learning_rate=0.005), loss='binary_crossentropy', metrics=['accuracy'])

# Define early stopping and model checkpointing (both watch the validation loss)
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

# Concatenate the real and fake padded sequences and create corresponding labels
# (1 = real, 0 = fake)
all_padded_sequences = np.concatenate([Reel_padded_sequences, Fake_padded_sequences])
all_labels = np.concatenate([np.ones(len(ConcatedReel)), np.zeros(len(ConcatedFake))])

# Split the data into training (80%) and test (20%) sets, preserving the class ratio
X_train, X_test, y_train, y_test = train_test_split(all_padded_sequences, all_labels, test_size=0.2, random_state=5, stratify=all_labels)

# Train the model. Validation uses 20% held out from the *training* data
# (validation_split); the test set is not seen until the final evaluation.
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2, callbacks=[early_stopping, model_checkpoint])

# Evaluate the model on the test set
test_eval = model.evaluate(X_test, y_test)
print('Test accuracy: {}'.format(test_eval[1]))

Epoch 1/5


KeyboardInterrupt: 