## Attributes
- text: the text of the article; could be incomplete
- clean_news: the article text after preprocessing (lowercased, with punctuation and English stop words removed)
- label: a label that marks the article as potentially unreliable
    - 1: unreliable
    - 0: reliable

In [1]:
import os
import re
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from keras import Sequential
from keras.layers import LSTM, Dropout, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

%matplotlib inline

warnings.filterwarnings('ignore')

In [2]:
def data_process(dataframe, stop_words=None):
    """Return a cleaned copy of the raw news frame with a ``clean_news`` column.

    The ``id``, ``title`` and ``author`` columns are dropped, rows containing
    any missing value are removed, and ``clean_news`` is derived from ``text``
    by lowercasing, stripping every character that is not a letter, digit or
    whitespace, removing stop words and joining the remaining words with
    single spaces. The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``id``, ``title``, ``author`` and ``text``.
    stop_words : iterable of str, optional
        Words to remove. Defaults to NLTK's English stop-word list.

    Returns
    -------
    pandas.DataFrame
        The remaining columns plus ``clean_news``.

    Raises
    ------
    KeyError
        If one of the dropped columns or ``text`` is missing.
    """
    df = dataframe.drop(columns=['id', 'title', 'author']).dropna(axis=0)

    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives O(1) membership tests; the list was scanned for every token.
    stop_words = frozenset(stop_words)

    def _drop_stop_words(text):
        # split()/join also collapses every whitespace run (newlines included)
        # into a single space, so words on adjacent lines stay separate
        # instead of being fused together.
        return " ".join(word for word in text.split() if word not in stop_words)

    clean_news = (
        df['text']
        .str.lower()
        # regex=True is required: since pandas 2.0 str.replace treats the
        # pattern as a literal string by default and would remove nothing.
        .str.replace(r'[^A-Za-z0-9\s]', '', regex=True)
        .apply(_drop_stop_words)
    )

    # assign() returns a new frame, avoiding chained-assignment warnings.
    return df.assign(clean_news=clean_news)

In [3]:
# import training data
# The dataset directory can be overridden with the FAKE_NEWS_DATA_DIR
# environment variable; the original local path remains the fallback.
# Joining with '' guarantees a trailing separator, so code that builds paths
# as `parent_path + <file name>` keeps working.
parent_path = os.path.join(
    os.environ.get(
        'FAKE_NEWS_DATA_DIR',
        '/Users/zch/Desktop/FakeNewsDetector/dataset/fake-news-FromKaggle/',
    ),
    '',
)
# Keep the raw frame under its own name so this cell can be re-run safely.
raw_df = pd.read_csv(os.path.join(parent_path, 'train.csv'))
df = data_process(raw_df)
df.head(2)

Unnamed: 0,text,label,clean_news
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide didnt even see comeys letter ja...
1,Ever get the feeling your life circles the rou...,0,ever get feeling life circles roundabout rathe...


In [4]:
# tokenize text
# Fit the tokenizer on the cleaned articles, then expose its word -> index
# mapping and the number of distinct words it found.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=df['clean_news'])
word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index)

In [5]:
# padding data
# Encode every cleaned article as a list of word indices, then force each
# list to exactly 500 entries, padding and truncating at the end.
sequences = tokenizer.texts_to_sequences(df['clean_news'])
padded_seq = pad_sequences(
    sequences,
    maxlen=500,
    padding='post',
    truncating='post',
)

In [6]:
# create embedding index
# Location of the pre-trained GloVe vectors. Override it with the GLOVE_PATH
# environment variable; the original local path remains the fallback.
glove_path = os.environ.get(
    'GLOVE_PATH',
    '/Users/zch/Downloads/archive/glove.6B.100d.txt',
)

# Maps each word to its vector; each line is read as "<word> <v1> <v2> ...".
embedding_index = {}
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line would otherwise raise IndexError on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

In [7]:
# create embedding matrix
# Row i receives the pre-trained vector of the word whose tokenizer index is
# i. Words missing from embedding_index keep an all-zero row.
# NOTE(review): the extra row (vocab_size+1) presumably covers index 0, which
# word_index is assumed not to use - confirm.
embedding_matrix = np.zeros((vocab_size+1, 100))
for word, i in word_index.items():
    if word in embedding_index:
        embedding_matrix[i] = embedding_index[word]

In [8]:
# Hold out 20% of the samples for evaluation. The split is stratified on the
# label and uses a fixed seed so it is reproducible.
# (train_test_split is imported in the notebook's import cell.)
x_train, x_test, y_train, y_test = train_test_split(
    padded_seq,
    df['label'],
    test_size=0.20,
    random_state=42,
    stratify=df['label'],
)

In [9]:
# Binary classifier: frozen pre-trained embeddings -> LSTM -> dense head.
model = Sequential([
    # Embedding weights come from embedding_matrix and are not trained.
    Embedding(vocab_size+1, 100, weights=[embedding_matrix], trainable=False),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    # Without an activation this layer is purely linear and collapses into
    # the final Dense(1); ReLU makes the hidden layer add capacity.
    Dense(256, activation='relu'),
    # Single sigmoid unit: probability that the article is unreliable (label 1).
    Dense(1, activation='sigmoid')
])

# metrics is passed as a list rather than a bare string; newer Keras releases
# are believed to reject the string form - confirm against the compile docs.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

2022-09-26 18:58:12.628882: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# train the model
# NOTE(review): the held-out test split is also used as validation data here,
# so the reported validation metrics are measured on the test set.
history = model.fit(
    x_train,
    y_train,
    batch_size=256,
    epochs=10,
    validation_data=(x_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# store model
# Writes the trained model to 'm2.h5' in the current working directory.
model.save(filepath='m2.h5')