In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
# TF-IDF (term frequency-inverse document frequency) scores how characteristic each term is of a document relative to the rest of the corpus (in our case, the dataset)
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [28]:
# Load the headline dataset from the CSV file in the working directory.
df = pd.read_csv('dataset.csv')
# Only 'title' (the input text) and 'label' (the target) are read by the cells
# below, so drop a row only when one of those two is missing. A bare dropna()
# would also discard usable rows that merely have a gap in some other column.
df = df.dropna(subset=['title', 'label'])

In [29]:
# The project is deliberately small in scope, so cap both the vocabulary size
# and the padded length of every title.
max_words = 5000
max_length = 100

# Build the word index from the titles; num_words limits how many of the most
# frequent words are used when the titles are encoded.
tokenizer = Tokenizer(num_words=max_words)
titles = df['title']
tokenizer.fit_on_texts(titles)

# Encode every title as a list of word indices, then pad them to max_length
# because the network needs all of its inputs to share one shape.
sequences = tokenizer.texts_to_sequences(titles)
X = pad_sequences(sequences, maxlen=max_length)

# One-hot encode the labels.
y = to_categorical(df['label'])

In [30]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Assemble the network in one go: embedding -> dropout -> LSTM -> classifier.
model = Sequential([
    Embedding(max_words, 50, input_length=max_length),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    # Softmax takes a vector of real numbers and turns it into a probability
    # distribution, giving one probability per class.
    # Reference - https://deepai.org/machine-learning-glossary-and-terms/softmax-layer
    Dense(2, activation='softmax'),
])

In [31]:
# Training hyperparameters
batch_size = 32
epochs = 10

# Configure the optimiser, the loss and the metric reported during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training set, holding back 10% of it for validation.
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

# Score the trained model on the held-out test set and report its accuracy.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.7908496856689453


In [32]:
# Run the trained model on headlines it has not seen before.
new_data = ["The news headline (fake or not)"]

# Encode with the tokenizer fitted earlier and pad to the same length as the
# training inputs.
new_data_sequences = tokenizer.texts_to_sequences(new_data)
new_data_padded = pad_sequences(new_data_sequences, maxlen=max_length)
predictions = model.predict(new_data_padded)

# Show the predicted class probabilities for each headline.
print(f'Predicted Probabilities: {predictions}')

# Turn the probabilities into class labels (fake or true): class 1 is chosen
# only when its probability is strictly larger than that of class 0.
predicted_classes = [int(p_one > p_zero) for p_zero, p_one in predictions]
print(f'Predicted Classes: {predicted_classes}')

Predicted Probabilities: [[0.9165709  0.08342909]]
Predicted Classes: [0]
