In [None]:
import opendatasets as od
import pandas as pd
from matplotlib import pyplot as plt
import pickle
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report

from nltk.tokenize import wordpunct_tokenize

import tensorflow as tf

In [None]:
# Fetch the dataset from its Kaggle URL
KAGGLE_DATASET_URL = "https://www.kaggle.com/datasets/hassanamin/textdb3"
od.download(KAGGLE_DATASET_URL)

In [None]:
# Read the CSV into a DataFrame and preview its first rows
NEWS_CSV_PATH = "textdb3/fake_or_real_news.csv"
df = pd.read_csv(NEWS_CSV_PATH)
df.head()

In [None]:
#create helper functions
def convertLabelsToBinary(label):
  """Return 1 when `label` equals "FAKE", and 0 for any other value."""
  return 1 if label == "FAKE" else 0

def removeNewLine(text):
  """Replace line breaks in `text` with single spaces.

  Every run of newline / carriage-return characters becomes one space, so the
  last word of a line and the first word of the following line remain separate
  tokens. Deleting the break without a replacement would glue them together.

  Args:
    text: the string to flatten onto one line.

  Returns:
    `text` with each run of line-break characters replaced by a single space.
  """
  return re.sub(r'[\r\n]+', ' ', text)

def removeSpecialCharacters(text):
  """Drop every character that is neither a word character nor whitespace."""
  non_word_or_space = re.compile(r'[^\w\s]')
  return non_word_or_space.sub('', text)

def cleanText(text):
  """Apply removeNewLine and then removeSpecialCharacters to `text`."""
  return removeSpecialCharacters(removeNewLine(text))

In [None]:

#! create NLP pipeline

def pipeline2(title_matrix, text_matrix, label_matrix, tokenizer, test_size=0.2, vectorizer_method="tfidf", n_gram_n_min=1, n_gram_n_max=1, min_df=0.01 ):
  """Build vectorized train/test features from titles, article texts and labels.

  Args:
    title_matrix: pandas column named 'title', one title per sample.
    text_matrix: pandas column named 'text', one article body per sample.
    label_matrix: pandas column named 'label'; "FAKE" is mapped to 1 and every
      other value to 0.
    tokenizer: callable handed to the vectorizer as its `tokenizer`.
    test_size: share of the data held out for testing (default 0.2).
    vectorizer_method: "BoW" selects CountVectorizer; any other value
      (default "tfidf") selects TfidfVectorizer.
    n_gram_n_min: smallest number of words grouped together for n-grams
      (default 1).
    n_gram_n_max: largest number of words grouped together for n-grams
      (default 1).
    min_df: minimum document frequency passed to the vectorizer (default 0.01,
      i.e. terms used in less than 1% of the fitted documents are dropped).

  Returns:
    Tuple (X_train, X_test, y_train, y_test, vectorizer, X_train.shape) where
    X_train / X_test are dense arrays produced by the fitted vectorizer and
    y_train / y_test are the matching label splits.
  """
  # drop=True keeps the old index from being inserted as an extra 'index' column
  data = pd.concat([title_matrix, text_matrix, label_matrix], axis=1).reset_index(drop=True)
  print("Combined Data")

  # updates labels to values of 0 and 1 where 1 is fake
  data['label'] = data['label'].apply(convertLabelsToBinary)
  print("Updated Labels")

  # overwrites the text column with its cleaned-up version
  data['text'] = data['text'].apply(removeNewLine).apply(removeSpecialCharacters)
  print("Cleaned Up Text")

  # NOTE(review): the title is concatenated as-is; only the body text is cleaned above
  data['words'] = data['title'] + " " + data['text']
  print("Added Title")

  # splitting up the data
  X_train, X_test, y_train, y_test = train_test_split(data['words'], data['label'], test_size=test_size, random_state=42)
  print("Splited Data")

  # performs vectorization
  vectorizer_cls = CountVectorizer if vectorizer_method == "BoW" else TfidfVectorizer
  vectorizer = vectorizer_cls(
    ngram_range=(n_gram_n_min, n_gram_n_max),
    analyzer="word",
    lowercase=True,
    tokenizer=tokenizer,
    stop_words='english',
    min_df=min_df,
  )
  # Fit on the training split only: fitting on train + test would leak the test
  # set's vocabulary and document frequencies into the features.
  vectorizer.fit(X_train)
  X_train = vectorizer.transform(X_train).toarray()
  X_test = vectorizer.transform(X_test).toarray()
  print("Vectorized Data")

  return X_train, X_test, y_train, y_test, vectorizer, X_train.shape

In [None]:
# Run the NLP pipeline on the dataset columns; keep the splits, the fitted
# vectorizer and the training-matrix shape
X_train, X_test, y_train, y_test, vectorizer, input_shape = pipeline2(
    df['title'],
    df['text'],
    df['label'],
    wordpunct_tokenize,
    test_size=0.2,
    vectorizer_method="tfidf",
    n_gram_n_min=1,
    n_gram_n_max=1,
    min_df=0.01,
)

In [None]:
# Creating a Deep Learning Model

# input_shape is X_train.shape, so input_shape[1] is the number of features per
# sample. Keras adds the batch axis itself: the per-sample shape is
# (n_features,). Declaring (None, n_features) would describe a rank-3 input that
# the 2-D training matrix does not match.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(input_shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(4, activation='relu'),
    tf.keras.layers.Dense(2, activation='relu'),
    # single sigmoid unit: output is read as the score for label 1
    tf.keras.layers.Dense(1, activation='sigmoid')
  ])

model.summary()
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Fit the network, passing the held-out split as validation data
EPOCHS_VALUE = 50
history = model.fit(
    X_train,
    y_train,
    epochs=EPOCHS_VALUE,
    validation_data=(X_test, y_test),
)

In [None]:
# Pull the per-epoch metric lists out of the fit history
history_metrics = history.history
acc, val_acc = history_metrics['accuracy'], history_metrics['val_accuracy']
loss, val_loss = history_metrics['loss'], history_metrics['val_loss']
epochs_range = range(EPOCHS_VALUE)

In [None]:
# Plotting Accuracy Across Epochs
# acc, val_acc and epochs_range come from the history-extraction cell; they are
# not re-read here so the curves and the x-axis always share one source.
fig, ax = plt.subplots()
ax.plot(epochs_range, acc, label='Training Accuracy')
ax.plot(epochs_range, val_acc, label='Validation Accuracy')
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.set_title('Training and Validation Accuracy')
ax.legend()
plt.show()

In [None]:
# Plotting Loss Across Epochs
fig, ax = plt.subplots()
ax.plot(epochs_range, loss, label='Training Loss')
ax.plot(epochs_range, val_loss, label='Validation Loss')
ax.set_xlabel("Epochs")
# the axis carries both curves, so it is labelled with the quantity, not one series
ax.set_ylabel("Loss")
ax.set_title('Training and Validation Loss')
ax.legend()
plt.show()

In [None]:
# Predict on the held-out split, then round the model outputs
y_prob = model.predict(X_test)
y_pred = y_prob.round()

In [None]:
# Plotting Model's Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
# Labels are encoded as 0 = REAL, 1 = FAKE; name the axes accordingly so the
# plot matches the class names used in the classification report.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['REAL', 'FAKE'])
disp.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
# Print the classification report with named classes
report = classification_report(y_test, y_pred, target_names=['REAL', 'FAKE'])
print(report)

In [None]:
# Saving the fitted vectorizer and the trained model.
# The `with` blocks close each file handle even if pickling raises; passing a
# bare open(...) to pickle.dump leaves the handle open.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
  pickle.dump(vectorizer, vectorizer_file)
with open('deepLearningModel.pkl', 'wb') as model_file:
  pickle.dump(model, model_file)
model.save('deepLearningModel')

In [None]:
# Save the model again under an .hdf5 filename
HDF5_MODEL_PATH = 'deepLearningModel.hdf5'
model.save(HDF5_MODEL_PATH)