<a href="https://colab.research.google.com/github/raduncc/Network-aware-fake-news-detection-and-mitigation-on-social-media/blob/main/Fake_news_classification_using_model_trained_word_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the libraries

In [None]:
import pandas as pd
from gensim.models import KeyedVectors
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from copy import deepcopy
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import re
import gensim
from tensorflow.keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D, Bidirectional, Dropout, MaxPooling1D
import pickle
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.callbacks import EarlyStopping

# Choose the dataset and load data accordingly

In [None]:
#@title Dataset

# Name of the dataset to work on; later cells compare `ds` against these
# four option strings to pick the CSV file, the columns and the padding length.
ds = 'Fakeddit' #@param ["FNC", "Kaggle", "GossipCop", "Fakeddit"]

In [None]:
# Per-dataset settings: (CSV path, positional index of the text column,
# positional index of the label column).
_DATASET_SOURCES = {
  'FNC': ('/content/drive/MyDrive/Licenta/Datasets/Fake News Corpus dataset/DATE.csv', -1, 2),
  'Kaggle': ('/content/drive/MyDrive/Licenta/Datasets/Kaggle/kaggle_cleanv3.csv', 6, 7),
  'GossipCop': ('/content/drive/MyDrive/Licenta/Datasets/Gossipcop/gossipcop_dataset.csv', 2, -1),
  'Fakeddit': ('/content/drive/MyDrive/Licenta/Datasets/Fakeddit/fakeddit_dataset.csv', 2, -1),
}

if ds not in _DATASET_SOURCES:
  # Fail right here with a clear message rather than with a NameError on
  # X / y in a later cell.
  raise ValueError('Unknown dataset {!r}; expected one of {}'.format(ds, sorted(_DATASET_SOURCES)))

_csv_path, _text_col, _label_col = _DATASET_SOURCES[ds]

# X holds the raw text of every row, y the matching raw labels.
data = pd.read_csv(_csv_path)
X = data.iloc[:, _text_col].values
y = data.iloc[:, _label_col].values

# Preprocessing - convert to lowercase, remove stop words and punctuation marks, and map every word to a unique index for the embedding layer

In [None]:
lemmatizer = WordNetLemmatizer()

# Load the English stop-word list once and keep it as a set: the original
# re-read the list for every single word and then scanned it linearly.
english_stopwords = set(stopwords.words('english'))

# Encode the raw labels as integers 0..n_classes-1.
le = LabelEncoder()
y = le.fit_transform(y)

X_clean = []
for raw_text in X:
  # Keep letters only (everything else becomes a space), lowercase, and split
  # on whitespace.
  words = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()

  # Drop stop words and lemmatize what remains, then store the document as a
  # single space-separated string.
  kept = [lemmatizer.lemmatize(word) for word in words if word not in english_stopwords]
  X_clean.append(' '.join(kept))

In [None]:
# Turn every cleaned document string into its list of word tokens; this
# list-of-lists is the input for both Word2Vec and the Keras Tokenizer.
X = [document.split() for document in X_clean]

# Train word embeddings

In [None]:
# Dimensionality of the word vectors (and of the Keras Embedding layer below).
EMBEDDING_DIM = 100

# gensim 4.0 renamed Word2Vec's `size` argument to `vector_size`; pick the
# keyword that the installed version understands so the cell runs on both.
_w2v_dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

# min_count=1 keeps every word, so each token seen by the Tokenizer later on
# has a vector in w2v_model.wv.
w2v_model = gensim.models.Word2Vec(sentences=X, window=5, min_count=1, **{_w2v_dim_kwarg: EMBEDDING_DIM})

In [None]:
# Build the word -> integer-index vocabulary from the tokenized documents.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

# Replace every document's tokens by their vocabulary indices; from here on
# X is a list of integer sequences rather than a list of word lists.
X = tokenizer.texts_to_sequences(X)

# Number of articles by post-processing length

In [None]:
# Histogram of document lengths (number of tokens after preprocessing), used
# to pick a sensible padding length in the next cell.
article_lengths = list(map(len, X))

plt.hist(article_lengths, bins=500)
plt.xlabel('Length')
plt.ylabel('#articles')
plt.show()

# Equalize the lengths of the articles according to the above histogram

In [None]:
# Fixed sequence length per dataset: 1000 tokens for FNC and Kaggle,
# 10 tokens for GossipCop and Fakeddit.
if ds in ('FNC', 'Kaggle'):
  len_to_pad = 1000
elif ds in ('GossipCop', 'Fakeddit'):
  len_to_pad = 10

# Pad / truncate every sequence to exactly len_to_pad entries. No padding or
# truncating mode is passed, so the Keras defaults for pad_sequences apply.
X = pad_sequences(X, maxlen=len_to_pad)

In [None]:
# One embedding row per vocabulary index, plus one extra row so that the
# largest index in tokenizer.word_index is addressable; rows that are never
# assigned below stay all-zero.
voc_size = 1 + len(tokenizer.word_index)

wm = np.zeros(shape=(voc_size, EMBEDDING_DIM))

# Copy each word's trained Word2Vec vector into the row given by its
# Tokenizer index.
for word, row in tokenizer.word_index.items():
  wm[row] = w2v_model.wv[word]

# Choose the model you want to use for classification

In [None]:
#@title Model

# Architecture to build in the next cell: 'Conv' or 'Stacked'.
mdl = 'Stacked' #@param ["Conv", "Stacked"]

In [None]:
# Both architectures start from a frozen Embedding layer initialised with the
# Word2Vec matrix `wm` and end in a single sigmoid unit trained with binary
# cross-entropy.
if mdl == 'Conv':
  # Conv1D + max-pooling front-end feeding three stacked bidirectional
  # LSTMs with 64 units each.
  model = Sequential([
    Embedding(voc_size, output_dim=EMBEDDING_DIM, weights=[wm], input_length=len_to_pad, trainable=False),
    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(),
    Dense(256),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(64)),
    Dense(128),
    Dense(1, activation='sigmoid'),
  ])
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
elif mdl == 'Stacked':
  # Three stacked bidirectional LSTMs with 128 units each, directly on top
  # of the embeddings.
  model = Sequential([
    Embedding(voc_size, output_dim=EMBEDDING_DIM, weights=[wm], input_length=len_to_pad, trainable=False),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(128)),
    Dense(128),
    Dense(1, activation='sigmoid'),
  ])
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Make train and test splits, train the models and show the results

In [None]:
# Hold out 20% of the samples for the final evaluation. No random_state is
# passed, so the split is not pinned to a seed here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2) 

In [None]:
# Stop when the validation loss has not improved for 5 consecutive epochs.
# restore_best_weights=True rolls the model back to the best epoch's weights;
# without it the model is evaluated and saved with the weights from the last
# epoch before stopping, i.e. several epochs past the best validation loss.
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# 20% of the training data is set aside as the validation set that the
# early-stopping callback monitors.
model.fit(X_train, y_train, validation_split=0.2, epochs=200, callbacks=[es], batch_size=128)

In [None]:
# Sigmoid outputs for the held-out samples, thresholded at 0.5 to obtain
# hard 0/1 class predictions.
predicted_probabilities = model.predict(X_test)
y_pred = (predicted_probabilities >= 0.5).astype("int")

In [None]:
# Accuracy on the test split; as the last expression of the cell, the value
# is displayed as the cell output.
accuracy_score(y_test, y_pred)

In [None]:
# Classification report for the test split, printed with 5 decimal digits.
print(classification_report(y_test, y_pred, digits=5))

# Save the model for use in the website

In [None]:
#@title Do you want to save the model?
# When True, the next cell writes the trained model and the fitted tokenizer
# to Google Drive.
save_mdl = False #@param {type:"boolean"}

In [None]:
if save_mdl:
  # Persist the trained network in HDF5 format.
  model.save('/content/drive/MyDrive/Licenta/model.h5')

  # Persist the fitted tokenizer alongside it. The context manager
  # guarantees the file is flushed and closed; the original left the handle
  # returned by open() unclosed.
  with open('/content/drive/MyDrive/Licenta/tokenizer', "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)