In [1]:
import collections
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import tensorflow as tf

from gensim.models.keyedvectors import KeyedVectors
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers




In [2]:
fake_df = pd.read_csv('Fake.csv')
true_df = pd.read_csv('True.csv')

fake_df['label'] = 0
true_df['label'] = 1

combined_df = pd.concat([fake_df, true_df], ignore_index=True)
combined_df = combined_df.sample(frac=1).reset_index(drop=True)

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,"Billionaire Klarman slams Trump, vows to work ...",NEW YORK (Reuters) - Billionaire hedge fund ma...,politicsNews,"August 3, 2016",1
1,SIX FACTS ABOUT Sheriff Joe Arpaio Case That W...,Attorney Dario Navarro was one of several left...,left-news,"Aug 26, 2017",0
2,StanChart closed accounts linked to South Afri...,LONDON (Reuters) - Standard Chartered closed s...,worldnews,"October 19, 2017",1
3,"With China in mind, Japan, India agree to deep...","GANDHINAGAR, India (Reuters) - The leaders of ...",worldnews,"September 14, 2017",1
4,Clinton leads Trump by 5 points in Reuters/Ips...,NEW YORK (Reuters) - U.S. Democratic presiden...,politicsNews,"August 26, 2016",1


In [4]:
combined_df = combined_df[['text', 'label']]
combined_df.head()

Unnamed: 0,text,label
0,NEW YORK (Reuters) - Billionaire hedge fund ma...,1
1,Attorney Dario Navarro was one of several left...,0
2,LONDON (Reuters) - Standard Chartered closed s...,1
3,"GANDHINAGAR, India (Reuters) - The leaders of ...",1
4,NEW YORK (Reuters) - U.S. Democratic presiden...,1


In [5]:
combined_df.shape

(44898, 2)

In [8]:
train_split, val_split = train_test_split(combined_df, train_size=0.8, random_state=1)

In [9]:
train_reviews = train_split['text']
y_train = train_split['label']

val_reviews = val_split['text']
y_val = val_split['label']

In [10]:
collections.Counter(y_train)

Counter({0: 18808, 1: 17110})

In [11]:
tokenizer = keras.preprocessing.text.Tokenizer(num_words=20000,
                                               filters='0123456789!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                                               lower=True)

In [12]:
tokenizer.fit_on_texts(train_reviews)

In [14]:
# converting the input to a sequence of integers instead of a bag of words
# each word is represneted by a different integer

X_train = tokenizer.texts_to_sequences(train_reviews)
print(X_train[0])

[4635, 1767, 67, 4866, 339, 804, 53, 1767, 1555, 78, 25, 255, 2265, 8, 248, 43, 4419, 2, 666, 1, 111, 5, 174, 19, 880, 96, 2, 1, 1479, 3, 1, 425, 461, 32, 104, 13, 773, 2, 3637, 216, 30, 506, 5196, 262, 7010, 2859, 8, 8415, 1468, 17, 2034, 2, 10546, 16, 1, 555, 8, 851, 9149, 5264, 880, 622, 7100, 38, 19, 303, 69, 13, 157, 4, 266, 3, 129, 712, 9, 1, 4841, 3, 19, 3012, 10, 1420, 142, 2, 207, 587, 16, 104, 865, 2127, 3, 1, 1049, 6, 1, 2957, 3, 752, 2859, 8, 323, 933, 13, 5, 940, 459, 169, 1, 931, 312, 3, 1, 9149, 5264, 2171, 128, 1830, 3971, 761, 68, 2339, 27, 4, 6, 1, 8460, 2859, 5, 459, 1583, 36, 22, 824, 10, 5878, 559, 804, 52, 466, 12, 2859, 87, 1, 165, 13, 17, 24, 577, 415, 25, 4, 529, 18, 3971, 171, 4497, 14, 36, 22, 4406, 5, 7, 13, 36, 256, 1420, 323, 2, 1, 387, 708, 5, 845, 18, 4, 1315, 2, 454, 2, 964, 57, 92, 1767, 8, 2198, 9, 1, 880, 766, 1, 466, 372, 53, 10, 1, 434, 1, 165, 8546, 1, 234, 692, 27, 1, 5, 795, 2, 148, 19, 303, 7, 303, 63, 914, 4, 1952, 2, 2545, 4291, 989, 585, 157

In [16]:
[tokenizer.index_word[x] for x in X_train[0][:5]] , X_train[0][:5]


(['barcelona', 'spain', 'reuters', 'twenty', 'four'],
 [4635, 1767, 67, 4866, 339])

In [28]:
MAX_LEN = 250
X_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=MAX_LEN)

In [32]:
print(X_train[0])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0  2859   366    83   101     7   876     2  4169    19  1420
  1249    82    18    13     5   339     3    19  2291  2207    47   370
    41   164    65  4769     8  2348   213    12     7    55  2859    47
     2  1193   129  1731     6  4769    14    36    22    24 11752     2
  2344    14  4769    11    52     3   429   387    76   141   387   584
    75   805  1731  6434     2  2891   103   318  3506    50    11  5488
     3  2171  1011  4785    13    12    66  1731   805    43   360    44
   127    32  1049    47  1495  2008  2859    17  1977     6     4  1344
     8  1248  3673    10  1086  1064   141    13    36   844     4  2067
     2  1420     2  2143   131   563  4168    19   

In [33]:
X_val = tokenizer.texts_to_sequences(val_reviews)
X_val = keras.preprocessing.sequence.pad_sequences(X_val, maxlen=MAX_LEN)

In [36]:
print(tokenizer.word_index['awesome'])

4823


In [37]:
num_tokens = len(tokenizer.word_index) + 1
embedding_dim = 300

In [38]:
tf.random.set_seed(0)

model = keras.Sequential()

# The 'trainable' property is True by default.
model.add(layers.Embedding(input_dim=num_tokens, 
                           output_dim=embedding_dim, 
                           input_length=MAX_LEN))


model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(128, activation='relu', kernel_initializer=tf.keras.initializers.random_normal(seed=1)))
model.add(layers.Dense(64, activation='relu', kernel_initializer=tf.keras.initializers.random_normal(seed=1)))
model.add(layers.Dense(1, activation='sigmoid', kernel_initializer=tf.keras.initializers.random_normal(seed=1)))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

es_callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(X_train, y_train, epochs=20, batch_size=512, validation_data=(X_val, y_val), callbacks=[es_callback])



Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


In [39]:
def sentiment(reviews):
  seqs = tokenizer.texts_to_sequences(reviews)
  seqs = keras.preprocessing.sequence.pad_sequences(seqs, maxlen=MAX_LEN)
  return model.predict(seqs)


In [43]:
fake_news = 'Scientists have discovered a massive underground city beneath the Sahara Desert, believed to be inhabited by a lost civilization that possesses advanced technology far beyond modern capabilities. This hidden metropolis, which spans hundreds of miles, is said to be powered by a mysterious energy source that can cure all known diseases and generate limitless clean energy. Archaeologists claim that this civilization has been in contact with extraterrestrial beings, who have guided them in developing technologies that defy the laws of physics. Government officials are allegedly keeping this discovery a secret to prevent global panic and protect the powerful technologies from falling into the wrong hands.'
real_news = "The government should publish advice for its departments on engaging with young people, including on TikTok, a group of MPs has said.\n\nThe culture, media and sport committee has been looking into countering disinformation online.\n\nIts call comes despite TikTok currently being banned on government devices due to data security concerns.\n\nAccurate information needs to be communicated in a \"relatable\" way, the MPs say.\n\nThe committee says that countering misinformation is particularly important for young people, who are increasingly turning away from traditional media and towards social media for their information.\n\nIt advises meeting young people \"where they are\" - with 15 to 24 year olds spending around an hour per day on TikTok, according to media regulator Ofcom.\n\nThe report says: \"The Government must have a clear strategy for communicating with young people and adapting to the development of new apps and platforms which appeal to this audience.\"\n\nSome MPs do still use TikTok, despite the fact it is blocked on the Parliament Wi-Fi network.\n\nDefence Secretary Grant Shapps has almost 20,000 followers, though he says he does not have the app on his personal phone.\n\n\"Grant knows that TikTok can be a valuable tool for communicating with his constituents,\" a source close to Mr Shapps told the BBC in September 2023.\n\nThe Ministry of Defence also operates a separate account which has about 17,500 followers.\n\nTikTok is under pressure in many countries over its links to the Chinese state - links it has always denied - with law-makers in the US recently passing legislation saying it should be sold or banned.\n\nIt has though endorsed the committee's findings.\n\n\"We welcome this report's recommendation that the Government should engage with the public on whatever platform they choose to use\", it said in a statement.\n\nThe government has responded to the committee report by saying it makes efforts to \"reach people directly on the platforms they spend the most time on.\"\n\nIt adds that the Online Safety Act, which came into law last year, \"will also help tackle the root cause of disinformation\" by requiring social media companies \"to swiftly remove illegal misinformation and disinformation as soon as they become aware of it.\"\n\nBut the law was criticised at the time by fact-checking service Full Fact, which said it did not go far enough \"to address the way that platforms treat harmful misinformation and disinformation.\"\n\nThe MPs took evidence from over 60 different people prior to publishing their report, including disinformation experts and journalists.\n\nAmong these were BBC journalists Rebecca Skippage and Marianna Spring.\n\nAlso interviewed were financial journalist Martin Lewis, Channel 4 journalist Georgina Lee and the chief executive of Full Fact Will Moy"

In [44]:
print(sentiment([fake_news, real_news]))

[[0.00357881]
 [0.99795884]]


In [46]:
model.evaluate(X_val, y_val)



[0.03347622603178024, 0.9909799695014954]