In [None]:
# Mount Google Drive so the dataset and the files written later are reachable from Colab.
from google.colab import drive
drive.mount('/content/drive')

# Work from the project folder so the relative paths used in later cells resolve.
import os
os.chdir("/content/drive/MyDrive/yapay_zeka_video/S-003-Fake-and-Real-News-Detector")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import json
import string
import nltk
# Fetch the NLTK stop-word corpus; in this notebook only the commented-out
# remove_stopwords helper refers to it.
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Dataset source: https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset
# Read both CSV files and tag every row with its class: 0 = fake, 1 = real.
fake_news = pd.read_csv("fake_real_news_dataset/Fake.csv").assign(status=0)
real_news = pd.read_csv("fake_real_news_dataset/True.csv").assign(status=1)

# Stack the two sets, fold the headline into the article body, and keep only
# the model input ("text") and the label ("status").
all_news = pd.concat([fake_news, real_news])
all_news["text"] = all_news["title"] + " " + all_news["text"]
all_news = all_news.drop(columns=["title", "subject", "date"])

# Shuffle with a fixed seed (the later train/validation split is positional),
# then renumber the rows 0..n-1.
data = all_news.sample(frac=1, random_state=42).reset_index(drop=True)

# Release the intermediate frames; only `data` is used from here on.
del fake_news, real_news, all_news
data.head(10)

Unnamed: 0,text,status
0,Ben Stein Calls Out 9th Circuit Court: Committ...,0
1,Trump drops Steve Bannon from National Securit...,1
2,Puerto Rico expects U.S. to lift Jones Act shi...,1
3,OOPS: Trump Just Accidentally Confirmed He Le...,0
4,Donald Trump heads for Scotland to reopen a go...,1
5,Paul Ryan Responds To Dem’s Sit-In On Gun Con...,0
6,AWESOME! DIAMOND AND SILK Rip Into The Press: ...,0
7,STAND UP AND CHEER! UKIP Party Leader SLAMS Ge...,0
8,North Korea shows no sign it is serious about ...,1
9,Trump signals willingness to raise U.S. minimu...,1


In [None]:
# Inspect the data: schema, summary statistics and missing-value counts.
print("\n<======Info======>\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
data.info()
print("\n<======Describe======>\n")
print(data.describe())
print("\n<======NA size======>\n")
print(data.isna().sum())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    44898 non-null  object
 1   status  44898 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 701.7+ KB
None


             status
count  44898.000000
mean       0.477015
std        0.499477
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000


text      0
status    0
dtype: int64


In [None]:
# preprocessing data
def remove_punctuation(text):
  """Return `text` with every character listed in `string.punctuation` deleted.

  Only the ASCII punctuation in `string.punctuation` is removed; any other
  character (e.g. typographic quotes) is left untouched.
  """
  # Mapping a code point to None tells str.translate to drop that character.
  drop_table = dict.fromkeys(map(ord, string.punctuation))
  return text.translate(drop_table)

# TODO: change remove_stopwords

# stop = set(stopwords.words("english"))
# def remove_stopwords(text):
#   filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
#   return " ".join(filtered_words)

# Strip ASCII punctuation from every article before words are counted and tokenized.
data["text"] = data["text"].apply(remove_punctuation)
# data["text"] = data["text"].map(remove_stopwords)

# Count unique words
def counter_word(text_col):
  """Count how often each whitespace-separated token occurs in `text_col`.

  Args:
    text_col: object exposing `.values` (e.g. a pandas Series) whose items are strings.

  Returns:
    A `collections.Counter` mapping token -> number of occurrences.
    Tokens are compared case-sensitively.
  """
  tally = Counter()
  for document in text_col.values:
    tally.update(document.split())
  return tally


# Vocabulary size: number of distinct (case-sensitive) tokens in the whole corpus.
counter = counter_word(data["text"])
num_unique_words = len(counter)

# Positional 80/20 split into training and validation rows.
train_size = int(len(data) * 0.8)
train_data, val_data = data.iloc[:train_size], data.iloc[train_size:]

# Pull the texts and the labels out of each split as NumPy arrays.
train_sentences, train_labels = (train_data[col].to_numpy() for col in ("text", "status"))
val_sentences, val_labels = (val_data[col].to_numpy() for col in ("text", "status"))

# Turn each text into a sequence of integer word ids. The tokenizer is fitted
# on the training texts only, so the validation texts do not shape the vocabulary.
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index  # word -> integer id

train_sequences, val_sequences = (
  tokenizer.texts_to_sequences(sentences)
  for sentences in (train_sentences, val_sentences)
)

# Every sequence is zero-padded or cut at the end to exactly this many ids.
max_length = 600

train_padded, val_padded = (
  pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post")
  for sequences in (train_sequences, val_sequences)
)

# Inverse lookup (integer id -> word), used by decode().
reverse_word_index = {idx: word for word, idx in word_index.items()}

def decode(sequence):
  """Turn a sequence of word ids back into a space-joined string.

  Ids that are missing from `reverse_word_index` are rendered as "?".
  """
  words = (reverse_word_index.get(token_id, "?") for token_id in sequence)
  return " ".join(words)


In [None]:
# Build the classifier: word ids -> 32-dimensional embeddings -> LSTM(128) -> one sigmoid unit.
# NOTE(review): the inputs are post-padded with zeros and the Embedding is not
# given mask_zero=True, so the LSTM also steps over the padding — confirm this is intended.
model = tf.keras.models.Sequential([
  layers.Embedding(num_unique_words, 32, input_length=max_length),
  layers.LSTM(128, dropout=0.2),
  layers.Dense(1, activation="sigmoid"),
])

# The output layer already applies a sigmoid, so the loss is given
# probabilities rather than logits (from_logits=False).
# Original author's reminder, translated from Turkish: "change from_logits=False".
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
optim = tf.keras.optimizers.Adam(learning_rate=0.001)
metrics = ["accuracy"]
model.compile(optimizer=optim, loss=loss, metrics=metrics)

model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 600, 32)           9292480   
                                                                 
 lstm_8 (LSTM)               (None, 128)               82432     
                                                                 
 dense_14 (Dense)            (None, 1)                 129       
                                                                 
Total params: 9,375,041
Trainable params: 9,375,041
Non-trainable params: 0
_________________________________________________________________


In [None]:
# train_labels = tf.one_hot(train_labels, depth=1)
# val_labels = tf.one_hot(val_labels, depth=1)

In [None]:
# Train for 20 epochs with batches of 1024 samples; the validation split is
# passed in so Keras reports validation loss/accuracy alongside the training ones.
model.fit(
  x=train_padded,
  y=train_labels,
  batch_size=1024,
  epochs=20,
  validation_data=(val_padded, val_labels),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f60c2d48bd0>

In [None]:
# Evaluate the trained model on both splits. Each result is [loss, accuracy],
# because the model was compiled with the single "accuracy" metric.
evaluate_data = model.evaluate(train_padded, train_labels, batch_size=2048, verbose=0)
print("Train :", evaluate_data)
# The validation metrics were previously printed under the (misspelled) train
# label, which made the two result lines indistinguishable.
evaluate_data = model.evaluate(val_padded, val_labels, batch_size=2048, verbose=0)
print("Validation :", evaluate_data)

Trian : [0.15426617860794067, 0.9628041386604309]
Trian : [0.185619056224823, 0.9521158337593079]


In [None]:
# Write the trained model to an .h5 file in the current working directory.
model_path = "fake_and_real_news_detector.h5"
model.save(model_path)

In [None]:
# Save the fitted tokenizer so the same word -> id mapping can be reloaded later.
# NOTE(review): Tokenizer.to_json() is documented to return a JSON string, so
# encoding it again stores a JSON string literal in the file. The format is
# kept as-is; confirm the loader does json.load() before tokenizer_from_json().
tokenizer_json = tokenizer.to_json()
with open("tokenizer.json", "w", encoding="utf-8") as out_file:
  json.dump(tokenizer_json, out_file, ensure_ascii=False)