# Fake News Detector

In [4]:
import pandas as pd
import numpy as np
import os
import re
from tqdm import tqdm


In [None]:
true_dataset = pd.read_csv('data/True.csv')
fake_dataset = pd.read_csv('data/Fake.csv')

In [9]:
from hazm import Normalizer, SentenceTokenizer, word_tokenize
normalizer = Normalizer()
sent_tokenizer = SentenceTokenizer()

In [None]:


true_dataset['title'] = true_dataset.apply(lambda row: row['title'].lower(), axis=1)
fake_dataset['title'] = fake_dataset.apply(lambda row: row['title'].lower(), axis=1)

true_dataset['text'] = true_dataset.apply(lambda row: row['text'].lower(), axis=1)
fake_dataset['text'] = fake_dataset.apply(lambda row: row['text'].lower(), axis=1)

In [None]:
true_dataset['text']

In [None]:
true_dataset['text'] = true_dataset.apply(lambda row: normalizer.normalize(row['text']), axis=1)

true_dataset['title'] = true_dataset.apply(lambda row: normalizer.normalize(row['title']), axis=1)

fake_dataset['text'] = fake_dataset.apply(lambda row: normalizer.normalize(row['text']), axis=1)

fake_dataset['title'] = fake_dataset.apply(lambda row: normalizer.normalize(row['title']), axis=1)

In [None]:
# true_dataset.to_csv('data/TrueV1.csv', index=False)
# fake_dataset.to_csv('data/FakeV1.csv', index=False)

In [None]:
# true_dataset = pd.read_csv('data/TrueV1.csv')
# fake_dataset = pd.read_csv('data/FakeV1.csv')

In [None]:
def remove_numbers(text):
    return str(re.sub(r'\d+', '', text))

def remove_urls(text):
    return str(re.sub(r'http[s]?://\S+|www\.\S+', '', text))

### Remove num and url

In [None]:
true_dataset['text'] = true_dataset.apply(lambda row: remove_numbers(remove_urls(str(row['text']))), axis=1)

true_dataset['title'] = true_dataset.apply(lambda row: remove_numbers(remove_urls(str(row['title']))), axis=1)

fake_dataset['text'] = fake_dataset.apply(lambda row: remove_numbers(remove_urls(str(row['text']))), axis=1)

fake_dataset['title'] = fake_dataset.apply(lambda row: remove_numbers(remove_urls(str(row['title']))), axis=1)

### Remove puncs

In [None]:
true_dataset['text'] = true_dataset.apply(lambda row: re.sub(r'[^\w\s]', '', row["text"]), axis=1)

true_dataset['title'] = true_dataset.apply(lambda row: re.sub(r'[^\w\s]', '', row["title"]), axis=1)

fake_dataset['text'] = fake_dataset.apply(lambda row: re.sub(r'[^\w\s]', '', row["text"]), axis=1)

fake_dataset['title'] = fake_dataset.apply(lambda row: re.sub(r'[^\w\s]', '', row["title"]), axis=1)

In [None]:
fake_dataset

In [None]:
# true_dataset.to_csv('data/TrueV2.csv', index=False)
# fake_dataset.to_csv('data/FakeV2.csv', index=False)

### Tokenize

In [None]:
# true_dataset['text'] = true_dataset.apply(lambda row: word_tokenize(row['text']), axis=1)

# true_dataset['title'] = true_dataset.apply(lambda row: word_tokenize(row['title']), axis=1)

# fake_dataset['text'] = fake_dataset.apply(lambda row: word_tokenize(row['text']), axis=1)

# fake_dataset['title'] = fake_dataset.apply(lambda row: word_tokenize(row['title']), axis=1)

# Training KNN

In [5]:
# from tensorflow.keras.preprocessing.text import Tokenizer # depracated
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

vectorize_layer = TextVectorization(
    max_tokens=100,  # Maximum number of unique tokens
    standardize='lower_and_strip_punctuation',  # Normalize text
    output_sequence_length=3  # Output sequence length
)


In [6]:
true_dataset = pd.read_csv('data/TrueV2.csv')
fake_dataset = pd.read_csv('data/FakeV2.csv')

In [None]:
true_dataset['label'] = 1
fake_dataset['label'] = 0
dataset = pd.concat([true_dataset, fake_dataset], ignore_index=True)
# dataset.to_csv('data/datasetV1.csv', index=False)

In [10]:
dataset["text"] = dataset["text"].astype(str).apply(sent_tokenizer.tokenize)
dataset["title"] = dataset["title"].astype(str).apply(sent_tokenizer.tokenize)


In [None]:
# depracated
# for series in [dataset['text'], dataset['title']]: 
#     for row in tqdm(series, total=series.shape[0], desc="Fitting tokenizer"):
#         tokenizer.fit_on_texts(row)

In [11]:
# Adapt the layer to the data
for series in [dataset['text'], dataset['title']]: 
    for row in tqdm(series, total=series.shape[0], desc="Fitting tokenizer"):
        vectorize_layer.adapt(row)

Fitting tokenizer: 100%|██████████| 44898/44898 [01:50<00:00, 406.07it/s]
Fitting tokenizer: 100%|██████████| 44898/44898 [01:40<00:00, 445.68it/s]


In [18]:
# Convert text to tokens
sentences_to_tokens_text = []
sentences_to_token_title = []
for series in tqdm(dataset['text'], total=dataset["text"].shape[0], desc="Converting text to tokens"): 
    for row in series:
        sentences_to_tokens_text.append(vectorize_layer(row))

for series in tqdm(dataset['title'], total=dataset["title"].shape[0], desc="Converting text to tokens"):
    for row in series:
        sentences_to_token_title.append(vectorize_layer(row))
    

Converting text to tokens: 100%|██████████| 44898/44898 [01:45<00:00, 424.66it/s]
Converting text to tokens: 100%|██████████| 44898/44898 [01:41<00:00, 442.79it/s]


In [19]:
dataset["tokens_text"] = sentences_to_tokens_text
dataset["tokens_title"] = sentences_to_token_title

In [None]:
dataset["text_sequences"] = dataset["text"].apply(vectorize_layer.texts_to_sequences)
dataset["title_sequences"] = dataset["title"].apply(vectorize_layer.texts_to_sequences)


In [None]:
dataset["padded_text_seqs"] = dataset["text_sequences"].apply(pad_sequences(padding='post'))
dataset["padded_title_seqs"] = dataset["title_sequences"].apply(pad_sequences(padding='post'))

In [None]:
dataset.to_csv('data/datasetV2.csv', index=False)

In [23]:
dataset.to_pickle('data/datasetV2.pkl')