In [1]:
import zipfile
import os

with zipfile.ZipFile("news.zip", 'r') as zip_ref:
    zip_ref.extractall("data")  # Extract to a folder named "data"


In [15]:
import pandas as pd
import zipfile

# Unzip the file
with zipfile.ZipFile("news.zip", 'r') as zip_ref:
    zip_ref.extractall("news_data")

# Load datasets
fake = pd.read_csv("news_data/Fake.csv")
true = pd.read_csv("news_data/True.csv")

# Add labels: 0 = fake, 1 = real
fake['label'] = 0
true['label'] = 1

# Combine and shuffle
df = pd.concat([fake, true])
df = df.sample(frac=1).reset_index(drop=True)

# Use only 'title' + 'text' for input
df['content'] = df['title'] + " " + df['text']



In [16]:
df.head()

Unnamed: 0,title,text,subject,date,label,content
0,DC CHIEF OF POLICE DENIES CONCEALED WEAPONS PE...,This is a perfect example of how the left is a...,left-news,"Sep 7, 2015",0,DC CHIEF OF POLICE DENIES CONCEALED WEAPONS PE...
1,"A month on, U.S., Niger still disagree on what...",NIAMEY/WASHINGTON (Reuters) - A month after an...,worldnews,"November 3, 2017",1,"A month on, U.S., Niger still disagree on what..."
2,Obama Goes “Full Monty”: Kills Thousands More ...,H1B Visas have been the downfall of thousands ...,Government News,"Jan 1, 2016",0,Obama Goes “Full Monty”: Kills Thousands More ...
3,LOL! CHUCKIE SCHUMER WARNS TRUMP Is “In Troubl...,"The top Democrat in the Senate, Chuck Schumer ...",left-news,"Mar 5, 2017",0,LOL! CHUCKIE SCHUMER WARNS TRUMP Is “In Troubl...
4,"U.S. lifts Sudan sanctions, wins commitment ag...",WASHINGTON (Reuters) - The United States lifte...,worldnews,"October 6, 2017",1,"U.S. lifts Sudan sanctions, wins commitment ag..."


In [9]:
pip install transformers


Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\Library-06\Desktop\tf_env\Scripts\python.exe -m pip install --upgrade pip' command.


In [17]:
import re
import string

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

df['content'] = df['content'].apply(clean_text)



In [10]:
pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
Collecting click
  Downloading click-8.2.1-py3-none-any.whl (102 kB)
Installing collected packages: click, nltk
Successfully installed click-8.2.1 nltk-3.9.1
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\Library-06\Desktop\tf_env\Scripts\python.exe -m pip install --upgrade pip' command.


In [18]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

X = df['content'].values
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Convert to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding
max_length = 500
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')



In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout

model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),
    LSTM(32),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [20]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_pad, y_train,
    validation_data=(X_test_pad, y_test),
    epochs=10,
    batch_size=128,
    callbacks=[early_stop]
)



Epoch 1/10
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m333s[0m 1s/step - accuracy: 0.6958 - loss: 0.5211 - val_accuracy: 0.6224 - val_loss: 0.6183
Epoch 2/10
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m724s[0m 3s/step - accuracy: 0.6874 - loss: 0.5342 - val_accuracy: 0.6167 - val_loss: 0.5779
Epoch 3/10
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1278s[0m 5s/step - accuracy: 0.6893 - loss: 0.5247 - val_accuracy: 0.6989 - val_loss: 0.4997
Epoch 4/10
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1194s[0m 4s/step - accuracy: 0.8398 - loss: 0.3760 - val_accuracy: 0.8986 - val_loss: 0.2944
Epoch 5/10
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m360s[0m 1s/step - accuracy: 0.8789 - loss: 0.3219 - val_accuracy: 0.8945 - val_loss: 0.3172
Epoch 6/10
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m371s[0m 1s/step - accuracy: 0.8878 - loss: 0.3146 - val_accuracy: 0.8570 - val_loss: 0.3506
Epoch 7/10
[1m281/2

In [22]:
# Save model
# Instead of .h5, save with .keras extension:
model.save("models/lstm_fake_news_model.keras")


# Save tokenizer
import pickle
with open('models/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)
