In [None]:
%pip install gensim




In [None]:
import pandas as pd
import gensim
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK's 'punkt' tokenizer resource.
# NOTE(review): presumably needed by word_tokenize on the installed NLTK release — confirm.
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Mount Google Drive so the dataset CSVs stored there become readable
from google.colab import drive
drive.mount('/content/drive')

# Read the fake-news and true-news CSV files into two separate frames
fake, true = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/datasets/UniProject/Fake.csv',
        '/content/drive/MyDrive/datasets/UniProject/True.csv',
    )
)

Mounted at /content/drive


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# Download the NLTK stop-word corpus; its English list is read later when
# the stop_words set used by clean_text is built.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Tag every row with its class before the frames are merged: 0 = Fake, 1 = Real
for frame, label_value in ((fake, 0), (true, 1)):
    frame["label"] = label_value

In [None]:
# Drop unnecessary columns
drop_cols = ["title", "subject", "date"]  # keep only 'text'

# The loop variable is deliberately not named `df`: this notebook later binds
# `df` to the combined dataset, and re-running this cell with a loop variable
# called `df` would silently rebind that name to the raw `true` frame.
for frame in (fake, true):
    # errors="ignore" skips columns that are absent (e.g. already dropped by a
    # previous run), so the cell can be executed more than once.
    frame.drop(columns=drop_cols, errors="ignore", inplace=True)


In [None]:
# Stack both sources into one frame, then shuffle the rows reproducibly
# (fixed seed) and renumber the index from 0.
df = (
    pd.concat([fake, true], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the combined, shuffled frame (columns: text, label).
df.head()

Unnamed: 0,text,label
0,"21st Century Wire says Ben Stein, reputable pr...",0
1,WASHINGTON (Reuters) - U.S. President Donald T...,1
2,(Reuters) - Puerto Rico Governor Ricardo Rosse...,1
3,"On Monday, Donald Trump once again embarrassed...",0
4,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",1


In [None]:
# Download NLTK's 'punkt_tab' tokenizer resource.
# NOTE(review): presumably what word_tokenize needs on recent NLTK releases — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import re
import string
stop_words = set(stopwords.words('english'))

# Patterns are compiled once here instead of being looked up on every call.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>+')
_PUNCTUATION_RE = re.compile(r'[%s]' % re.escape(string.punctuation))
_WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')


def clean_text(text):
    """Normalise a raw article into a space-separated string of content words.

    Steps, in order: lowercase, strip [bracketed] spans, URLs and HTML tags,
    remove ASCII punctuation, turn newlines into spaces, drop every word that
    contains a digit, tokenize with ``word_tokenize`` and remove English
    stop words.

    Args:
        text: The raw text. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as "no text".

    Returns:
        str: The cleaned tokens joined by single spaces. An empty string is
        returned when the input is missing or nothing survives the cleaning.
    """
    # Without this guard str() would turn a missing value into the literal
    # word "none"/"nan", which would then be treated as real article content.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()                        # Lowercase
    text = _BRACKETED_RE.sub('', text)              # Remove text in brackets
    text = _URL_RE.sub('', text)                    # Remove URLs
    text = _HTML_TAG_RE.sub('', text)               # Remove HTML tags
    text = _PUNCTUATION_RE.sub('', text)            # Remove punctuation
    text = text.replace('\n', ' ')                  # Remove newlines
    text = _WORD_WITH_DIGIT_RE.sub('', text)        # Remove words containing numbers
    tokens = word_tokenize(text)                    # Tokenize
    return " ".join(word for word in tokens if word not in stop_words)  # Remove stopwords

In [None]:
# Apply cleaning
# Rows whose text is missing are removed *before* cleaning: once a missing
# value has gone through clean_text it is an ordinary string, so a later
# dropna() can no longer detect it.
# assign() builds a new frame, which avoids writing into the dropna() result.
df = (
    df.dropna(subset=['text'])
      .assign(text=lambda frame: frame['text'].apply(clean_text))
)

In [None]:
# Drop duplicates, NaNs and rows without any text.
# A text that contained only stop words, numbers or punctuation is an empty
# string by now, not NaN, so dropna() alone would keep it as a blank sample.
df = df.drop_duplicates().dropna()
df = df[df["text"].str.strip() != ""].reset_index(drop=True)

In [None]:
# Pull the cleaned texts (X) and their 0/1 labels (y) out as NumPy arrays
X, y = (df[column].values for column in ("text", "label"))

In [None]:
# downloading google news word2vec
# Loads the pretrained "word2vec-google-news-300" vectors through gensim's
# downloader API; the result is used later for membership tests (`word in ...`)
# and vector lookups (`...[word]`).
# NOTE(review): presumably a large download on first use — confirm size and caching.
import gensim.downloader as api
word2vec_model = api.load("word2vec-google-news-300")



In [None]:
# Save word2vec_model to the file "word2vec_model.bin".
# NOTE(review): the path has no directory component, so the file is written
# relative to the current working directory, not into a separate models
# folder — confirm the intended location.
word2vec_model.save("word2vec_model.bin")

In [None]:
import numpy as np

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_VOCAB = 50000   # tokenizer keeps only the most frequent words below this index
MAX_LEN = 300       # since your 95% length ~516, we cap at 300 for efficiency
EMB_DIM = 300       # width of one embedding row

# Learn the vocabulary and encode each text as a fixed-length integer sequence.
# NOTE(review): the tokenizer is fit on all of X, i.e. before the train/test
# split performed later, so test-set vocabulary is seen here — confirm this
# is acceptable.
tokenizer = Tokenizer(num_words=MAX_VOCAB)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(sequences, maxlen=MAX_LEN)

word_index = tokenizer.word_index
print("Vocab size:", len(word_index))

# Create embedding matrix: row i holds the word2vec vector of the word with
# tokenizer index i; rows for words missing from word2vec stay all-zero.
n_embedding_rows = min(MAX_VOCAB, len(word_index) + 1)
embedding_matrix = np.zeros((n_embedding_rows, EMB_DIM))

for token, token_id in word_index.items():
    # Indices at or above MAX_VOCAB have no row in the matrix.
    if token_id < MAX_VOCAB and token in word2vec_model:
        embedding_matrix[token_id] = word2vec_model[token]


Vocab size: 199476


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, Flatten, ReLU, Input
from tensorflow.keras.initializers import Constant
from tensorflow.keras.callbacks import EarlyStopping

# Build ANN model
model = Sequential()

# Declare the input shape explicitly (one padded sequence of MAX_LEN token ids)
# so the model is built right away and summary() can report layer shapes.
model.add(Input(shape=(MAX_LEN,)))

# Embedding layer initialised from the pretrained word2vec matrix and frozen.
# Previously the layer was created without embedding_matrix and then frozen,
# so the network was trained on top of fixed *random* vectors.
# The dimensions are taken from the matrix itself so the layer always matches it.
model.add(Embedding(input_dim=embedding_matrix.shape[0],
                    output_dim=embedding_matrix.shape[1],
                    embeddings_initializer=Constant(embedding_matrix),
                    trainable=False))
model.add(Flatten())

# Hidden Layers with ReLU + Dropout
model.add(Dense(128))
model.add(ReLU())
model.add(Dropout(0.7))

model.add(Dense(64))
model.add(ReLU())
model.add(Dropout(0.7))

# Output Layer: single sigmoid unit giving the probability of label 1 (Real)
model.add(Dense(1, activation="sigmoid"))

# Early stopping: stop after 3 epochs without val_loss improvement and
# roll back to the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Compile
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()




In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the data as the test set ...
X_rest, X_test, y_rest, y_test = train_test_split(
    X_pad, y, test_size=0.2, random_state=42
)
# ... then carve 20% of the remainder off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.2, random_state=42
)

# Train with validation monitoring; early_stop may end training before epoch 10.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=128,
    callbacks=[early_stop],
)


Epoch 1/10
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.6082 - loss: 0.6602 - val_accuracy: 0.8482 - val_loss: 0.3624
Epoch 2/10
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7652 - loss: 0.4198 - val_accuracy: 0.8952 - val_loss: 0.3078
Epoch 3/10
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8189 - loss: 0.3549 - val_accuracy: 0.8586 - val_loss: 0.2787
Epoch 4/10
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8301 - loss: 0.3229 - val_accuracy: 0.8976 - val_loss: 0.2587
Epoch 5/10
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8559 - loss: 0.2923 - val_accuracy: 0.9002 - val_loss: 0.2783
Epoch 6/10
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8530 - loss: 0.2905 - val_accuracy: 0.8996 - val_loss: 0.2409
Epoch 7/10
[1m193/193[0m 

In [None]:
# Check for overfitting: score the model on the held-out test split
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
for caption, value in (("Test Loss:", loss), ("Test Accuracy:", accuracy)):
    print(caption, value)

[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9185 - loss: 0.2016
Test Loss: 0.20295560359954834
Test Accuracy: 0.9183514714241028


In [None]:
# Show the (rows, sequence length) shape of the train, test and validation splits.
X_train.shape, X_test.shape, X_val.shape

((24691, 300), (7716, 300), (6173, 300))

In [None]:
# Show the (rows, columns) shape of the cleaned, de-duplicated frame.
df.shape

(38580, 2)

In [None]:
# Show the train, test and validation split shapes.
# NOTE(review): this repeats an identical shape check made earlier in the notebook.
X_train.shape, X_test.shape, X_val.shape

((24691, 300), (7716, 300), (6173, 300))

In [None]:
def predict_fake_news(text, threshold=0.5):
    """
    Predicts if a given text is fake or real using the trained model.

    The text goes through the same pipeline as the training data: it is
    cleaned with ``clean_text``, encoded with the fitted ``tokenizer`` and
    padded to ``MAX_LEN`` before being passed to ``model``.

    Args:
        text (str): The input text to classify.
        threshold (float): Decision cut-off applied to the model's sigmoid
            output. Scores strictly above it are reported as "Real".
            Defaults to 0.5, which matches the previous fixed behaviour.

    Returns:
        str: "Real" if the model's score is above ``threshold``, otherwise "Fake".

    Raises:
        ValueError: If ``threshold`` is not within [0, 1].
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError("threshold must be between 0 and 1")

    # Clean the text
    cleaned_text = clean_text(text)

    # Tokenize and pad the text (a batch containing this single example)
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, maxlen=MAX_LEN)

    # Predict using the model; [0][0] picks the single score of the single example
    probability_real = model.predict(padded_sequence)[0][0]

    # Return the prediction label
    return "Real" if probability_real > threshold else "Fake"

# Two hand-written samples for a quick sanity check of predict_fake_news:
# a real-life news excerpt ...
real_life_text = "Demonstrators gathered in parts of England including Bristol, Liverpool and London, as well as in Mold in Wales, Perth in Scotland and County Antrim in Northern Ireland. Police stepped in to keep opposing groups separated in many places where anti-racism campaigners mounted counter-demonstrations. It comes after a High Court ruling this week blocked a hotel in Epping, in Essex, from accommodating asylum seekers and some local authorities in England said they were now considering legal challenges."
# ... and a potentially fake news headline.
fake_news_text = "alien invaded to usa  in the midnight where alot of people get hurt and kiilled 233  people."

# Classify the real-life excerpt
prediction = predict_fake_news(real_life_text)
print(f"The text is predicted as: {prediction}")

# Classify the made-up headline
prediction_fake = predict_fake_news(fake_news_text)
print(f"The text is predicted as: {prediction_fake}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step
The text is predicted as: Real
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
The text is predicted as: Fake
