In [39]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout
from sklearn.preprocessing import LabelEncoder

In [40]:
# Load datasets
df1 = pd.read_csv('news.csv')
df2 = pd.read_csv('IFND.csv', encoding='latin1')
df3 = pd.read_csv('news_dataset.csv', encoding='latin1')

In [41]:
# Rename columns
df2 = df2.rename(columns={'Statement': 'text', 'Label': 'label'})

# Concatenate DataFrames
df = pd.concat([df1[['text', 'label']], df2[['text', 'label']], df3[['text', 'label']]], ignore_index=True)

# Drop duplicates and NaN values
df.drop_duplicates(subset='text', inplace=True)
df.dropna(inplace=True)

In [42]:
df.count()

text     64629
label    64629
dtype: int64

In [43]:
df.drop_duplicates(subset='text', inplace=True)
df.dropna(inplace=True)

In [44]:
df.count()

text     64629
label    64629
dtype: int64

In [45]:
X = df['text']  # Features: news articles
y = df['label']  # Labels: fake or real

In [46]:
# Initialize LabelEncoder and fit on the labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # Convert labels to integers

In [47]:
# Tokenization
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(sequences, maxlen=300)

In [48]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)

In [49]:
# Check the data types of y_train and y_test
print(f"y_train dtype: {y_train.dtype}, unique values: {np.unique(y_train)}")
print(f"y_test dtype: {y_test.dtype}, unique values: {np.unique(y_test)}")

y_train dtype: int32, unique values: [0 1 2 3]
y_test dtype: int32, unique values: [0 1 2 3]


In [50]:
# Step 7: Build LSTM-CNN Model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=100, input_length=300))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=5))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))



In [51]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [52]:
# Train the model with preprocessed data
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/5
[1m1616/1616[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 48ms/step - accuracy: 0.3045 - loss: -6769.6953 - val_accuracy: 0.3324 - val_loss: -64817.8984
Epoch 2/5
[1m1616/1616[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 48ms/step - accuracy: 0.3343 - loss: -104365.4375 - val_accuracy: 0.3354 - val_loss: -232967.2812
Epoch 3/5
[1m1616/1616[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 52ms/step - accuracy: 0.3393 - loss: -288849.4688 - val_accuracy: 0.3443 - val_loss: -481050.0312
Epoch 4/5
[1m1616/1616[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 53ms/step - accuracy: 0.3395 - loss: -557395.6250 - val_accuracy: 0.3395 - val_loss: -735177.8125
Epoch 5/5
[1m1616/1616[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 51ms/step - accuracy: 0.3349 - loss: -849172.7500 - val_accuracy: 0.3382 - val_loss: -1160058.2500


<keras.src.callbacks.history.History at 0x2342b5a5450>

In [53]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.3329 - loss: -1165787.5000
Test Accuracy: 33.82%
