<a href="https://colab.research.google.com/github/22070049/Deep-Learning/blob/main/true_and_fake_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
import pandas as pd
import zipfile
import os

# Unzip the dataset archive into a scratch directory.
# NOTE: this is the Colab upload location; change it if the archive lives elsewhere.
zip_path = '/content/archive (5).zip'
extract_path = '/tmp/fake-news-detection-datasets'

# Fail early with an actionable message instead of a bare traceback from ZipFile.
if not os.path.isfile(zip_path):
    raise FileNotFoundError(
        f"Dataset archive not found at {zip_path!r}. Upload it to the runtime "
        "or point `zip_path` at its actual location before running this cell."
    )

os.makedirs(extract_path, exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Load datasets (folder name, including the space, is what this notebook expects inside the archive)
dataset_dir = os.path.join(extract_path, 'News _dataset')
true_df = pd.read_csv(os.path.join(dataset_dir, 'True.csv'))
fake_df = pd.read_csv(os.path.join(dataset_dir, 'Fake.csv'))

# Add labels
true_df['label'] = 1  # Real news
fake_df['label'] = 0  # Fake news

# Combine datasets
df = pd.concat([true_df, fake_df], ignore_index=True)

# Shuffle with a fixed seed: the later train/validation/test splits are seeded too,
# but they are only repeatable if the row order they start from is repeatable.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

FileNotFoundError: [Errno 2] No such file or directory: '/content/archive (5).zip'

In [None]:
# Display the combined, shuffled DataFrame as the cell's rich output.
df

### 2. Text Preprocessing

In [None]:
!pip install nltk

In [None]:
import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus that stopwords.words() reads below.
nltk.download('stopwords')

# Kept as a set for fast membership checks inside clean_text.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one raw article body before tokenisation.

    The text is lower-cased, every non-word character (regex ``\\W``) is
    turned into a space, and any word found in the notebook-level
    ``stop_words`` set is dropped.

    Args:
        text: Raw article text. Non-string values (``None``, or the float
            ``NaN`` pandas uses for missing cells) are treated as empty
            instead of raising ``AttributeError``.

    Returns:
        The surviving words joined by single spaces; ``''`` if none remain.
    """
    if not isinstance(text, str):
        return ''
    # After the \W substitution the only whitespace left is ' ', and
    # str.split() already collapses runs of it, so no separate \s+ pass is needed.
    words = re.sub(r'\W', ' ', text.lower()).split()
    return ' '.join(w for w in words if w not in stop_words)

# Store a cleaned copy of every article body in a new `clean_text` column (adds the column to `df` in place).
df['clean_text'] = df['text'].apply(clean_text)

In [None]:
# Display the DataFrame again, now including the `clean_text` column.
df

### 3. Tokenization & Padding

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Every encoded article is brought to exactly this many tokens by pad_sequences.
maxlen = 300

# Learn the word -> integer-index mapping from the cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['clean_text'])

# One more than the number of known words, so the embedding table has a row
# for index 0 in addition to every index in word_index.
vocab_size = len(tokenizer.word_index) + 1

# Model inputs: integer sequences of length `maxlen`; targets: the 0/1 labels.
X = pad_sequences(tokenizer.texts_to_sequences(df['clean_text']), maxlen=maxlen)
y = df['label'].values


In [None]:
# Inspection cell: shows the label array. Swap in `vocab_size`, `tokenizer`
# or `X` as the last expression to look at those instead.
y

### 4. Train / Validation / Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 10% of all samples as the final test set.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Step 2: split the remaining 90% into train and validation.
# 0.1111 ~= 1/9 of the remainder, i.e. about 10% of the full dataset, which
# gives an approximately 80/10/10 train/validation/test partition.
# Both calls pass random_state=42, so the partition is repeatable for a given ordering of X and y.
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1111, random_state=42)


### 5.1 Recurrent Models (RNN, BiRNN, LSTM, BiLSTM)

In [None]:
# !pip install --upgrade --force-reinstall tensorflow

In [None]:
# Length of each learned word vector; read by the Embedding layer in build_model.
embedding_dim = 128

In [None]:
import os
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import ModelCheckpoint, CSVLogger

import os
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, LSTM, Bidirectional, Dense, Dropout
from keras.callbacks import ModelCheckpoint, CSVLogger

# Ensure the save directory exists
os.makedirs("saved_models", exist_ok=True)

# Architectures to train; each name is one of the branches handled by build_model.
model_types = ['rnn', 'birnn', 'lstm', 'bilstm']
# Filled by the training loop, keyed by model type:
# histories -> object returned by model.fit, results -> output of model.evaluate.
histories = {}
results = {}

# Build function
def build_model(model_type='rnn'):
    """Build and compile a binary text classifier around one recurrent layer.

    Architecture: Embedding -> recurrent layer (64 units, dropout and
    recurrent dropout 0.3, optionally bidirectional) -> Dropout(0.5) ->
    Dense(1, sigmoid). Reads the notebook-level ``vocab_size``,
    ``embedding_dim`` and ``maxlen``.

    Args:
        model_type: One of ``'rnn'``, ``'birnn'``, ``'lstm'``, ``'bilstm'``.

    Returns:
        A compiled ``Sequential`` model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
            Previously an unknown name fell through every branch and
            silently produced a model with no recurrent layer at all.
    """
    supported = ('rnn', 'birnn', 'lstm', 'bilstm')
    if model_type not in supported:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported}"
        )

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))

    # 'rnn'/'birnn' use a SimpleRNN cell, 'lstm'/'bilstm' an LSTM cell;
    # the 'bi' prefix wraps the chosen cell in Bidirectional.
    cell_cls = SimpleRNN if model_type.endswith('rnn') else LSTM
    recurrent = cell_cls(64, dropout=0.3, recurrent_dropout=0.3)
    if model_type.startswith('bi'):
        recurrent = Bidirectional(recurrent)
    model.add(recurrent)

    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model



In [None]:
import os
from keras.callbacks import ModelCheckpoint, CSVLogger

# Create directory for saving models (exist_ok=True makes this safe to re-run)
os.makedirs("saved_models", exist_ok=True)


In [None]:
# Train every architecture listed in `model_types` in turn.
# NOTE: `model`, `history` and `scores` are notebook-level names; after the loop
# they refer to the LAST architecture trained.
for mtype in model_types:
    print(f"\n🔁 Training model: {mtype.upper()}")

    model = build_model(mtype)

    # Callbacks
    # save_best_only=False: a checkpoint is written every time the callback fires
    # (not only on improvement); epoch number and validation accuracy are
    # embedded in the file name.
    checkpoint = ModelCheckpoint(
        filepath=f'saved_models/{mtype.upper()}_epoch_{{epoch:02d}}_valacc_{{val_accuracy:.4f}}.keras',
        monitor='val_accuracy',
        save_best_only=False,
        verbose=1
    )
    # append=True: re-running this cell adds rows to an existing log file
    # rather than starting a fresh one.
    csv_logger = CSVLogger(f'{mtype.upper()}_training_log.csv', append=True)

    # Train the model
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=5,
        batch_size=64,
        callbacks=[checkpoint, csv_logger],
        verbose=1
    )

    # Validation-set evaluation of the weights left after the final epoch.
    scores = model.evaluate(X_val, y_val, verbose=0)
    results[mtype] = scores
    histories[mtype] = history


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, average_precision_score, jaccard_score

# Score the held-out test set with whatever `model` currently refers to
# (straight after the training loop that is the last architecture trained).
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)  # threshold the sigmoid outputs at 0.5

# Scalar metrics: average precision uses the raw scores, Jaccard the hard labels.
mAP = average_precision_score(y_test, y_pred)
iou = jaccard_score(y_test, y_pred_binary)

# Emit the four reports in a fixed order: matrix, per-class report, mAP, IoU.
for report in (
    confusion_matrix(y_test, y_pred_binary),
    classification_report(y_test, y_pred_binary),
    f"mAP: {mAP:.4f}",
    f"IoU: {iou:.4f}",
):
    print(report)


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from keras.models import load_model
import matplotlib.pyplot as plt
import seaborn as sns

# Checkpoint file stems (without the ".keras" suffix) to evaluate on the test set.
# They live under their own names on purpose: rebinding `model_types` / `histories`
# here would overwrite the architecture list and the fit histories, so re-running
# the training cell afterwards would pass these file stems to build_model.
checkpoint_names = ['RNN_epoch_05_valacc_0.8693', 'BIRNN_epoch_05_valacc_0.6452', 'LSTM_epoch_05_valacc_0.9857', 'BILSTM_epoch_05_valacc_0.9869']
test_metrics = {}  # checkpoint name -> dict of test-set metrics

# Load and evaluate each checkpoint
for model_type in checkpoint_names:
    print(f"\nEvaluating {model_type} model...\n")
    # Example resolved path:
    # /kaggle/input/all_model/keras/default/1/BILSTM_epoch_05_valacc_0.9869.keras
    model_path = f'/kaggle/input/all_model/keras/default/1/{model_type}.keras'  # Adjust if the checkpoints are stored elsewhere
    model = load_model(model_path)

    # Predict on the held-out test split and threshold the sigmoid output at 0.5
    y_pred = model.predict(X_test)
    y_pred_class = (y_pred > 0.5).astype(int)

    # Calculate metrics (positive class = label 1, i.e. real news)
    accuracy = accuracy_score(y_test, y_pred_class)
    precision = precision_score(y_test, y_pred_class)
    recall = recall_score(y_test, y_pred_class)
    f1 = f1_score(y_test, y_pred_class)
    conf_matrix = confusion_matrix(y_test, y_pred_class)

    # Print out the results
    print(f'Accuracy for {model_type}: {accuracy}')
    print(f'Precision for {model_type}: {precision}')
    print(f'Recall for {model_type}: {recall}')
    print(f'F1-score for {model_type}: {f1}')

    # Plot confusion matrix ("Negative" = label 0 / fake, "Positive" = label 1 / real)
    plt.figure(figsize=(6, 4))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["Negative", "Positive"], yticklabels=["Negative", "Positive"])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix for {model_type}')
    plt.show()

    # Keep the metrics for later comparison across checkpoints
    test_metrics[model_type] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'conf_matrix': conf_matrix
    }

# At this point, you have evaluated all models and plotted their confusion matrices.


# Task 5: Text Style Analysis

### ✍️ Feature Engineering
### Word Frequency: CountVectorizer or TfidfVectorizer

### Sentence Structure: Avg sentence length, punctuation frequency

### Sentiment: TextBlob or VADER

### Readability: Flesch-Kincaid from textstat

In [None]:
!pip install textstat

In [None]:
!pip install textblob

In [None]:
import nltk
# Download NLTK's 'punkt' tokenizer data
# (presumably needed by TextBlob's word/sentence splitting below — TODO confirm).
nltk.download('punkt')

In [None]:
# from textblob import TextBlob
# import textstat

# def extract_features(text):
#     blob = TextBlob(text)
#     sentiment = blob.sentiment.polarity
#     subjectivity = blob.sentiment.subjectivity
#     readability = textstat.flesch_reading_ease(text)
#     return pd.Series([sentiment, subjectivity, readability])

import textstat
from textblob import TextBlob

def extract_text_features(text):
    """Compute six stylistic features for a single document.

    Returns a pandas Series (default positional index) holding, in order:
    word count, average word length, average sentence length in words,
    TextBlob polarity, TextBlob subjectivity and the textstat Flesch
    reading-ease score. Both averages fall back to 0 when their
    denominator is zero.
    """
    parsed = TextBlob(text)
    tokens = parsed.words
    n_words = len(tokens)
    n_sentences = len(parsed.sentences)

    if n_words:
        mean_word_len = sum(map(len, tokens)) / n_words
    else:
        mean_word_len = 0

    if n_sentences:
        mean_sentence_len = n_words / n_sentences
    else:
        mean_sentence_len = 0

    mood = parsed.sentiment
    features = [
        n_words,
        mean_word_len,
        mean_sentence_len,
        mood.polarity,
        mood.subjectivity,
        textstat.flesch_reading_ease(text),
    ]
    return pd.Series(features)

# Style features are extracted from the ORIGINAL article text, not `clean_text`:
# clean_text has had every non-word character replaced by a space and its stop
# words removed, so sentence boundaries are gone and sentence-length /
# readability values computed from it do not describe the writing style.
# It also keeps training consistent with inference, which featurises raw text.
df_features = df['text'].apply(extract_text_features)
df_features.columns = [
    'word_count', 'avg_word_len', 'avg_sent_len',
    'polarity', 'subjectivity', 'readability'
]

# Add label (aligned on the shared index with `df`)
df_features['label'] = df['label']


In [None]:
# Display the per-article style features together with their labels.
df_features

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Prepare data: six style features as inputs, the 0/1 label as target
X_feat = df_features.drop('label', axis=1)
y_feat = df_features['label']

X_train_feat, X_test_feat, y_train_feat, y_test_feat = train_test_split(X_feat, y_feat, test_size=0.2, random_state=42)

# Normalize features (statistics are fitted on the training split only,
# then re-used unchanged for the test split)
scaler = StandardScaler()
X_train_feat = scaler.fit_transform(X_train_feat)
X_test_feat = scaler.transform(X_test_feat)

# Models
models = {
    'Logistic Regression': LogisticRegression(),
    # Seeded like every split in this notebook so the forest is repeatable.
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier()
}

# The loop variable is `clf`, not `model`, so the notebook-level Keras `model`
# used by the evaluation cells is not silently replaced by a scikit-learn estimator.
for name, clf in models.items():
    print(f"\n🧠 Training {name}...")
    clf.fit(X_train_feat, y_train_feat)
    preds = clf.predict(X_test_feat)
    print(confusion_matrix(y_test_feat, preds))
    print(classification_report(y_test_feat, preds))


In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

# Reshape for LSTM: (n_samples, n_features) -> (n_samples, n_features, 1),
# i.e. each feature is presented to the LSTM as one timestep with a single channel.
# NOTE(review): the features have no natural ordering, so treating them as a
# sequence is a modelling choice worth revisiting.
X_train_feat_lstm = X_train_feat.reshape((X_train_feat.shape[0], X_train_feat.shape[1], 1))
X_test_feat_lstm = X_test_feat.reshape((X_test_feat.shape[0], X_test_feat.shape[1], 1))

# Build LSTM model: LSTM(64) -> Dropout -> Dense(32, relu) -> Dropout -> sigmoid output
lstm_model = Sequential([
    LSTM(64, input_shape=(X_train_feat_lstm.shape[1], 1), dropout=0.3, recurrent_dropout=0.3),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train
# validation_data is the TEST split, so the val_* numbers reported during
# training are test metrics rather than an independent validation signal.
lstm_model.fit(X_train_feat_lstm, y_train_feat, epochs=50, batch_size=32, validation_data=(X_test_feat_lstm, y_test_feat))

# Evaluate: threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels
dl_preds = lstm_model.predict(X_test_feat_lstm)
dl_preds_binary = (dl_preds > 0.5).astype(int)

from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test_feat, dl_preds_binary))
print(classification_report(y_test_feat, dl_preds_binary))

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Number of input features (columns of the scaled training matrix)
input_shape = X_train_feat.shape[1]

# Simple feed-forward NN (we can later wrap it into LSTM if needed):
# Dense(64, relu) -> Dropout -> Dense(32, relu) -> Dropout -> sigmoid output
dl_model = Sequential([
    Dense(64, input_dim=input_shape, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

dl_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# validation_data is the TEST split, so val_* metrics shown during training are test metrics.
dl_model.fit(X_train_feat, y_train_feat, epochs=50, batch_size=32, validation_data=(X_test_feat, y_test_feat))

# Evaluate: threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels
# (this rebinds dl_preds / dl_preds_binary from the LSTM cell above)
dl_preds = dl_model.predict(X_test_feat)
dl_preds_binary = (dl_preds > 0.5).astype(int)
print(confusion_matrix(y_test_feat, dl_preds_binary))
print(classification_report(y_test_feat, dl_preds_binary))

In [None]:
# dl_model is NOT rebuilt in this cell, so this call trains the same model
# object for a further 50 epochs; every re-run of the cell adds 50 more.
dl_model.fit(X_train_feat, y_train_feat, epochs=50, batch_size=32, validation_data=(X_test_feat, y_test_feat))

# Evaluate the further-trained model on the test split
dl_preds = dl_model.predict(X_test_feat)
dl_preds_binary = (dl_preds > 0.5).astype(int)
print(confusion_matrix(y_test_feat, dl_preds_binary))
print(classification_report(y_test_feat, dl_preds_binary))

In [None]:
# Save the dense style-feature model to the current working directory.
# NOTE: only the network is saved; the fitted `scaler` that produced its
# training inputs is not persisted by this cell.
dl_model.save("style_feature_dl_model.keras")

In [None]:
from keras.models import load_model

# Load the saved model back from the file written by the save cell
# (`loaded_model` is not referenced again in the visible notebook).
loaded_model = load_model("style_feature_dl_model.keras")

In [None]:
import nltk
nltk.download('punkt')
import textstat
from textblob import TextBlob
import pandas as pd
import numpy as np
from keras.models import load_model

# Load the trained model from the attached Kaggle model input.
# This rebinds the notebook-level name `model`, which predict_fake_real reads.
model = load_model("/kaggle/input/style_feature_dl_model/keras/default/1/style_feature_dl_model.keras")

# -------------------------------
# Text Feature Extraction Function
# -------------------------------
def extract_text_features(text):
    """Turn one text into a single-row feature matrix for the style model.

    Returns a NumPy array of shape (1, 6) holding, in order: word count,
    average word length, average sentence length in words, TextBlob
    polarity, TextBlob subjectivity and the textstat Flesch reading-ease
    score. Both averages fall back to 0 when their denominator is zero.

    Note: this redefines the notebook's earlier function of the same name,
    which returns a pandas Series instead of an array.
    """
    parsed = TextBlob(text)
    tokens = parsed.words
    n_words = len(tokens)
    n_sentences = len(parsed.sentences)

    mean_word_len = sum(map(len, tokens)) / n_words if n_words else 0
    mean_sentence_len = n_words / n_sentences if n_sentences else 0
    mood = parsed.sentiment

    row = [
        n_words,
        mean_word_len,
        mean_sentence_len,
        mood.polarity,
        mood.subjectivity,
        textstat.flesch_reading_ease(text),
    ]
    # Wrap in an outer list so the result is 2-D: one sample, six features.
    return np.array([row])

# -------------------------------
# Prediction Pipeline
# -------------------------------
def predict_fake_real(text, scaler=None):
    """Classify one news text as REAL or FAKE from its style features.

    The network's sigmoid output is the score for class 1. This notebook
    labels real news as 1 and fake news as 0, so a score above 0.5 means
    REAL (the previous mapping was inverted).

    Args:
        text: Raw news text to classify.
        scaler: Optional fitted transformer exposing ``transform`` (e.g. the
            StandardScaler fitted on the training features). The dense model
            trained in this notebook was fitted on standardised features, so
            pass the same scaler for meaningful scores. When omitted the
            features are used unscaled, exactly as before.

    Returns:
        ``(label, confidence)`` where ``label`` is ``"REAL"`` or ``"FAKE"``
        and ``confidence`` is the model's score for the returned label.
    """
    features = extract_text_features(text)
    if scaler is not None:
        features = scaler.transform(features)

    real_probability = float(model.predict(features)[0][0])
    if real_probability > 0.5:
        return "REAL", real_probability
    return "FAKE", 1 - real_probability

# -------------------------------
# Example Usage
# -------------------------------
# Interactive demo: prompts for a news text and prints the predicted label.
# NOTE: inside a notebook kernel __name__ is "__main__", so this guard does not
# skip the block there — running the cell blocks on input() until text is typed.
if __name__ == "__main__":
    user_input = input("Enter news text to classify as FAKE or REAL:\n")
    label, confidence = predict_fake_real(user_input)
    print(f"\nPredicted Label: {label}")
    print(f"Confidence Score: {confidence:.2f}")
