<a href="https://colab.research.google.com/github/2303a51060Nirnaya/PythonForDataScience/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Supervised model


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# Supervised learning models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

# Read the raw articles from the Colab working directory.
df = pd.read_csv("/content/fake_news_dataset.csv")

# One text field per article: the headline followed by the body.
title_str = df["title"].astype(str)
body_str = df["text"].astype(str)
df["content"] = title_str + " " + body_str

# Inputs and binary targets (real -> 1, fake -> 0). Any label outside this
# mapping becomes NaN.
X = df["content"]
y = df["label"].map({"real": 1, "fake": 0})

# Discard the rows whose label did not map.
nan_rows = y.isna()
X, y = X[~nan_rows], y[~nan_rows]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# TF-IDF vocabulary is learned on the training split only and then reused
# unchanged for the test split.
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Candidate classifiers, keyed by the name shown in the printed report.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
    "Naive Bayes": MultinomialNB(),
}

# Fit each classifier on the training features and report on the hold-out set.
for name, model in models.items():
    print(f"\n===== {name} =====")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    report = classification_report(y_test, y_pred, target_names=["Fake", "Real"])
    print(report)


===== Logistic Regression =====
              precision    recall  f1-score   support

        Fake       0.51      0.52      0.51      2011
        Real       0.50      0.48      0.49      1989

    accuracy                           0.50      4000
   macro avg       0.50      0.50      0.50      4000
weighted avg       0.50      0.50      0.50      4000


===== Decision Tree =====
              precision    recall  f1-score   support

        Fake       0.49      0.51      0.50      2011
        Real       0.48      0.46      0.47      1989

    accuracy                           0.49      4000
   macro avg       0.49      0.49      0.49      4000
weighted avg       0.49      0.49      0.49      4000


===== Random Forest =====
              precision    recall  f1-score   support

        Fake       0.50      0.56      0.53      2011
        Real       0.49      0.43      0.46      1989

    accuracy                           0.49      4000
   macro avg       0.49      0.49      0.

# Unsupervised model


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score, silhouette_score

# Unsupervised learning models
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture

# Load dataset
df = pd.read_csv("/content/fake_news_dataset.csv")

# Combine title + text for better feature representation
df["content"] = df["title"].astype(str) + " " + df["text"].astype(str)

# Features (X) and reference labels (y). The labels are never used for
# fitting; they only serve as ground truth for the Adjusted Rand Index.
X = df["content"]
y = df["label"].map({"real": 1, "fake": 0})  # Encode labels

# Drop rows whose label did not map to 0/1 (NaN after .map), so the reference
# labels passed to adjusted_rand_score contain no missing values.
labeled = y.notna()
X = X[labeled]
y = y[labeled].astype(int)

# Convert text into TF-IDF features
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Densify once and reuse; the estimators below that are fed dense input
# all share this single array instead of each rebuilding it.
X_dense = X_tfidf.toarray()

# Define models
models = {
    "KMeans": KMeans(n_clusters=2, random_state=42),
    "Agglomerative Clustering": AgglomerativeClustering(n_clusters=2),
    "DBSCAN": DBSCAN(eps=0.7, min_samples=5),
    "Gaussian Mixture": GaussianMixture(n_components=2, random_state=42),
}

# Store results
results = []

# Train and evaluate models
for name, model in models.items():
    print(f"\n===== {name} =====")

    if name in ["Agglomerative Clustering", "DBSCAN"]:
        labels = model.fit_predict(X_dense)
    elif name == "Gaussian Mixture":
        model.fit(X_dense)
        labels = model.predict(X_dense)
    else:  # KMeans supports sparse input
        labels = model.fit_predict(X_tfidf)

    # Evaluation metrics
    ari = adjusted_rand_score(y, labels)
    # The silhouette is only computed when more than one distinct cluster
    # label was produced; otherwise -1 is reported as a sentinel.
    sil = silhouette_score(X_tfidf, labels) if len(set(labels)) > 1 else -1

    print(f"Adjusted Rand Index (vs true labels): {ari:.4f}")
    print(f"Silhouette Score: {sil:.4f}")

    results.append({"Model": name, "ARI": ari, "Silhouette": sil})

# Create summary table
results_df = pd.DataFrame(results)
print("\n===== Summary of Unsupervised Learning Results =====")
print(results_df)

# Deep learning model


In [None]:
"""Deep Learning models for Fake News Classification
Colab-ready script. Implements multiple deep learning approaches:
 1) Dense Neural Network on TF-IDF features
 2) 1D CNN on tokenized sequences
 3) Bidirectional LSTM on tokenized sequences
 4) (Optional) Transformer using Hugging Face (commented; requires internet/GPU)

Outputs classification reports for each model.
"""

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Dense, Dropout, Conv1D, GlobalMaxPooling1D,
                                     Embedding, LSTM, Bidirectional, Input,
                                     GlobalAveragePooling1D, SpatialDropout1D)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# --------- Config ---------
DATA_PATH = '/content/fake_news_dataset.csv'
RANDOM_STATE = 42
TEST_SIZE = 0.2
MAX_VOCAB = 30000
MAX_LEN = 200  # max tokens for sequences
EMBEDDING_DIM = 100
BATCH_SIZE = 64
EPOCHS = 6
TFIDF_MAX_FEATURES = 5000

# GPU check
# The version attribute is the dunder `__version__`; `tf._version_` does not
# exist and raises AttributeError.
print('TensorFlow version:', tf.__version__)
print('GPU available:', tf.config.list_physical_devices('GPU'))

# --------- Load data ---------
df = pd.read_csv(DATA_PATH)
# Remove rows with a missing label before the string cast below; otherwise
# NaN would turn into the literal string 'nan' and be encoded as a third class.
df = df.dropna(subset=['label'])
df['content'] = df['title'].astype(str) + ' ' + df['text'].astype(str)
texts = df['content'].astype(str).values
labels = df['label'].astype(str).values

# Encode labels as integers. LabelEncoder numbers the sorted class names, so
# this yields fake=0, real=1 assuming those are the only label values present.
le = LabelEncoder()
y = le.fit_transform(labels)

# Train-test split
X_train_text, X_test_text, y_train, y_test = train_test_split(
    texts, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# -------------------------
# Model 1: Dense on TF-IDF
# -------------------------
print('\nPreparing TF-IDF features...')
# Vocabulary/IDF weights are fitted on the training texts only.
vectorizer = TfidfVectorizer(stop_words='english', max_features=TFIDF_MAX_FEATURES)
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

# Convert to dense if necessary (Keras Dense expects dense)
X_train_tfidf_dense = X_train_tfidf.toarray()
X_test_tfidf_dense = X_test_tfidf.toarray()

from tensorflow.keras import regularizers

def build_dense_tfidf(input_dim):
    """Build and compile a feed-forward binary classifier.

    The network stacks Dense(512, L2-regularised) -> Dropout(0.5) ->
    Dense(256) -> Dropout(0.4) -> Dense(1, sigmoid) and is compiled with
    binary cross-entropy, the Adam optimizer and an accuracy metric.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    net = Sequential()
    net.add(
        Dense(
            512,
            activation='relu',
            input_shape=(input_dim,),
            kernel_regularizer=regularizers.l2(1e-4),
        )
    )
    net.add(Dropout(0.5))
    net.add(Dense(256, activation='relu'))
    net.add(Dropout(0.4))
    net.add(Dense(1, activation='sigmoid'))
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

print('\nBuilding and training Dense (TF-IDF) model...')
dense_ckpt = 'dense_tfidf_best.h5'
dense_model = build_dense_tfidf(X_train_tfidf_dense.shape[1])

# Stop once validation loss has not improved for 2 epochs (restoring the best
# weights), and keep the best-scoring model on disk.
dense_early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
dense_checkpoint = ModelCheckpoint(dense_ckpt, save_best_only=True, monitor='val_loss')
cb_dense = [dense_early_stop, dense_checkpoint]

dense_model.fit(
    X_train_tfidf_dense,
    y_train,
    validation_split=0.1,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=cb_dense,
    verbose=2,
)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
dense_probs = dense_model.predict(X_test_tfidf_dense)
y_pred_dense = (dense_probs > 0.5).astype(int).flatten()
print('\nDense (TF-IDF) Classification Report:')
print(classification_report(y_test, y_pred_dense, target_names=le.classes_))

# --------------------------------
# Text tokenization for sequence models
# --------------------------------
print('\nTokenizing texts for sequence models...')
# The word index is built from the training texts only.
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train_text)

X_train_seq = tokenizer.texts_to_sequences(X_train_text)
X_test_seq = tokenizer.texts_to_sequences(X_test_text)

# Pad/truncate at the end so every sequence has exactly MAX_LEN tokens.
pad_options = {'maxlen': MAX_LEN, 'padding': 'post', 'truncating': 'post'}
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

# Embedding input size: the number of indexed words plus one, capped at MAX_VOCAB.
indexed_words = len(tokenizer.word_index) + 1
vocab_size = min(MAX_VOCAB, indexed_words)
print('Vocab size used:', vocab_size)

# -------------------------
# Model 2: 1D CNN
# -------------------------

def build_cnn(vocab_size, embed_dim=EMBEDDING_DIM, input_length=MAX_LEN):
    """Build and compile a 1D-convolutional binary text classifier.

    Layers, in order: Embedding -> SpatialDropout1D(0.2) ->
    Conv1D(128 filters, kernel size 5) -> GlobalMaxPooling1D -> Dense(64) ->
    Dropout(0.4) -> Dense(1, sigmoid). Compiled with binary cross-entropy,
    Adam and an accuracy metric.

    Args:
        vocab_size: Size of the embedding's input vocabulary.
        embed_dim: Dimension of each embedding vector.
        input_length: Length passed to the Embedding layer as ``input_length``.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    layer_stack = [
        Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=input_length),
        SpatialDropout1D(0.2),
        Conv1D(128, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.4),
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential()
    for layer in layer_stack:
        net.add(layer)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

print('\nBuilding and training 1D CNN model...')
cnn_ckpt = 'cnn_best.h5'
cnn_model = build_cnn(vocab_size)

# Early stopping on validation loss (patience 2, best weights restored) plus
# an on-disk checkpoint of the best model.
cnn_early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
cnn_checkpoint = ModelCheckpoint(cnn_ckpt, save_best_only=True, monitor='val_loss')
cb_cnn = [cnn_early_stop, cnn_checkpoint]

cnn_model.fit(
    X_train_pad,
    y_train,
    validation_split=0.1,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=cb_cnn,
    verbose=2,
)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
cnn_probs = cnn_model.predict(X_test_pad)
y_pred_cnn = (cnn_probs > 0.5).astype(int).flatten()
print('\nCNN Classification Report:')
print(classification_report(y_test, y_pred_cnn, target_names=le.classes_))

# -------------------------
# Model 3: Bidirectional LSTM
# -------------------------

def build_bilstm(vocab_size, embed_dim=EMBEDDING_DIM, input_length=MAX_LEN):
    """Build and compile a bidirectional-LSTM binary text classifier.

    Functional-API graph: Input -> Embedding -> SpatialDropout1D(0.2) ->
    Bidirectional(LSTM(128), final state only) -> Dropout(0.4) ->
    Dense(1, sigmoid). Compiled with binary cross-entropy, Adam and an
    accuracy metric.

    Args:
        vocab_size: Size of the embedding's input vocabulary.
        embed_dim: Dimension of each embedding vector.
        input_length: Number of token ids per input sequence.

    Returns:
        The compiled Keras ``Model``.
    """
    token_ids = Input(shape=(input_length,))

    embedded = Embedding(
        input_dim=vocab_size, output_dim=embed_dim, input_length=input_length
    )(token_ids)
    embedded = SpatialDropout1D(0.2)(embedded)

    encoded = Bidirectional(LSTM(128, return_sequences=False))(embedded)
    encoded = Dropout(0.4)(encoded)

    probability = Dense(1, activation='sigmoid')(encoded)

    net = Model(inputs=token_ids, outputs=probability)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

print('\nBuilding and training Bidirectional LSTM model...')
lstm_ckpt = 'bilstm_best.h5'
lstm_model = build_bilstm(vocab_size)

# Early stopping on validation loss (patience 2, best weights restored) plus
# an on-disk checkpoint of the best model.
lstm_early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
lstm_checkpoint = ModelCheckpoint(lstm_ckpt, save_best_only=True, monitor='val_loss')
cb_lstm = [lstm_early_stop, lstm_checkpoint]

lstm_model.fit(
    X_train_pad,
    y_train,
    validation_split=0.1,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=cb_lstm,
    verbose=2,
)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
lstm_probs = lstm_model.predict(X_test_pad)
y_pred_lstm = (lstm_probs > 0.5).astype(int).flatten()
print('\nBiLSTM Classification Report:')
print(classification_report(y_test, y_pred_lstm, target_names=le.classes_))

# -------------------------
# Optional: Transformer (Hugging Face)
# -------------------------
# NOTE: This section is optional and commented out because it requires internet
# to download pretrained models and a GPU for practical training.
# Uncomment and run in Colab if you have internet access and want to use BERT/RoBERTa.

# # !pip install transformers datasets
# from transformers import TFAutoModel, AutoTokenizer
# PRETRAINED = 'distilbert-base-uncased'
# tokenizer_hf = AutoTokenizer.from_pretrained(PRETRAINED)
# def encode_texts(texts, tokenizer, max_len=MAX_LEN):
#     enc = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_len, return_tensors='np')
#     return enc['input_ids'], enc['attention_mask']
#
# input_ids_train, att_mask_train = encode_texts(X_train_text, tokenizer_hf, max_len=128)
# input_ids_test, att_mask_test = encode_texts(X_test_text, tokenizer_hf, max_len=128)
#
# bert_model = TFAutoModel.from_pretrained(PRETRAINED)
# input_ids = Input(shape=(128,), dtype='int32')
# attention_mask = Input(shape=(128,), dtype='int32')
# hidden = bert_model(input_ids, attention_mask=attention_mask)[0]
# pooled = GlobalAveragePooling1D()(hidden)
# out = Dense(1, activation='sigmoid')(pooled)
# model_bert = Model(inputs=[input_ids, attention_mask], outputs=out)
# model_bert.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(2e-5), metrics=['accuracy'])
#
# model_bert.fit([input_ids_train, att_mask_train], y_train, validation_split=0.1, epochs=3, batch_size=16)
# y_pred_bert = (model_bert.predict([input_ids_test, att_mask_test]) > 0.5).astype(int).flatten()
# print('\nTransformer (pretrained) Classification Report:')
# print(classification_report(y_test, y_pred_bert, target_names=le.classes_))

# -------------------------
# Summary
# -------------------------
print('\n==== Summary of Deep Learning Models ====')
# One row per model with its hold-out accuracy.
summary = [
    {'Model': 'Dense (TF-IDF)', 'Accuracy': accuracy_score(y_test, y_pred_dense)},
    {'Model': '1D CNN', 'Accuracy': accuracy_score(y_test, y_pred_cnn)},
    {'Model': 'BiLSTM', 'Accuracy': accuracy_score(y_test, y_pred_lstm)},
]

# Best model first.
summary_df = pd.DataFrame(summary).sort_values(by='Accuracy', ascending=False)
print(summary_df)

# Save tokenizer and vectorizer for later use
import pickle

artifacts = (
    ('tokenizer.pkl', tokenizer),
    ('tfidf_vectorizer.pkl', vectorizer),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print('\nSaved tokenizer.pkl and tfidf_vectorizer.pkl to current directory.')

# Reinforcement learning

#

In [None]:
# Reinforcement Learning (Contextual Bandits) for Fake News Detection
# -----------------------------------------------------------------
# Problem framing: We treat each news article as a "context" (features from text),
# and the agent chooses one of two actions: predict REAL (1) or FAKE (0).
# Reward = 1 if the action matches the ground-truth label, else 0.
# We compare several RL-style bandit policies:
#  - LinUCB (contextual UCB)
#  - Linear Thompson Sampling (contextual TS)
#  - UCB1 (non-contextual baseline)
# We also report cumulative reward and final test accuracy of the learned policies.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score, classification_report

# ------------------------
# Load & featurize dataset
# ------------------------

df = pd.read_csv("/content/fake_news_dataset.csv")

# Encode labels as actions: 1=REAL, 0=FAKE. Any other value maps to NaN.
encoded = df["label"].map({"real": 1, "fake": 0})

# Keep only rows with a usable label. A NaN label can never equal an action,
# so it would silently count as a zero-reward step in the bandit loop.
labeled = encoded.notna()

# Combine title + text
texts = (
    df.loc[labeled, "title"].astype(str) + " " + df.loc[labeled, "text"].astype(str)
).tolist()
labels = encoded[labeled].astype(int).values  # actions: 1=REAL, 0=FAKE

# Train/test split for offline evaluation of learned policies
X_train_txt, X_test_txt, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# TF-IDF -> SVD (dense, low-d)
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
svd = TruncatedSVD(n_components=100, random_state=42)

# Both transformers are fitted on the training split only.
X_train_tfidf = vectorizer.fit_transform(X_train_txt)
X_test_tfidf = vectorizer.transform(X_test_txt)

X_train = svd.fit_transform(X_train_tfidf)  # shape: (n_train, d)
X_test = svd.transform(X_test_tfidf)        # shape: (n_test, d)

d = X_train.shape[1]
ACTIONS = [0, 1]  # 0=FAKE, 1=REAL

# --------------------------------------------------------
# Contextual Bandits: LinUCB and Linear Thompson Sampling
# --------------------------------------------------------

class LinUCB:
    """LinUCB contextual bandit with an independent linear model per arm.

    For each arm ``a`` the agent keeps a ``d x d`` matrix ``A[a]`` (identity
    plus the sum of outer products of the contexts seen for that arm) and a
    length-``d`` vector ``b[a]`` (reward-weighted sum of those contexts).
    """

    def __init__(self, d: int, n_actions: int = 2, alpha: float = 0.25) -> None:
        """Initialise per-arm statistics.

        Args:
            d: Dimension of the context vectors.
            n_actions: Number of arms.
            alpha: Multiplier on the confidence term of the UCB score.
        """
        self.alpha = alpha
        self.n_actions = n_actions
        # For each arm a: A[a] (dxd), b[a] (dx1)
        self.A = [np.eye(d) for _ in range(n_actions)]
        self.b = [np.zeros(d) for _ in range(n_actions)]

    def theta(self, a: int) -> np.ndarray:
        """Return the current coefficient estimate ``A[a]^-1 b[a]`` for arm ``a``."""
        A_inv = np.linalg.inv(self.A[a])
        return A_inv @ self.b[a]

    def select(self, x: np.ndarray) -> int:
        """Return the arm with the highest upper-confidence score for ``x``.

        The score of arm ``a`` is ``x^T theta_a + alpha * sqrt(x^T A_a^{-1} x)``.
        Ties go to the lowest-numbered arm because only a strictly larger
        score replaces the current best.
        """
        x = x.reshape(-1)
        best_a, best_score = None, -np.inf
        for a in range(self.n_actions):
            # The inverse is needed for both the mean and the confidence term.
            A_inv = np.linalg.inv(self.A[a])
            theta_a = A_inv @ self.b[a]
            mean = x @ theta_a
            conf = self.alpha * np.sqrt(x @ A_inv @ x)
            score = mean + conf
            if score > best_score:
                best_score, best_a = score, a
        return best_a

    def update(self, x: np.ndarray, a: int, r: float) -> None:
        """Fold the observed reward ``r`` for playing arm ``a`` on context ``x`` into that arm's statistics."""
        x = x.reshape(-1)
        self.A[a] += np.outer(x, x)
        self.b[a] += r * x

class LinearThompsonSampling:
    """Linear Thompson Sampling contextual bandit with one model per arm.

    Each arm keeps the same ``A[a]`` / ``b[a]`` statistics as a ridge
    regression. At decision time a coefficient vector is drawn from a normal
    distribution with mean ``A[a]^-1 b[a]`` and covariance ``v^2 A[a]^-1``.
    """

    def __init__(self, d: int, n_actions: int = 2, v: float = 1.0, seed: int = 42) -> None:
        """Initialise per-arm statistics and the private random generator.

        Args:
            d: Dimension of the context vectors.
            n_actions: Number of arms.
            v: Scale applied (squared) to the sampling covariance.
            seed: Seed for the NumPy generator used when sampling.
        """
        self.rng = np.random.default_rng(seed)
        self.v = v
        self.n_actions = n_actions
        self.A = [np.eye(d) for _ in range(n_actions)]
        self.b = [np.zeros(d) for _ in range(n_actions)]

    def sample_theta(self, a: int) -> np.ndarray:
        """Draw one coefficient vector for arm ``a`` from its current distribution."""
        A_inv = np.linalg.inv(self.A[a])
        mu = A_inv @ self.b[a]
        cov = (self.v ** 2) * A_inv
        return self.rng.multivariate_normal(mu, cov)

    def select(self, x: np.ndarray) -> int:
        """Return the arm whose sampled coefficients give the largest ``x^T theta``.

        One sample is drawn per arm on every call. Ties go to the
        lowest-numbered arm because only a strictly larger value replaces the
        current best.
        """
        x = x.reshape(-1)
        best_a, best_val = None, -np.inf
        for a in range(self.n_actions):
            theta_tilde = self.sample_theta(a)
            val = x @ theta_tilde
            if val > best_val:
                best_val, best_a = val, a
        return best_a

    def update(self, x: np.ndarray, a: int, r: float) -> None:
        """Fold the observed reward ``r`` for playing arm ``a`` on context ``x`` into that arm's statistics."""
        x = x.reshape(-1)
        self.A[a] += np.outer(x, x)
        self.b[a] += r * x

class UCB1:
    """Non-contextual UCB1 baseline.

    Ignores the context ``x`` entirely and tracks only the play count and
    total reward of each arm.
    """

    def __init__(self, n_actions: int = 2) -> None:
        """Initialise per-arm counters.

        Args:
            n_actions: Number of arms.
        """
        self.n_actions = n_actions
        self.counts = np.zeros(n_actions, dtype=int)
        self.rewards = np.zeros(n_actions, dtype=float)
        self.t = 0  # number of select() calls so far

    def select(self, x) -> int:  # x unused
        """Return the next arm to play.

        Every arm is played once (in index order) before the UCB1 rule
        ``average reward + sqrt(2 ln t / count)`` takes over.
        """
        self.t += 1
        # Play each arm once first
        for a in range(self.n_actions):
            if self.counts[a] == 0:
                return a
        # UCB1
        avg = self.rewards / self.counts
        bonus = np.sqrt(2 * np.log(self.t) / self.counts)
        return int(np.argmax(avg + bonus))

    def update(self, x, a: int, r: float) -> None:
        """Record that arm ``a`` was played and earned reward ``r`` (``x`` is ignored)."""
        self.counts[a] += 1
        self.rewards[a] += r

# ------------------------------
# Online training (bandit loop)
# ------------------------------

# Replay the training set in a fixed, seeded random order to simulate a stream.
perm = np.random.RandomState(42).permutation(len(X_train))
X_online, y_online = X_train[perm], y_train[perm]

agents = {
    "LinUCB(alpha=0.25)": LinUCB(d=d, alpha=0.25),
    "LinearTS(v=1.0)": LinearThompsonSampling(d=d, v=1.0),
    "UCB1 (no context)": UCB1(),
}

# Per-agent log of every step's reward, chosen action and correctness flag.
history = {}
for name in agents:
    history[name] = {"rewards": [], "actions": [], "correct": []}

# Every agent sees the same context at each step, picks an action, and learns
# only from the reward of the action it picked (1.0 when it matches the label).
for x, y_true in zip(X_online, y_online):
    for name, agent in agents.items():
        a = agent.select(x)
        r = 1.0 if a == y_true else 0.0
        agent.update(x, a, r)

        log = history[name]
        log["rewards"].append(r)
        log["actions"].append(a)
        log["correct"].append(int(r))

# ------------------------------
# Evaluate learned policies
# ------------------------------

def policy_from_lin_agent(agent):
    """Freeze a linear bandit agent into a deterministic greedy policy.

    The coefficient vector of every arm is computed once as
    ``inv(agent.A[a]) @ agent.b[a]``. The returned callable scores each row
    of a context matrix against those vectors and picks the arm with the
    highest score.

    Args:
        agent: Object exposing ``A``, ``b`` (per-arm sequences) and ``n_actions``.

    Returns:
        A function mapping an ``(n_samples, d)`` array to an array of
        ``n_samples`` arm indices.
    """
    per_arm_weights = []
    for arm in range(agent.n_actions):
        per_arm_weights.append(np.linalg.inv(agent.A[arm]) @ agent.b[arm])
    weights = np.stack(per_arm_weights)  # shape: (n_actions, d)

    def policy(X):
        # Column j holds x^T theta_j for every sample; take the best column.
        arm_scores = X @ weights.T
        return np.argmax(arm_scores, axis=1)

    return policy

# Turn every trained agent into a batch prediction function. The linear agents
# become greedy policies; UCB1 has no context, so it always predicts the arm
# with the higher empirical mean reward.
policies = {}
for name, agent in agents.items():
    if not isinstance(agent, (LinUCB, LinearThompsonSampling)):
        # Guard the division for arms that were never played.
        mean_reward = agent.rewards / np.maximum(agent.counts, 1)
        maj_arm = int(np.argmax(mean_reward))
        # Bind the arm as a default so the lambda keeps this iteration's value.
        policies[name] = lambda X, arm=maj_arm: np.full(X.shape[0], arm, dtype=int)
        continue
    policies[name] = policy_from_lin_agent(agent)

# Score each policy on the held-out set and pair it with its online statistics.
results = []
for name, policy in policies.items():
    y_pred = policy(X_test)
    acc = accuracy_score(y_test, y_pred)
    cumulative_reward = int(np.sum(history[name]["rewards"]))
    online_accuracy = float(np.mean(history[name]["correct"]))

    results.append({
        "Agent": name,
        "Cumulative Reward (train)": cumulative_reward,
        "Train Accuracy (online)": online_accuracy,
        "Test Accuracy": acc,
    })

    print(f"\n===== {name} =====")
    print(f"Cumulative Reward (train): {cumulative_reward}")
    print(f"Train Accuracy (online): {online_accuracy:.4f}")
    print(f"Test Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred, target_names=["Fake", "Real"]))

results_df = pd.DataFrame(results)
print("\n==== Summary ====")
print(results_df)


