In [4]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder

# Load dataset
file_path = 'archive/bharatfakenewskosh .csv'
df = pd.read_csv(file_path)

# ✅ Keep only useful fields
df = df[['Statement', 'Eng_Trans_Statement', 'News Body', 'Eng_Trans_News_Body', 
         'Language', 'Region', 'Platform', 'Text', 'Video', 'Image', 'Label']]

# ✅ Fill missing values
df.fillna('', inplace=True)

# ✅ Merge original and translated fields
def merge_text(row):
    statement = row['Eng_Trans_Statement'] if row['Eng_Trans_Statement'] else row['Statement']
    news_body = row['Eng_Trans_News_Body'] if row['Eng_Trans_News_Body'] else row['News Body']
    return f"{statement} {news_body}"

df['combined_text'] = df.apply(merge_text, axis=1)

# ✅ Clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.strip()

df['cleaned_text'] = df['combined_text'].apply(clean_text)

# ✅ Encode binary fields
binary_map = {'yes': 1, 'no': 0}
df['Text'] = df['Text'].map(binary_map)
df['Video'] = df['Video'].map(binary_map)
df['Image'] = df['Image'].map(binary_map)

# ✅ One-hot encode categorical fields
df = pd.get_dummies(df, columns=['Language', 'Region', 'Platform'])

# ✅ Encode labels
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Label'])


In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize text
tokens = tokenizer(
    list(df['cleaned_text']),
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']
labels = torch.tensor(df['Label'].values)

dataset = TensorDataset(input_ids, attention_mask, labels)
train_size = int(0.6 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size, test_size]
)

train_loader = DataLoader(train_dataset, batch_size=6, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=6)
test_loader = DataLoader(test_dataset, batch_size=6)

# ✅ Load BERT model
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
bert_model = bert_model.to(device)

# ✅ Define optimizer and scheduler (1-cycle policy)
optimizer = torch.optim.AdamW(bert_model.parameters(), lr=2e-5)
num_training_steps = len(train_loader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# ✅ Train for one epoch
bert_model.train()
for epoch in range(1):
    for batch in train_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = bert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

# ✅ Save embeddings for LightGBM
bert_model.eval()
embeddings = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        outputs = bert_model.bert(input_ids, attention_mask=attention_mask)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())

X_text = np.concatenate(embeddings, axis=0)
np.save('fin_bert_embeddings.npy', X_text)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [5]:
X_metadata = df[['Text', 'Video', 'Image'] + 
               [col for col in df.columns if col.startswith(('Language_', 'Region_', 'Platform_'))]].values

# ✅ Load saved embeddings
X_text = np.load('bert_embeddings.npy')

# ✅ Combine embeddings with metadata
X = np.concatenate((X_text, X_metadata), axis=1)
y = df['Label'].values


In [6]:
X_text = X_text.reshape((X_text.shape[0], 1, X_text.shape[1]))


In [7]:
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Input
from tensorflow.keras.models import Model

input_layer = Input(shape=(X_text.shape[1], X_text.shape[2]))  # (1, hidden_dim)
x = Bidirectional(LSTM(100, return_sequences=False))(input_layer)
output = Dense(1, activation='sigmoid')(x)

lstm_model = Model(inputs=input_layer, outputs=output)
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [8]:

lstm_model.fit(X_text, y, epochs=10, batch_size=32, validation_split=0.1)


Epoch 1/10
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.6021 - loss: 0.6715 - val_accuracy: 0.6162 - val_loss: 0.6667
Epoch 2/10
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6124 - loss: 0.6663 - val_accuracy: 0.6159 - val_loss: 0.6676
Epoch 3/10
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6067 - loss: 0.6682 - val_accuracy: 0.6162 - val_loss: 0.6661
Epoch 4/10
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6085 - loss: 0.6664 - val_accuracy: 0.6166 - val_loss: 0.6690
Epoch 5/10
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6147 - loss: 0.6631 - val_accuracy: 0.6170 - val_loss: 0.6766
Epoch 6/10
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6117 - loss: 0.6597 - val_accuracy: 0.6174 - val_loss: 0.6708
Epoch 7/10
[1m738/738[0m 

<keras.src.callbacks.history.History at 0x1e196693310>

In [None]:
import optuna
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Split data BEFORE tuning
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),  # ✅ Fixed
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),         # ✅ Fixed
        'verbose': -1
    }
    
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)
    
    model = lgb.train(
        param,
        train_data,
        num_boost_round=500,  # 🔥 Increased for better early stopping effect
        valid_sets=[train_data, valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=10),  # ✅ Fixed
            lgb.log_evaluation(10)                   # ✅ Cleaner logging
        ]
    )
    
    y_pred = (model.predict(X_valid) > 0.5).astype(int)
    accuracy = accuracy_score(y_valid, y_pred)
    
    return accuracy

# Optimize using Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Get best params from tuning
best_params = study.best_params
print("Best Params:", best_params)


In [13]:
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 150,
    'learning_rate': 0.0065,
    'feature_fraction': 0.1,
    'is_unbalance': True
}
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid)
# Add metric explicitly to avoid confusion
best_params.update({
    'metric': 'binary_logloss',
    'verbose': -1,
    'is_unbalance': True
})
# Train final LightGBM model using callbacks
final_model = lgb.train(
    params=best_params,
    train_set=train_data,
    num_boost_round=500,
    valid_sets=[train_data, valid_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),  # Early stopping after 10 rounds
        lgb.log_evaluation(period=10)            # Log evaluation every 10 rounds
    ]
)

# ✅ Predict on test set
y_pred = (final_model.predict(X_valid) > 0.5).astype(int)

# ✅ Evaluate accuracy
accuracy = accuracy_score(y_valid, y_pred)
print("Final LightGBM Accuracy:", accuracy)

Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.662751	valid_1's binary_logloss: 0.672489
[20]	training's binary_logloss: 0.656408	valid_1's binary_logloss: 0.672198
[30]	training's binary_logloss: 0.650175	valid_1's binary_logloss: 0.671885
[40]	training's binary_logloss: 0.644094	valid_1's binary_logloss: 0.671781
[50]	training's binary_logloss: 0.638072	valid_1's binary_logloss: 0.671538
[60]	training's binary_logloss: 0.63205	valid_1's binary_logloss: 0.671294
[70]	training's binary_logloss: 0.626272	valid_1's binary_logloss: 0.671084
[80]	training's binary_logloss: 0.620478	valid_1's binary_logloss: 0.670938
[90]	training's binary_logloss: 0.614792	valid_1's binary_logloss: 0.670705
[100]	training's binary_logloss: 0.609309	valid_1's binary_logloss: 0.670476
[110]	training's binary_logloss: 0.603844	valid_1's binary_logloss: 0.670367
[120]	training's binary_logloss: 0.598455	valid_1's binary_logloss: 0.670311
[130]	training's binary_l

In [None]:
from sklearn.model_selection import train_test_split

# Split embeddings into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size=0.3, random_state=42)

# Reshape to 2D for LightGBM
X_train_2d = X_train.reshape(X_train.shape[0], -1)
X_test_2d = X_test.reshape(X_test.shape[0], -1)

# Create LightGBM datasets with labels
train_data = lgb.Dataset(X_train_2d, label=y_train)
test_data = lgb.Dataset(X_test_2d, label=y_test)
# Clean up best_params to avoid conflicts
best_params = {k: v for k, v in best_params.items() if k not in ['early_stopping_round', 'verbose']}

# Add metric explicitly to avoid confusion
best_params.update({
    'metric': 'binary_logloss',
    'verbose': -1
})

# Train final LightGBM model using callbacks
final_model = lgb.train(
    params=best_params,
    train_set=train_data,
    num_boost_round=500,
    valid_sets=[train_data, test_data],
    valid_names=['train', 'test'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),  # Early stopping after 10 rounds
        lgb.log_evaluation(period=10)            # Log evaluation every 10 rounds
    ]
)

# ✅ Predict on test set
y_pred = (final_model.predict(X_test_2d) > 0.5).astype(int)

# ✅ Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Final LightGBM Accuracy:", accuracy)


Training until validation scores don't improve for 10 rounds
[10]	train's binary_logloss: 0.572132	test's binary_logloss: 0.673556
Early stopping, best iteration is:
[5]	train's binary_logloss: 0.617586	test's binary_logloss: 0.671999
Final LightGBM Accuracy: 0.6015247776365946


In [2]:
import lightgbm as lgb
best_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.09553507428036068,
    'num_leaves': 143,
    'feature_fraction': 0.4505259438267628,
    'max_depth': -1,
    'device': 'gpu'
}

final_model = lgb.train(
    best_params,
    train_data,
    num_boost_round=500,  # More iterations
    valid_sets=[train_data, test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(10)]
)

y_pred = (final_model.predict(X_test_2d) > 0.5).astype(int)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Final LightGBM Accuracy:", accuracy)


NameError: name 'train_data' is not defined