In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from tqdm import tqdm
import lightgbm as lgb

# Load Dataset
true_data = pd.read_csv('politifact_real.csv')
fake_data = pd.read_csv('politifact_fake.csv')

# Preprocess the text data
def preprocess_text(text):
    text = text.lower()
    text = text.replace('\n', ' ').replace('\r', '').replace('\t', ' ')
    text = ''.join([char for char in text if char.isalnum() or char in [' ', "'"]])
    return text

true_data['title'] = true_data['title'].apply(preprocess_text)
fake_data['title'] = fake_data['title'].apply(preprocess_text)

# Generate labels True/Fake under new Target Column in 'true_data' and 'fake_data'
true_data['Target'] = ['True'] * len(true_data)
fake_data['Target'] = ['Fake'] * len(fake_data)

# Merge 'true_data' and 'fake_data', by random mixing into a single df called 'data'
fake_news_data = pd.concat([true_data, fake_data]).sample(frac=1).reset_index(drop=True)

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Tokenize and encode the dataset
def encode_data(text_list):
    encoded_inputs = tokenizer(text_list, padding=True, truncation=True, max_length=256, return_tensors='pt')
    return encoded_inputs

# Fine-tune the BERT model
def fine_tune_bert(model, data, labels, epochs=4, batch_size=8, device='cpu'):
    model.to(device)
    inputs = encode_data(data)
    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], torch.tensor(labels))
    dataloader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)

    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    total_steps = len(dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    model.train()
    for epoch in range(epochs):
        for batch in tqdm(dataloader, desc="Training Epoch {}".format(epoch+1)):
            batch_input_ids, batch_attention_mask, batch_labels = batch
            batch_input_ids, batch_attention_mask, batch_labels = batch_input_ids.to(device), batch_attention_mask.to(device), batch_labels.to(device)
            model.zero_grad()
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

# Prepare data for fine-tuning
labels = fake_news_data['Target'].apply(lambda x: 1 if x == 'True' else 0).values
fine_tune_bert(model, fake_news_data['title'].tolist(), labels, device='cuda' if torch.cuda.is_available() else 'cpu')

# Extract BERT embeddings after fine-tuning
def get_bert_embeddings(data, model, device='cpu'):
    model.to(device)
    model.eval()
    embeddings = []
    with torch.no_grad():
        for text in tqdm(data, desc="Extracting BERT embeddings"):
            encoded_inputs = encode_data([text])
            encoded_inputs = {key: val.to(device) for key, val in encoded_inputs.items()}
            outputs = model.bert(**encoded_inputs)
            embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    return np.concatenate(embeddings, axis=0)

# Get embeddings for the titles
titles = fake_news_data['title'].tolist()
embeddings = get_bert_embeddings(titles, model, device='cuda' if torch.cuda.is_available() else 'cpu')

# Prepare data for LightGBM
X = embeddings
y = labels

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'num_leaves': [31, 50, 70],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 500],
    'feature_fraction': [0.8, 0.9, 1.0]
}

# Initialize LightGBM model
lgbm = lgb.LGBMClassifier(objective='binary', metric='binary_logloss', boosting_type='gbdt')

# Setup GridSearchCV
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=KFold(n_splits=3), scoring='accuracy', verbose=1, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found by GridSearchCV:", best_params)

# Train the final model with best parameters
best_lgbm = lgb.LGBMClassifier(**best_params)
best_lgbm.fit(X_train, y_train)

# Predict and evaluate
y_pred = best_lgbm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)
f1 = f1_score(y_test, y_pred, pos_label=1)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')





A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/dwarikanath/.local/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/dwarikanath/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/dwarikanath/.local/lib/pyth

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/dwarikanath/.local/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/dwarikanath/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/dwarikanath/.local/lib/pyth

AttributeError: _ARRAY_API not found

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/dwarikanath/.local/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/dwarikanath/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app

AttributeError: _ARRAY_API not found

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 132/132 [08:10<00:00,  3.72s/it]
Training Epoch 2: 100%|██████████| 132/132 [07:43<00:00,  3.51s/it]
Training Epoch 3: 100%|██████████| 132/132 [07:39<00:00,  3.48s/it]
Training Epoch 4: 100%|██████████| 132/132 [07:39<00:00,  3.48s/it]
Extracting BERT embeddings: 100%|██████████| 1056/1056 [01:19<00:00, 13.20it/s]

Fitting 3 folds for each of 81 candidates, totalling 243 fits




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/dwarikanath/.local/lib/python3.10/site-packages/joblib/externals/loky/backend/popen_loky_posix.py", line 180, in <module>
    exitcode = process_obj._bootstrap()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/pro




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/dwarikanath/.local/lib/python3.10/site-packages/joblib/externals/loky/backend/popen_loky_posix.py", line 180, in <module>
    exitcode = process_obj._bootstrap()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/proc

[LightGBM] [Info] Number of positive: 324, number of negative: 239
[LightGBM] [Info] Number of positive: 330, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071519 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.092611 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144373
[LightGBM] [Info] Total Bins 144382
[LightGBM] [Info] Number of positive: 340, number of negative: 222
[LightGBM] [Info] Number of positive: 340, number of negative: 222
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024247 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.054083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [

In [12]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load datasets
real_df = pd.read_csv('politifact_real.csv')  # Replace with actual path
fake_df = pd.read_csv('politifact_fake.csv')  # Replace with actual path

# Add labels: 1 for real, 0 for fake
real_df['label'] = 1
fake_df['label'] = 0

# Combine datasets
data = pd.concat([real_df, fake_df], ignore_index=True)

# Preprocess and Vectorize
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(data['title'])  # Using 'title' instead of 'content'
y = data['label']

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Save the trained model and vectorizer as .pkl files
joblib.dump(model, 'trained_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')
print("Model and vectorizer saved as .pkl files.")


Model and vectorizer saved as .pkl files.


In [11]:
import pandas as pd

# Load a sample of each dataset to check the column names
real_df = pd.read_csv('politifact_real.csv')
fake_df = pd.read_csv('politifact_fake.csv')

print("Real news dataset columns:", real_df.columns)
print("Fake news dataset columns:", fake_df.columns)


Real news dataset columns: Index(['id', 'news_url', 'title', 'tweet_ids'], dtype='object')
Fake news dataset columns: Index(['id', 'news_url', 'title', 'tweet_ids'], dtype='object')
