In [1]:
#Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
!pip install transformers "datasets[torch]" accelerate



In [3]:
# Load Training Data
train_df = pd.read_csv("../dataset/training_data.csv", sep="\t", header=None, names=["label", "headline"])
X = train_df["headline"]
y = train_df["label"]
print(train_df.head())
print(train_df.columns)

   label                                           headline
0      0  donald trump sends out embarrassing new year‚s...
1      0  drunk bragging trump staffer started russian c...
2      0  sheriff david clarke becomes an internet joke ...
3      0  trump is so obsessed he even has obama‚s name ...
4      0  pope francis just called out donald trump duri...
Index(['label', 'headline'], dtype='object')


In [4]:
# Train/Validation Split (to estimate accuracy)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Text Vectorization (TF-IDF)
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

In [6]:
# Train Classifier (Logistic Regression)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

In [7]:
# Evaluate on validation set
y_val_pred = clf.predict(X_val_tfidf)
val_acc = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_acc:.2f}")
print("Classification Report:\n", classification_report(y_val, y_val_pred))

Validation Accuracy: 0.93
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93      3529
           1       0.92      0.94      0.93      3302

    accuracy                           0.93      6831
   macro avg       0.93      0.93      0.93      6831
weighted avg       0.93      0.93      0.93      6831



In [8]:
# Predict on Testing Data
test_df = pd.read_csv("../dataset/testing_data.csv", sep="\t", header=None, names=["label", "headline"])
X_test = test_df["headline"]
X_test_tfidf = vectorizer.transform(X_test)
y_test_pred = clf.predict(X_test_tfidf)

In [9]:
# Replace '2' in the label column with predictions
test_df["label"] = y_test_pred

In [10]:
# Save Predictions
test_df.to_csv("testing_predictions.csv", index=False)
print("Predictions saved to testing_predictions.csv")

Predictions saved to testing_predictions.csv


In [11]:
# 3. Initialize the pre-trained model pipeline
from transformers import pipeline
classifier = pipeline("text-classification", model="jy46604790/Fake-News-Bert-Detect", truncation=True)

# 4. Run predictions ONLY on the validation headlines
print("Running predictions on the validation set...")
preds = []
for text in X_val:
    result = classifier(text)[0]
    label = 1 if result['label'] == 'LABEL_1' else 0
    preds.append(label)

# 5. Calculate the validation accuracy for the pre-trained model
acc = accuracy_score(y_val, preds)
print(f"\nValidation Accuracy for Pre-trained BERT Model: {acc:.4f}")

Device set to use cpu


Running predictions on the validation set...

Validation Accuracy for Pre-trained BERT Model: 0.6532


In [12]:
# Exploring more Models

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
}

# Ensemble (VotingClassifier: majority vote)
ensemble = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(max_iter=1000)),
        ('nb', MultinomialNB()),
        ('rf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
    ],
    voting='hard'
)

models["Ensemble"] = ensemble

In [13]:
# Train & Evaluate Each Model

for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_val_pred = model.predict(X_val_tfidf)
    acc = accuracy_score(y_val, y_val_pred)
    print(f"{name} Validation Accuracy: {acc:.2f}")
    print("Classification Report:\n", classification_report(y_val, y_val_pred))

Logistic Regression Validation Accuracy: 0.93
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93      3529
           1       0.92      0.94      0.93      3302

    accuracy                           0.93      6831
   macro avg       0.93      0.93      0.93      6831
weighted avg       0.93      0.93      0.93      6831

Naive Bayes Validation Accuracy: 0.92
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.93      0.93      3529
           1       0.93      0.92      0.92      3302

    accuracy                           0.92      6831
   macro avg       0.92      0.92      0.92      6831
weighted avg       0.92      0.92      0.92      6831

Random Forest Validation Accuracy: 0.92
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.92      0.92      3529
           1       0.91      0.92      0.

In [14]:
best_model = models["Ensemble"]  # changed after comparing above validation accuracy
best_model.fit(vectorizer.transform(X), y)  # retrain on full training set

In [15]:
X_test = test_df["headline"]
X_test_tfidf = vectorizer.transform(X_test)
y_test_pred = best_model.predict(X_test_tfidf)

# Replace labels with predictions
test_df["label"] = y_test_pred

# Save predictions (keep tab format, no header)
test_df.to_csv("testing_predictions.csv", index=False)
print("Predictions saved to testing_predictions_v2.csv")

Predictions saved to testing_predictions_v2.csv


In [16]:
from transformers import pipeline
MODEL = "jy46604790/Fake-News-Bert-Detect"
clf = pipeline("text-classification", model=MODEL, tokenizer=MODEL)

Device set to use cpu


In [17]:
# 1. Exploring pretrained models with Transformer pipeline
df = pd.read_csv("../dataset/testing_predictions.csv", sep=",", header=None, names=["label", "headline"])

df.head(20)

Unnamed: 0,label,headline
0,label,headline
1,0,copycat muslim terrorist arrested with assault...
2,0,wow! chicago protester caught on camera admits...
3,0,germany's fdp look to fill schaeuble's big shoes
4,0,mi school sends welcome back packet warning ki...
5,1,u.n. seeks 'massive' aid boost amid rohingya '...
6,0,did oprah just leave ‚nasty‚ hillary wishing s...
7,1,france's macron says his job not 'cool' cites ...
8,0,flashback: chilling ‚60 minutes‚ interview wit...
9,1,spanish foreign ministry says to expel north k...


In [18]:
#df = df.drop(0)
df.head(20)

Unnamed: 0,label,headline
0,label,headline
1,0,copycat muslim terrorist arrested with assault...
2,0,wow! chicago protester caught on camera admits...
3,0,germany's fdp look to fill schaeuble's big shoes
4,0,mi school sends welcome back packet warning ki...
5,1,u.n. seeks 'massive' aid boost amid rohingya '...
6,0,did oprah just leave ‚nasty‚ hillary wishing s...
7,1,france's macron says his job not 'cool' cites ...
8,0,flashback: chilling ‚60 minutes‚ interview wit...
9,1,spanish foreign ministry says to expel north k...


In [None]:
# 2. Use a transformer pipeline (zero-shot or fine-tuned for sequence classification)
classifier = pipeline("text-classification", model="jy46604790/Fake-News-Bert-Detect", truncation=True)

# 3. Run predictions
preds = []
for text in df["headline"]:
    result = classifier(text)[0]
    label = 1 if result['label'] == 1 else 0
    preds.append(label)

# 4. Accuracy
acc = accuracy_score(test_df["label"], preds)
print(f"Accuracy: {acc:.2f}")

In [None]:
#df_train = pd.read_csv("training_data.csv", sep="\t", header=None, names=["label", "headline"])


In [None]:
! pip install --upgrade --force-reinstall transformers accelerate "datasets[torch]"

In [None]:
import pandas as pd
from datasets import Dataset

# 1. Load your data with pandas
train_df = pd.read_csv("training_data.csv", sep="\t", header=None, names=["label", "headline"])

# 2. Split your data
# We create full dataframes for easy conversion to Datasets
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['label'])

# 3. Convert pandas DataFrames to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# 4. Load the correct tokenizer
model_name = "jy46604790/Fake-News-Bert-Detect"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 5. Define the tokenization function
# Make sure the key matches your column name ('headline')
def tokenize_function(examples):
    return tokenizer(examples['headline'], padding="max_length", truncation=True, max_length=128)

# 6. Apply the tokenizer using the correct .map() method
# Now this will work because train_dataset is a Dataset object
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

# You can now proceed to load your model and use the Trainer API with
# tokenized_train and tokenized_val.
print("Tokenization successful!")
print(tokenized_train)

In [None]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("jy46604790/Fake-News-Bert-Detect", num_labels=2)
from sklearn.metrics import precision_recall_fscore_support
def compute_metrics(pred):
    predictions, labels = pred
    preds = predictions.argmax(axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [None]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# 4. EVALUATE THE FINE-TUNED MODEL
print("\n Evaluating the new fine-tuned model")
eval_results = trainer.evaluate()
accuracy_after = eval_results['eval_accuracy']

# 5. FINAL COMPARISON

print("\n--- Performance Comparison ---")
print(f"Accuracy BEFORE Fine-Tuning: {accuracy_before:.2f}")
print(f"Accuracy AFTER Fine-Tuning:  {accuracy_after:.2f}")